"""Extract a person's name from an ID-card image using PaddleOCR PP-StructureV3."""

from paddleocr import PPStructureV3
from typing import Optional
from functools import lru_cache
import base64
import io

import numpy as np
from PIL import Image


def _b64_to_bgr(b64: str) -> Optional[np.ndarray]:
    """Decode base64 (with or without a data-URL prefix) to a BGR numpy image.

    Returns None when the payload is not valid base64 or not a decodable image.
    """
    # Strip an optional "data:image/...;base64," prefix.
    if "," in b64 and ";base64" in b64[:64]:
        b64 = b64.split(",", 1)[1]
    try:
        data = base64.b64decode(b64, validate=True)
        img = Image.open(io.BytesIO(data)).convert("RGB")
        return np.array(img)[:, :, ::-1]  # PIL yields RGB; downstream expects BGR
    except Exception:
        # Deliberate best-effort decode: any failure maps to None.
        return None


@lru_cache(maxsize=4)
def _get_pipeline(layout_detection_model: Optional[str],
                  text_detection_model: Optional[str],
                  text_recognition_model: Optional[str]) -> PPStructureV3:
    """Build (and cache) a PP-StructureV3 pipeline — model loading is expensive,
    so reuse one pipeline per model combination instead of rebuilding per call."""
    return PPStructureV3(layout_detection_model_name=layout_detection_model,
                         text_detection_model_name=text_detection_model,
                         text_recognition_model_name=text_recognition_model,
                         lang="en")


# to learn more about model choosing refer to
# https://www.paddleocr.ai/latest/en/version3.x/pipeline_usage/PP-StructureV3.html
def extract_name_from_id(id_front: str,
                         layout_detection_model: Optional[str] = "PP-DocLayout-M",
                         text_detection_model: Optional[str] = "PP-OCRv5_server_det",
                         text_recognition_model: Optional[str] = "en_PP-OCRv4_mobile_rec",
                         ) -> str:
    """OCR the front of an ID card (base64 image) and return the holder's name.

    Args:
        id_front: Base64-encoded image, with or without a data-URL prefix.
        layout_detection_model: PaddleOCR layout-detection model name.
        text_detection_model: PaddleOCR text-detection model name.
        text_recognition_model: PaddleOCR text-recognition model name.

    Returns:
        The extracted name, or "" when no name-like text was recognized.

    Raises:
        ValueError: if ``id_front`` cannot be decoded into an image.
    """
    image = _b64_to_bgr(id_front)
    if image is None:
        # Fail fast with a clear message instead of letting pipeline.predict
        # crash on a None input (original behavior).
        raise ValueError("id_front is not a decodable base64 image")
    pipeline = _get_pipeline(layout_detection_model,
                             text_detection_model,
                             text_recognition_model)
    output = pipeline.predict(image)
    if not output:
        return ""
    # "or {}" guards a missing/None "overall_ocr_res" — the original called
    # .get() on None and raised AttributeError in that case.
    ocr_res = output[0].get("overall_ocr_res") or {}
    return extract_name_from_boxes(ocr_res.get("rec_texts", []))


def extract_name_from_boxes(rec_texts) -> str:
    """Heuristically pull a name out of a list of OCR'd text lines.

    Finds the first line containing "name" (substring match, so "Surname",
    "Full Name" etc. also hit). The name is taken either from up to 4 tokens
    after the "name" keyword on the same line, or — when the keyword stands
    alone — from up to 4 of the following lines.

    Returns "" when no name-like text is found.
    """
    rec_texts = [t.strip() for t in rec_texts if t and t.strip()]
    for i, t in enumerate(rec_texts):
        if "name" in t.lower():
            parts = t.split()
            # Index of the token that contains "name" (e.g. "Name:", "Surname").
            idx = next((j for j, w in enumerate(parts) if "name" in w.lower()), None)
            if idx is not None and idx + 1 < len(parts):
                # Name follows the keyword on the same line; cap at 4 tokens.
                return " ".join(parts[idx + 1: idx + 5])
            if i + 1 < len(rec_texts):
                # Keyword stands alone on its line; take the following lines.
                return " ".join(rec_texts[i + 1: i + 5])
    return ""