structured well

This commit is contained in:
2025-08-29 18:16:51 +05:30
parent c9ad84c3a8
commit d89bee4f07
16 changed files with 3080 additions and 0 deletions

View File

View File

@@ -0,0 +1,81 @@
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse
from typing import List, Optional
import io
import os
from app.pipeline_adapter import (
process_images_to_rows,
rows_to_csv_bytes,
)
app = FastAPI(
title="Medical Billing OCR API",
description="FastAPI wrapper around the complete OCR pipeline (Google Vision + deskew + line clustering + extraction).",
version="1.0.0",
)
ALLOWED_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}
@app.get("/health", response_class=PlainTextResponse)
def health():
# Simple sanity check (also ensures GCP creds var visibility)
creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "")
return f"OK | GOOGLE_APPLICATION_CREDENTIALS set: {bool(creds)}"
@app.post("/extract/json")
async def extract_json(files: List[UploadFile] = File(...)):
if not files:
raise HTTPException(status_code=400, detail="No files provided.")
# Validate extensions early (not bulletproof, but helpful)
bad = [f.filename for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
if bad:
raise HTTPException(
status_code=415,
detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
)
# Read blobs in-memory
blobs = []
filenames = []
for f in files:
blobs.append(await f.read())
filenames.append(f.filename or "upload.bin")
try:
rows = process_images_to_rows(blobs, filenames)
# rows is a list[dict] where each dict contains the columns you already emit (Patient Name, etc.)
return JSONResponse(content={"rows": rows})
except Exception as e:
raise HTTPException(status_code=500, detail=f"Processing error: {e}")
@app.post("/extract/csv")
async def extract_csv(files: List[UploadFile] = File(...), filename: Optional[str] = None):
if not files:
raise HTTPException(status_code=400, detail="No files provided.")
bad = [f.filename for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
if bad:
raise HTTPException(
status_code=415,
detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
)
blobs = []
filenames = []
for f in files:
blobs.append(await f.read())
filenames.append(f.filename or "upload.bin")
try:
rows = process_images_to_rows(blobs, filenames)
csv_bytes = rows_to_csv_bytes(rows)
out_name = filename or "medical_billing_extract.csv"
return StreamingResponse(
io.BytesIO(csv_bytes),
media_type="text/csv",
headers={"Content-Disposition": f'attachment; filename="{out_name}"'}
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Processing error: {e}")

View File

@@ -0,0 +1,77 @@
import os
import tempfile
from typing import List, Dict
import pandas as pd
# Import your existing functions directly from complete_pipeline.py
from complete_pipeline import (
smart_deskew_with_lines,
extract_all_clients_from_lines,
)
def _process_single_image_bytes(blob: bytes, display_name: str) -> List[Dict]:
    """Run the OCR pipeline on a single image supplied as raw bytes.

    The bytes are written to a temporary file (OpenCV and Google Vision read
    from paths), the existing pipeline functions are invoked, and the
    extracted rows are returned with a 'Source File' column added. If nothing
    could be parsed, a single placeholder row with
    'Extraction Success': False is returned instead of an empty list.
    """
    suffix = os.path.splitext(display_name)[1] or ".jpg"
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as handle:
            handle.write(blob)
            temp_path = handle.name
        # Google Vision OCR + deskew + post-line grouping.
        result = smart_deskew_with_lines(temp_path, None, clamp_deg=30.0, use_vision=True)
        lines = (result or {}).get("post_lines", [])
        extracted = extract_all_clients_from_lines(lines) if lines else []
        # Tag every row with its originating upload (mirrors the Streamlit app).
        for row in extracted:
            row["Source File"] = display_name
        if not extracted:
            # Placeholder row signalling that extraction failed for this file.
            extracted = [{
                'Patient Name': "", 'Patient ID': "", 'ICN': "", 'CDT Code': "",
                'Tooth': "", 'Date SVC': "",
                'Billed Amount': "", 'Allowed Amount': "", 'Paid Amount': "",
                'Extraction Success': False, 'Source File': display_name,
            }]
        return extracted
    finally:
        # Best-effort cleanup of the temp file; ignore races/permission issues.
        if temp_path:
            try:
                os.unlink(temp_path)
            except Exception:
                pass
def process_images_to_rows(blobs: List[bytes], filenames: List[str]) -> List[Dict]:
    """Process a batch of images and return the concatenated extracted rows.

    Public API used by the FastAPI routes.

    Args:
        blobs: list of raw image bytes.
        filenames: matching display names (used for the 'Source File' column).

    Returns:
        A flat list of row dicts across all images.

    Raises:
        ValueError: if ``blobs`` and ``filenames`` differ in length — the
            previous ``zip()`` silently dropped the extras, losing uploads.
    """
    if len(blobs) != len(filenames):
        raise ValueError(
            f"blobs and filenames must have the same length "
            f"(got {len(blobs)} and {len(filenames)})"
        )
    all_rows: List[Dict] = []
    for blob, name in zip(blobs, filenames):
        all_rows.extend(_process_single_image_bytes(blob, name))
    return all_rows
def rows_to_csv_bytes(rows: List[Dict]) -> bytes:
    """Convert pipeline rows to UTF-8 encoded CSV bytes.

    Column order mirrors the Excel export: known columns first in a stable
    order, then any unexpected extras in their original order.

    Args:
        rows: list of row dicts produced by the pipeline.

    Returns:
        CSV file contents as bytes. An empty ``rows`` list yields a
        header-only CSV (previously ``pd.DataFrame([])`` had no columns and
        produced a blank header line).
    """
    # Stable column order matching the existing Excel output.
    desired = [
        'Patient Name', 'Patient ID', 'ICN', 'CDT Code', 'Tooth', 'Date SVC',
        'Billed Amount', 'Allowed Amount', 'Paid Amount',
        'Extraction Success', 'Source File'
    ]
    if not rows:
        # No data: still emit the standard header so consumers get a schema.
        return pd.DataFrame(columns=desired).to_csv(index=False).encode("utf-8")
    df = pd.DataFrame(rows)
    cols = [c for c in desired if c in df.columns] + [c for c in df.columns if c not in desired]
    return df[cols].to_csv(index=False).encode("utf-8")