feat(ocr payment page) - added backend routes on app

2025-08-30 01:11:59 +05:30
parent e5271d6979
commit c5edac2ab8
19 changed files with 314 additions and 109 deletions
--- a/apps/PaymentOCRService/.env.example
+++ b/apps/PaymentOCRService/.env.example
@@ -0,0 +1,3 @@
+GOOGLE_APPLICATION_CREDENTIALS=google-credentials.json
+HOST="0.0.0.0"
+PORT="5003"
--- a/apps/PaymentOCRService/.gitignore
+++ b/apps/PaymentOCRService/.gitignore
@@ -0,0 +1 @@
+google-credentials.json
--- a/apps/PaymentOCRService/pycache/complete_pipeline.cpython-313.pyc
+++ b/apps/PaymentOCRService/pycache/complete_pipeline.cpython-313.pyc
--- a/apps/PaymentOCRService/pycache/complete_pipeline_adapter.cpython-313.pyc
+++ b/apps/PaymentOCRService/pycache/complete_pipeline_adapter.cpython-313.pyc
--- a/apps/PaymentOCRService/app/init.py
+++ b/apps/PaymentOCRService/app/init.py
--- a/apps/PaymentOCRService/app/main.py
+++ b/apps/PaymentOCRService/app/main.py
@@ -1,81 +0,0 @@
-from fastapi import FastAPI, UploadFile, File, HTTPException
-from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse
-from typing import List, Optional
-import io
-import os
-
-from app.pipeline_adapter import (
-    process_images_to_rows,
-    rows_to_csv_bytes,
-)
-
-app = FastAPI(
-    title="Medical Billing OCR API",
-    description="FastAPI wrapper around the complete OCR pipeline (Google Vision + deskew + line clustering + extraction).",
-    version="1.0.0",
-)
-
-ALLOWED_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}
-
-@app.get("/health", response_class=PlainTextResponse)
-def health():
-    # Simple sanity check (also ensures GCP creds var visibility)
-    creds = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "")
-    return f"OK | GOOGLE_APPLICATION_CREDENTIALS set: {bool(creds)}"
-
-@app.post("/extract/json")
-async def extract_json(files: List[UploadFile] = File(...)):
-    if not files:
-        raise HTTPException(status_code=400, detail="No files provided.")
-
-    # Validate extensions early (not bulletproof, but helpful)
-    bad = [f.filename for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
-    if bad:
-        raise HTTPException(
-            status_code=415,
-            detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
-        )
-
-    # Read blobs in-memory
-    blobs = []
-    filenames = []
-    for f in files:
-        blobs.append(await f.read())
-        filenames.append(f.filename or "upload.bin")
-
-    try:
-        rows = process_images_to_rows(blobs, filenames)
-        # rows is a list[dict] where each dict contains the columns you already emit (Patient Name, etc.)
-        return JSONResponse(content={"rows": rows})
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Processing error: {e}")
-
-@app.post("/extract/csv")
-async def extract_csv(files: List[UploadFile] = File(...), filename: Optional[str] = None):
-    if not files:
-        raise HTTPException(status_code=400, detail="No files provided.")
-
-    bad = [f.filename for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
-    if bad:
-        raise HTTPException(
-            status_code=415,
-            detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
-        )
-
-    blobs = []
-    filenames = []
-    for f in files:
-        blobs.append(await f.read())
-        filenames.append(f.filename or "upload.bin")
-
-    try:
-        rows = process_images_to_rows(blobs, filenames)
-        csv_bytes = rows_to_csv_bytes(rows)
-        out_name = filename or "medical_billing_extract.csv"
-        return StreamingResponse(
-            io.BytesIO(csv_bytes),
-            media_type="text/csv",
-            headers={"Content-Disposition": f'attachment; filename="{out_name}"'}
-        )
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Processing error: {e}")
--- a/apps/PaymentOCRService/complete_pipeline.py
+++ b/apps/PaymentOCRService/complete_pipeline.py
@@ -2,6 +2,8 @@
 # -*- coding: utf-8 -*-

 """
+NOTE: this file was generated entirely by Replit.
+
 End-to-end local pipeline (single script)

 - One Google Vision pass per image (DOCUMENT_TEXT_DETECTION)
--- a/apps/PaymentOCRService/complete_pipeline_adapter.py
+++ b/apps/PaymentOCRService/complete_pipeline_adapter.py
@@ -4,10 +4,7 @@ from typing import List, Dict
 import pandas as pd

 # Import your existing functions directly from complete_pipeline.py
-from complete_pipeline import (
-    smart_deskew_with_lines,
-    extract_all_clients_from_lines,
-)
+from complete_pipeline import smart_deskew_with_lines, extract_all_clients_from_lines

 def _process_single_image_bytes(blob: bytes, display_name: str) -> List[Dict]:
    """
--- a/apps/PaymentOCRService/main.py
+++ b/apps/PaymentOCRService/main.py
@@ -0,0 +1,168 @@
+from fastapi import FastAPI, UploadFile, File, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse
+from typing import List, Optional
+import io
+import os
+import asyncio
+
+from dotenv import load_dotenv
+load_dotenv()  # loads .env (GOOGLE_APPLICATION_CREDENTIALS, HOST, PORT, etc.)
+
+# Your adapter that calls the pipeline
+from complete_pipeline_adapter import process_images_to_rows,rows_to_csv_bytes
+
+# -------------------------------------------------
+# App, concurrency limits, CORS and upload allow-list
+# -------------------------------------------------
+app = FastAPI(
+    title="Payment OCR Services API",
+    description="FastAPI wrapper around the OCR pipeline (Google Vision + deskew + line grouping + extraction).",
+    version="1.0.0",
+)
+
+# Cap on simultaneous OCR jobs; clamped to >= 1 because Semaphore(0) would block every request forever.
+MAX_CONCURRENCY = max(1, int(os.getenv("MAX_CONCURRENCY", "2")))
+semaphore = asyncio.Semaphore(MAX_CONCURRENCY)
+
+active_jobs = 0  # requests currently holding the semaphore
+waiting_jobs = 0  # requests queued for the semaphore
+lock = asyncio.Lock()  # guards the two counters above
+
+# CORS: CORS_ORIGINS is "*" (the default) or a comma-separated list of origins.
+cors_origins = os.getenv("CORS_ORIGINS", "*")
+allow_origins = ["*"] if cors_origins.strip() == "*" else [o.strip() for o in cors_origins.split(",") if o.strip()]
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=allow_origins,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+ALLOWED_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}  # lower-cased upload extensions accepted
+
+# -------------------------------------------------
+# Health + status
+# -------------------------------------------------
+@app.get("/health", response_class=PlainTextResponse)
+def health():
+    """Report liveness plus whether GOOGLE_APPLICATION_CREDENTIALS is set to a non-empty value."""
+    return f"OK | GOOGLE_APPLICATION_CREDENTIALS set: {bool(os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', ''))}"
+
+@app.get("/status")
+async def get_status():
+    """Return the running/queued job counters, the concurrency limit and a busy/idle flag."""
+    async with lock:
+        running, queued = active_jobs, waiting_jobs
+    # Idle only when nothing is running and nothing is queued.
+    state = "idle" if running <= 0 and queued <= 0 else "busy"
+    snapshot = {"active_jobs": running, "queued_jobs": queued, "max_concurrency": MAX_CONCURRENCY}
+    return {**snapshot, "status": state}
+
+# -------------------------------------------------
+# Helpers
+# -------------------------------------------------
+def _validate_files(files: List[UploadFile]):
+    """Raise HTTP 400 if no files were sent, or 415 if any filename lacks an allowed image extension."""
+    if not files:
+        raise HTTPException(status_code=400, detail="No files provided.")
+    # Substitute a placeholder for missing filenames so the join below cannot fail on None.
+    bad = [f.filename or "<unnamed>" for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
+    if bad:
+        allowed = ", ".join(sorted(ALLOWED_EXTS))
+        raise HTTPException(status_code=415, detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {allowed}")
+
+# -------------------------------------------------
+# Endpoints
+# -------------------------------------------------
+@app.post("/extract/json")
+async def extract_json(files: List[UploadFile] = File(...)):
+    """Run the OCR pipeline on the uploaded images and return the extracted rows under the 'rows' key."""
+    global waiting_jobs, active_jobs
+    _validate_files(files)
+    async with lock:
+        waiting_jobs += 1
+
+    async with semaphore:
+        async with lock:
+            waiting_jobs -= 1
+            active_jobs += 1
+
+        try:
+            blobs = [await f.read() for f in files]
+            names = [f.filename or "upload.bin" for f in files]
+            # The pipeline is synchronous; a worker thread keeps the event loop (and /status) responsive.
+            rows = await asyncio.to_thread(process_images_to_rows, blobs, names)
+            return JSONResponse(content={"rows": rows})
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Processing error: {e}") from e
+        finally:
+            async with lock:
+                active_jobs -= 1
+
+@app.post("/extract/csvtext", response_class=PlainTextResponse)
+async def extract_csvtext(files: List[UploadFile] = File(...)):
+    """Run the OCR pipeline on the uploaded images and return the rows inline as text/csv."""
+    global waiting_jobs, active_jobs
+    _validate_files(files)
+    async with lock:
+        waiting_jobs += 1
+
+    async with semaphore:
+        async with lock:
+            waiting_jobs -= 1
+            active_jobs += 1
+
+        try:
+            blobs = [await f.read() for f in files]
+            names = [f.filename or "upload.bin" for f in files]
+            # The pipeline is synchronous; a worker thread keeps the event loop (and /status) responsive.
+            rows = await asyncio.to_thread(process_images_to_rows, blobs, names)
+            csv_bytes = rows_to_csv_bytes(rows)
+            return PlainTextResponse(csv_bytes.decode("utf-8"), media_type="text/csv")
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Processing error: {e}") from e
+        finally:
+            async with lock:
+                active_jobs -= 1
+
+@app.post("/extract/csv")
+async def extract_csv(files: List[UploadFile] = File(...), filename: Optional[str] = None):
+    """Run the OCR pipeline and return the rows as a CSV download named by `filename` (optional)."""
+    global waiting_jobs, active_jobs
+    _validate_files(files)
+    async with lock:
+        waiting_jobs += 1
+
+    async with semaphore:
+        async with lock:
+            waiting_jobs -= 1
+            active_jobs += 1
+
+        try:
+            blobs = [await f.read() for f in files]
+            names = [f.filename or "upload.bin" for f in files]
+            # The pipeline is synchronous; a worker thread keeps the event loop (and /status) responsive.
+            rows = await asyncio.to_thread(process_images_to_rows, blobs, names)
+            csv_bytes = rows_to_csv_bytes(rows)
+            # `filename` is caller-supplied: neutralise quotes, backslashes, control and non-ASCII characters.
+            raw_name = filename or "medical_billing_extract.csv"
+            out_name = "".join(c if c.isascii() and c.isprintable() and c not in '"\\' else "_" for c in raw_name)
+            headers = {"Content-Disposition": f'attachment; filename="{out_name}"'}
+            return StreamingResponse(io.BytesIO(csv_bytes), media_type="text/csv", headers=headers)
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=f"Processing error: {e}") from e
+        finally:
+            async with lock:
+                active_jobs -= 1
+
+# -------------------------------------------------
+# Entrypoint: HOST/PORT fall back to the .env.example values; reload requires an import string
+# -------------------------------------------------
+if __name__ == "__main__":
+    import uvicorn
+    host = os.getenv("HOST", "0.0.0.0")
+    port = int(os.getenv("PORT", "5003"))
+    reload_flag = os.getenv("RELOAD", "false").lower() == "true"
+    uvicorn.run("main:app" if reload_flag else app, host=host, port=port, reload=reload_flag)
--- a/apps/PaymentOCRService/package.json
+++ b/apps/PaymentOCRService/package.json
@@ -1,5 +1,5 @@
 {
-  "name": "pdfservice",
+  "name": "paymentocrservice",
  "private": true,
  "scripts": {
    "postinstall": "pip install -r requirements.txt",
--- a/apps/PaymentOCRService/requirements.txt
+++ b/apps/PaymentOCRService/requirements.txt
@@ -1,10 +1,26 @@
-fastapi
-uvicorn[standard]
-google-cloud-vision
-opencv-python-headless
-pytesseract
-pillow
-pandas
-openpyxl
-numpy
-python-multipart
+annotated-types==0.7.0
+anyio==4.10.0
+click==8.2.1
+colorama==0.4.6
+et_xmlfile==2.0.0
+fastapi==0.116.1
+h11==0.16.0
+idna==3.10
+numpy==2.2.6
+google-cloud-vision>=3.10.2
+opencv-python-headless==4.12.0.88
+openpyxl==3.1.5
+pandas==2.3.2
+pydantic==2.11.7
+pydantic_core==2.33.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.1
+pytz==2025.2
+six==1.17.0
+sniffio==1.3.1
+starlette==0.47.3
+typing-inspection==0.4.1
+typing_extensions==4.15.0
+tzdata==2025.2
+uvicorn==0.35.0
+python-multipart==0.0.20