initial commit

2026-04-04 22:13:55 -04:00
commit 5d77e207c9
10181 changed files with 522212 additions and 0 deletions
--- a/apps/PatientDataExtractorService/main.py
+++ b/apps/PatientDataExtractorService/main.py
@@ -0,0 +1,95 @@
+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+import uvicorn
+import fitz  # PyMuPDF
+import re
+import os
+
+from dotenv import load_dotenv
+load_dotenv() 
+
+app = FastAPI()
+
+# Optional: allow CORS for development
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # change in production
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+DOB_RE = re.compile(r'(?<!\d)(\d{1,2})/(\d{1,2})/(\d{4})(?!\d)')
+ID_RE  = re.compile(r'^\d{8,14}$')  # 8–14 digits, whole line
+
+# lines that tell us we've moved past the name/DOB area
+STOP_WORDS = {
+    'eligibility', 'coverage', 'age band', 'date of', 'service',
+    'tooth', 'number', 'surface', 'procedure', 'code', 'description',
+    'provider', 'printed on', 'member id', 'name', 'date of birth'
+}
+
+@app.post("/extract")
+async def extract(pdf: UploadFile = File(...)):
+    if not pdf:
+        raise HTTPException(status_code=400, detail="Missing 'pdf' file")
+
+    content = await pdf.read()
+    try:
+        doc = fitz.open(stream=content, filetype="pdf")
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Unable to open PDF: {e}")
+    
+    # Extract text from all pages
+    text = "\n".join(page.get_text("text") for page in doc)
+    lines = [line.strip() for line in text.splitlines() if line.strip()]
+
+    member_id = ""
+    name = ""
+    dob = ""
+
+    # 1) Find the first plausible member ID (8–14 digits)
+    id_idx = -1
+    for i, line in enumerate(lines):
+        if ID_RE.match(line):
+            member_id = line
+            id_idx = i
+            break
+
+    if id_idx == -1:
+        return {"memberId": "", "name": "", "dob": ""}
+
+    # 2) Scan forward to collect name + DOB; handle both same-line and next-line cases
+    collected = []
+    j = id_idx + 1
+    while j < len(lines):
+        low = lines[j].lower()
+        if any(sw in low for sw in STOP_WORDS):
+            break
+        collected.append(lines[j])
+        # If we already found a DOB, we can stop early
+        if DOB_RE.search(lines[j]):
+            break
+        j += 1
+
+    # Flatten the collected chunk to search for a date (works if DOB is on same line or next)
+    blob = " ".join(collected).strip()
+
+    m = DOB_RE.search(blob)
+    if m:
+        dob = m.group(0)
+        # name is everything before the date within the same blob
+        name = blob[:m.start()].strip()
+    else:
+        # fallback: if we didn't find a date, assume first collected line(s) are name
+        name = blob
+
+    return {
+        "memberId": member_id,
+        "name": name,
+        "dob": dob
+    }
+
+if __name__ == "__main__":
+    host = os.getenv("HOST")
+    port = int(os.getenv("PORT"))
+    uvicorn.run(app, host=host, port=port)