structured well
This commit is contained in:
0
apps/PaymentOCRService/.env.example
Normal file
0
apps/PaymentOCRService/.env.example
Normal file
13
apps/PaymentOCRService/README.md
Normal file
13
apps/PaymentOCRService/README.md
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
# Medical Billing OCR API (FastAPI)
|
||||||
|
|
||||||
|
## 1) Prereqs
|
||||||
|
- Google Cloud Vision service-account JSON.
|
||||||
|
- `GOOGLE_APPLICATION_CREDENTIALS` env var pointing to that JSON.
|
||||||
|
- Tesseract installed (for fallback OCR), and on PATH.
|
||||||
|
|
||||||
|
## 2) Install & run (local)
|
||||||
|
```bash
|
||||||
|
python -m venv .venv && source .venv/bin/activate
|
||||||
|
pip install -r requirements.txt
|
||||||
|
export GOOGLE_APPLICATION_CREDENTIALS=/absolute/path/to/service-account.json
|
||||||
|
uvicorn app.main:app --reload --port 8080
|
||||||
0
apps/PaymentOCRService/app/__init__.py
Normal file
0
apps/PaymentOCRService/app/__init__.py
Normal file
81
apps/PaymentOCRService/app/main.py
Normal file
81
apps/PaymentOCRService/app/main.py
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
from fastapi import FastAPI, UploadFile, File, HTTPException
|
||||||
|
from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse
|
||||||
|
from typing import List, Optional
|
||||||
|
import io
|
||||||
|
import os
|
||||||
|
|
||||||
|
from app.pipeline_adapter import (
|
||||||
|
process_images_to_rows,
|
||||||
|
rows_to_csv_bytes,
|
||||||
|
)
|
||||||
|
|
||||||
|
# FastAPI application instance; title/description/version appear in the
# auto-generated OpenAPI docs (/docs, /redoc).
app = FastAPI(
    title="Medical Billing OCR API",
    description="FastAPI wrapper around the complete OCR pipeline (Google Vision + deskew + line clustering + extraction).",
    version="1.0.0",
)

# Image extensions accepted by the upload endpoints (compared lower-cased).
ALLOWED_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}
|
||||||
|
|
||||||
|
@app.get("/health", response_class=PlainTextResponse)
def health():
    """Liveness probe that also reports whether the GCP credentials variable is set."""
    has_creds = bool(os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", ""))
    return f"OK | GOOGLE_APPLICATION_CREDENTIALS set: {has_creds}"
|
||||||
|
|
||||||
|
@app.post("/extract/json")
async def extract_json(files: List[UploadFile] = File(...)):
    """
    Run the OCR pipeline over the uploaded images and return extracted rows as JSON.

    Returns {"rows": [...]} where each row is a dict of the pipeline's output
    columns (Patient Name, Patient ID, ICN, CDT Code, amounts, ...).
    Raises HTTP 400 for an empty upload, 415 for unsupported extensions,
    and 500 when the pipeline itself fails.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files provided.")

    # Validate extensions early (not bulletproof, but helpful)
    bad = [f.filename for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
    if bad:
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
        )

    # Read blobs in-memory
    blobs = []
    filenames = []
    for f in files:
        blobs.append(await f.read())
        filenames.append(f.filename or "upload.bin")

    try:
        rows = process_images_to_rows(blobs, filenames)
        # rows is a list[dict] where each dict contains the columns the pipeline emits.
        return JSONResponse(content={"rows": rows})
    except Exception as e:
        # Chain the original exception so server logs show the real failure cause.
        raise HTTPException(status_code=500, detail=f"Processing error: {e}") from e
|
||||||
|
|
||||||
|
@app.post("/extract/csv")
async def extract_csv(files: List[UploadFile] = File(...), filename: Optional[str] = None):
    """
    Run the OCR pipeline over the uploaded images and stream the result as CSV.

    filename: optional download name for the CSV attachment
              (defaults to medical_billing_extract.csv).
    Raises HTTP 400 for an empty upload, 415 for unsupported extensions,
    and 500 when the pipeline itself fails.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files provided.")

    # Validate extensions early (not bulletproof, but helpful)
    bad = [f.filename for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
    if bad:
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
        )

    # Read blobs in-memory
    blobs = []
    filenames = []
    for f in files:
        blobs.append(await f.read())
        filenames.append(f.filename or "upload.bin")

    try:
        rows = process_images_to_rows(blobs, filenames)
        csv_bytes = rows_to_csv_bytes(rows)
        out_name = filename or "medical_billing_extract.csv"
        # Sanitize the caller-supplied name: it is interpolated into a raw HTTP
        # header, so path components, quotes and CR/LF must not pass through
        # (header-injection risk otherwise).
        out_name = os.path.basename(out_name).replace('"', "").replace("\r", "").replace("\n", "")
        return StreamingResponse(
            io.BytesIO(csv_bytes),
            media_type="text/csv",
            headers={"Content-Disposition": f'attachment; filename="{out_name}"'}
        )
    except Exception as e:
        # Chain the original exception so server logs show the real failure cause.
        raise HTTPException(status_code=500, detail=f"Processing error: {e}") from e
|
||||||
77
apps/PaymentOCRService/app/pipeline_adapter.py
Normal file
77
apps/PaymentOCRService/app/pipeline_adapter.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
from typing import List, Dict
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Import your existing functions directly from complete_pipeline.py
|
||||||
|
from complete_pipeline import (
|
||||||
|
smart_deskew_with_lines,
|
||||||
|
extract_all_clients_from_lines,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _process_single_image_bytes(blob: bytes, display_name: str) -> List[Dict]:
    """
    Saves bytes to a temp file (so OpenCV + Google Vision can read it),
    runs your existing pipeline functions, and returns extracted rows.

    blob: raw image bytes as uploaded.
    display_name: original filename; used for the temp-file suffix and the
        "Source File" column.
    Returns a list of row dicts; never empty — a placeholder row with
    'Extraction Success': False is appended when nothing parses.
    """
    # Preserve the extension so downstream readers pick the right decoder.
    suffix = os.path.splitext(display_name)[1] or ".jpg"
    tmp_path = None
    try:
        # delete=False so the file survives the `with` and can be re-opened
        # by the pipeline; cleanup happens in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(blob)
            tmp_path = tmp.name

        # Uses Google Vision + deskew + post-line grouping
        info = smart_deskew_with_lines(tmp_path, None, clamp_deg=30.0, use_vision=True)
        post_lines = info.get("post_lines", []) if info else []
        rows = extract_all_clients_from_lines(post_lines) if post_lines else []

        # Add source file information (same as your Streamlit app)
        for r in rows:
            r["Source File"] = display_name

        # If nothing parsed, still return a placeholder row to indicate failure (optional)
        if not rows:
            rows.append({
                'Patient Name': "", 'Patient ID': "", 'ICN': "", 'CDT Code': "",
                'Tooth': "", 'Date SVC': "",
                'Billed Amount': "", 'Allowed Amount': "", 'Paid Amount': "",
                'Extraction Success': False, 'Source File': display_name,
            })

        return rows

    finally:
        # Best-effort temp-file cleanup; a leaked temp file is preferable to
        # masking the pipeline's result with an unlink error.
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except Exception:
                pass
|
||||||
|
|
||||||
|
def process_images_to_rows(blobs: List[bytes], filenames: List[str]) -> List[Dict]:
    """
    Public API used by FastAPI routes.

    blobs: list of image bytes
    filenames: matching names for display / Source File column
    """
    collected: List[Dict] = []
    for payload, display_name in zip(blobs, filenames):
        collected.extend(_process_single_image_bytes(payload, display_name))
    return collected
|
||||||
|
|
||||||
|
def rows_to_csv_bytes(rows: List[Dict]) -> bytes:
    """
    Convert pipeline rows to CSV bytes (for frontend to consume as a table).

    Known columns come first in the same order as the Excel export; any
    unexpected columns are appended after them in their original order.
    """
    frame = pd.DataFrame(rows)
    # Mirrors the Excel export's column ordering where those columns exist.
    preferred_order = [
        'Patient Name', 'Patient ID', 'ICN', 'CDT Code', 'Tooth', 'Date SVC',
        'Billed Amount', 'Allowed Amount', 'Paid Amount',
        'Extraction Success', 'Source File'
    ]
    known = [c for c in preferred_order if c in frame.columns]
    extras = [c for c in frame.columns if c not in preferred_order]
    frame = frame[known + extras]
    return frame.to_csv(index=False).encode("utf-8")
|
||||||
837
apps/PaymentOCRService/complete_pipeline.py
Normal file
837
apps/PaymentOCRService/complete_pipeline.py
Normal file
@@ -0,0 +1,837 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
End-to-end local pipeline (single script)
|
||||||
|
|
||||||
|
- One Google Vision pass per image (DOCUMENT_TEXT_DETECTION)
|
||||||
|
- Smart deskew (Hough + OCR pairs) with fine grid search (in-memory)
|
||||||
|
- Build slope-aware (pre) and horizontal (post) line dumps (in-memory)
|
||||||
|
- Extract all clients & PD rows per page (robust to headers/EOBS)
|
||||||
|
- Export nicely formatted Excel via ExcelGenerator
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python ocr_pipeline.py --input "C:\\imgs" --out "results.xlsx"
|
||||||
|
python ocr_pipeline.py --files s1.jpg s2.jpg --out results.xlsx
|
||||||
|
python ocr_pipeline.py --input "C:\\imgs" --out results.xlsx --deskewed-only
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import io
|
||||||
|
import cv2
|
||||||
|
import math
|
||||||
|
import glob
|
||||||
|
import argparse
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from typing import List, Dict, Tuple, Any, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# ========= Debug switch =========
# Set to True to re-enable saving deskewed images, writing *_lines_*.txt,
# and printing progress messages. Kept False in production so the pipeline
# is purely in-memory and leaves no artifacts next to the input images.
DEBUG = False
|
||||||
|
|
||||||
|
# ---------- Google Vision ----------
|
||||||
|
from google.cloud import vision
|
||||||
|
|
||||||
|
# ---------- openpyxl helpers ----------
|
||||||
|
from openpyxl.utils import get_column_letter
|
||||||
|
from openpyxl.cell.cell import MergedCell
|
||||||
|
from openpyxl import Workbook
|
||||||
|
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
|
||||||
|
from openpyxl.utils.dataframe import dataframe_to_rows
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Config (tuning)
|
||||||
|
# ============================================================
|
||||||
|
# Perpendicular-distance tolerance for assigning a word to a fitted line,
# expressed as a fraction of the median word height.
PERP_TOL_FACTOR = 0.6
# Vertical seed-search band (in median word heights) for line candidates.
SEED_BAND_H = 3.0
# Emit one-word lines instead of discarding isolated words.
ALLOW_SINGLETON = True

# Post-rotation grouping: max |Δy| between word centers, in median heights.
POST_Y_TOL_FACTOR = 0.55
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Vision OCR (ONE pass per image)
|
||||||
|
# ============================================================
|
||||||
|
def _open_bytes(path: str) -> bytes:
    """Read the file at *path* fully into memory as raw bytes."""
    with open(path, "rb") as handle:
        return handle.read()
|
||||||
|
|
||||||
|
def extract_words_and_text(image_path: str) -> Tuple[List[Dict], str]:
    """
    Run one Google Vision DOCUMENT_TEXT_DETECTION pass on the image.

    Returns (words, full_text): full_text is Vision's whole-page transcript;
    each word dict carries its text plus axis-aligned box geometry in pixels —
    left/top, width/height (w/h) and center (cx/cy).
    Raises RuntimeError when the Vision response reports an error.
    """
    client = vision.ImageAnnotatorClient()
    resp = client.document_text_detection(image=vision.Image(content=_open_bytes(image_path)))
    if resp.error.message:
        raise RuntimeError(resp.error.message)

    full_text = resp.full_text_annotation.text or ""

    words: List[Dict] = []
    # Walk Vision's page > block > paragraph > word hierarchy and flatten it.
    for page in resp.full_text_annotation.pages:
        for block in page.blocks:
            for para in block.paragraphs:
                for word in para.words:
                    text = "".join(s.text for s in word.symbols)
                    vs = word.bounding_box.vertices
                    # Axis-aligned bounding box from the (possibly rotated) quad.
                    xs = [v.x for v in vs]; ys = [v.y for v in vs]
                    left, top = min(xs), min(ys)
                    w, h = max(xs) - left, max(ys) - top
                    cx, cy = left + w/2.0, top + h/2.0
                    words.append({"text": text, "left": left, "top": top,
                                  "w": w, "h": h, "cx": cx, "cy": cy})
    return words, full_text
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Skew estimation (Hough + OCR pairs)
|
||||||
|
# ============================================================
|
||||||
|
def weighted_median(pairs: List[Tuple[float, float]]) -> float:
    """Return the weighted median of (value, weight) pairs; 0.0 for empty input."""
    if not pairs:
        return 0.0
    ordered = sorted(pairs, key=lambda item: item[0])
    half_weight = sum(weight for _, weight in ordered) / 2.0
    running = 0.0
    for value, weight in ordered:
        running += weight
        if running >= half_weight:
            return value
    # Fallback (unreachable with positive weights): largest value.
    return ordered[-1][0]
|
||||||
|
|
||||||
|
def estimate_skew_pairs(words: List[Dict],
                        y_band_mult: float = 2.0,
                        min_dx_mult: float = 0.8,
                        max_abs_deg: float = 15.0) -> Tuple[float, int]:
    """
    Estimate page skew (degrees) from OCR word geometry.

    For each word, find its nearest rightward neighbor on (roughly) the same
    text line and measure the angle of the segment joining their centers;
    the weighted median of those angles (weighted by horizontal distance,
    after IQR outlier trimming) is the skew estimate.

    Returns (angle_deg, sample_count); (0.0, 0) when no usable pairs exist.
    """
    if not words:
        return 0.0, 0
    widths = [w["w"] for w in words if w["w"] > 0]
    heights = [w["h"] for w in words if w["h"] > 0]
    w_med = float(np.median(widths) if widths else 10.0)
    h_med = float(np.median(heights) if heights else 16.0)
    # Same-line tolerance and minimum horizontal separation, scaled to the page.
    y_band = y_band_mult * h_med
    min_dx = max(4.0, min_dx_mult * w_med)

    words_sorted = sorted(words, key=lambda w: (w["cy"], w["cx"]))
    pairs: List[Tuple[float, float]] = []
    for i, wi in enumerate(words_sorted):
        best_j = None; best_dx = None
        for j in range(i + 1, len(words_sorted)):
            wj = words_sorted[j]
            dy = wj["cy"] - wi["cy"]
            # Sorted by cy, so once dy exceeds the band no later word can match.
            if dy > y_band: break
            if abs(dy) <= y_band:
                dx = wj["cx"] - wi["cx"]
                # Only rightward neighbors far enough away to give a stable angle.
                if dx <= 0 or dx < min_dx: continue
                if best_dx is None or dx < best_dx:
                    best_dx, best_j = dx, j
        if best_j is None: continue
        wj = words_sorted[best_j]
        dx = wj["cx"] - wi["cx"]; dy = wj["cy"] - wi["cy"]
        ang = math.degrees(math.atan2(dy, dx))
        # Reject implausible angles; weight the sample by its horizontal span.
        if abs(ang) <= max_abs_deg:
            pairs.append((ang, max(1.0, dx)))

    if not pairs:
        return 0.0, 0
    # IQR trim to drop outlier angles (e.g. pairs spanning different lines).
    vals = np.array([v for v, _ in pairs], dtype=float)
    q1, q3 = np.percentile(vals, [25, 75]); iqr = q3 - q1
    lo, hi = q1 - 1.5 * iqr, q3 + 1.5 * iqr
    trimmed = [(v, w) for v, w in pairs if lo <= v <= hi] or pairs
    return float(weighted_median(trimmed)), len(trimmed)
|
||||||
|
|
||||||
|
def estimate_skew_hough(img: np.ndarray, thr: int = 180) -> Tuple[float, int]:
    """
    Estimate page skew (degrees) from straight edges via a Hough transform.

    Returns (angle_deg, line_count); (0.0, 0) when no lines are detected.
    """
    g = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    g = cv2.GaussianBlur(g, (3, 3), 0)
    edges = cv2.Canny(g, 60, 160, apertureSize=3)
    lines = cv2.HoughLines(edges, 1, np.pi/180, threshold=thr)
    if lines is None: return 0.0, 0
    angs = []
    for (rho, theta) in lines[:, 0, :]:
        # Convert the Hough normal angle into deviation from horizontal.
        ang = (theta - np.pi/2.0) * 180.0/np.pi
        # Fold into (-45, 45] so vertical rules vote like horizontal ones.
        while ang > 45: ang -= 90
        while ang < -45: ang += 90
        angs.append(ang)
    angs = np.array(angs, dtype=float)
    med = float(np.median(angs))
    # Re-median after discarding angles more than 10° from the first median.
    keep = angs[np.abs(angs - med) <= 10.0]
    return (float(np.median(keep)) if keep.size else med), int(angs.size)
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Rotation (image + coordinates) and scoring
|
||||||
|
# ============================================================
|
||||||
|
def rotation_matrix_keep_bounds(shape_hw: Tuple[int, int], angle_deg: float) -> Tuple[np.ndarray, Tuple[int, int]]:
    """
    Build a 2x3 affine rotation about the image center that also enlarges the
    canvas so no content is clipped. Returns (matrix, (new_h, new_w)).
    """
    h, w = shape_hw
    center = (w/2.0, h/2.0)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    cos, sin = abs(M[0, 0]), abs(M[0, 1])
    # Bounding box of the rotated image.
    new_w = int(h*sin + w*cos)
    new_h = int(h*cos + w*sin)
    # Shift so the rotated content is centered in the enlarged canvas.
    M[0, 2] += (new_w/2) - center[0]
    M[1, 2] += (new_h/2) - center[1]
    return M, (new_h, new_w)
|
||||||
|
|
||||||
|
def rotate_image_keep_bounds(img: np.ndarray, angle_deg: float, border_value=255) -> np.ndarray:
    """
    Rotate *img* by *angle_deg* without clipping; newly exposed border pixels
    are filled with *border_value* (255 = white, matching paper background).
    """
    M, (nh, nw) = rotation_matrix_keep_bounds(img.shape[:2], angle_deg)
    return cv2.warpAffine(img, M, (nw, nh),
                          flags=cv2.INTER_LINEAR,
                          borderMode=cv2.BORDER_CONSTANT,
                          borderValue=border_value)
|
||||||
|
|
||||||
|
def transform_words(words: List[Dict], shape_hw: Tuple[int, int], angle_deg: float) -> List[Dict]:
    """
    Map each word's center through the same bounds-keeping rotation applied to
    the image, adding "cx_rot"/"cy_rot" keys. Input dicts are not mutated.
    """
    M, _ = rotation_matrix_keep_bounds(shape_hw, angle_deg)
    transformed = []
    for word in words:
        rx, ry = (M @ np.array([word["cx"], word["cy"], 1.0])).tolist()
        rotated_word = dict(word)
        rotated_word["cx_rot"] = float(rx)
        rotated_word["cy_rot"] = float(ry)
        transformed.append(rotated_word)
    return transformed
|
||||||
|
|
||||||
|
def preview_score(img: np.ndarray, deskew_angle: float) -> float:
    """
    Cost of a candidate deskew angle: the residual Hough skew measured after
    rotating a small preview of the image. Lower is better; returns 90.0 when
    no lines are detected (so line-less candidates are never preferred).
    """
    h, w = img.shape[:2]
    scale = 1200.0 / max(h, w)
    # Score on a <=1200px preview to keep the grid search cheap.
    small = cv2.resize(img, (int(w*scale), int(h*scale)), interpolation=cv2.INTER_AREA) if scale < 1 else img
    rot = rotate_image_keep_bounds(small, deskew_angle, border_value=255)
    resid, n = estimate_skew_hough(rot, thr=140)
    return abs(resid) if n > 0 else 90.0
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Slope-based clustering (pre-rotation)
|
||||||
|
# ============================================================
|
||||||
|
def line_from_points(p0, p1):
    """
    Return (slope, intercept) of the line through p0 and p1.

    A vertical line is encoded as (inf, x0) where x0 is the shared x coordinate.
    """
    (x0, y0), (x1, y1) = p0, p1
    run = x1 - x0
    if abs(run) < 1e-9:
        return float("inf"), x0
    slope = (y1 - y0) / run
    return slope, y0 - slope * x0
|
||||||
|
|
||||||
|
def perp_distance(m, b, x, y):
    """
    Perpendicular distance from point (x, y) to the line y = m*x + b.

    For a vertical line (m infinite), b holds the line's x coordinate.
    """
    if math.isinf(m):
        return abs(x - b)
    numerator = abs(m * x - y + b)
    return numerator / math.sqrt(m * m + 1.0)
|
||||||
|
|
||||||
|
def refit_line(points: List[Tuple[float, float]]) -> Tuple[float, float]:
    """
    Least-squares line fit through *points*, returned as (slope, intercept).

    A single point yields a horizontal line through it; points sharing one
    x coordinate yield a vertical line encoded as (inf, x).
    """
    if len(points) == 1:
        _, y0 = points[0]
        return 0.0, y0
    n = len(points)
    mean_x = sum(p[0] for p in points) / n
    mean_y = sum(p[1] for p in points) / n
    cov = sum((px - mean_x) * (py - mean_y) for px, py in points)
    var_x = sum((px - mean_x) ** 2 for px, _ in points)
    if abs(var_x) < 1e-12:
        return float("inf"), mean_x
    slope = cov / var_x
    return slope, mean_y - slope * mean_x
|
||||||
|
|
||||||
|
def project_t(m, b, x0, y0, x, y):
    """
    Scalar position of (x, y) along the unit direction of a line with slope m,
    measured from origin point (x0, y0). For a vertical line (m infinite) the
    position is simply the y offset. b is unused for finite slopes; it is kept
    so the signature matches the other line helpers.
    """
    if math.isinf(m):
        return y - y0
    dx, dy = x - x0, y - y0
    return (dx + m * dy) / math.sqrt(1 + m * m)
|
||||||
|
|
||||||
|
def _build_line_result(words, idxs, m, b, rotated=False):
    """
    Assemble a line dict from the word indices *idxs* lying on line y = m*x + b.

    Words are ordered by their scalar projection along the line direction so
    the joined text reads left-to-right; when *rotated* is True the
    post-rotation coordinates (cx_rot/cy_rot) are used instead of cx/cy.
    Returns {"text", "words", "slope", "center_x", "center_y", "count"}.
    """
    # The leftmost word anchors the projection origin.
    origin_idx = min(idxs, key=lambda i: (words[i]["cx_rot"] if rotated else words[i]["cx"]))
    x0 = words[origin_idx]["cx_rot"] if rotated else words[origin_idx]["cx"]
    y0 = words[origin_idx]["cy_rot"] if rotated else words[origin_idx]["cy"]

    # Order words by distance along the line, not by raw x, so tilted lines
    # still read in the correct sequence.
    ordered = sorted(
        idxs,
        key=lambda i: project_t(
            m, b, x0, y0,
            words[i]["cx_rot"] if rotated else words[i]["cx"],
            words[i]["cy_rot"] if rotated else words[i]["cy"]
        )
    )
    line_words = [words[i] for i in ordered]
    text = " ".join(w["text"] for w in line_words)

    xs = [(w["cx_rot"] if rotated else w["cx"]) for w in line_words]
    ys = [(w["cy_rot"] if rotated else w["cy"]) for w in line_words]
    return {
        "text": text,
        "words": line_words,
        "slope": m,
        "center_x": float(sum(xs)/len(xs)),
        "center_y": float(sum(ys)/len(ys)),
        "count": len(line_words),
    }
|
||||||
|
|
||||||
|
def cluster_tilted_lines(words: List[Dict]) -> List[Dict]:
    """
    Group OCR words into (possibly tilted) text lines before deskewing.

    RANSAC-like per seed word: try lines through the seed and its nearest
    horizontal neighbors, keep the hypothesis with the most inliers (words
    within PERP_TOL_FACTOR * median-height perpendicular distance), refit by
    least squares, then absorb any further inliers. Returns line dicts
    (see _build_line_result) sorted top-to-bottom by center_y.
    """
    if not words: return []
    hs = sorted([w["h"] for w in words if w["h"] > 0])
    h_med = hs[len(hs)//2] if hs else 16.0
    # Tolerances scale with median word height so the clustering is DPI-agnostic.
    perp_tol = PERP_TOL_FACTOR * h_med
    band_dy = SEED_BAND_H * h_med

    remaining = set(range(len(words)))
    order = sorted(remaining, key=lambda i: (words[i]["cy"], words[i]["cx"]))
    lines = []

    while remaining:
        # Seed = topmost-leftmost unassigned word.
        seed_idx = next(i for i in order if i in remaining)
        remaining.remove(seed_idx)
        sx, sy = words[seed_idx]["cx"], words[seed_idx]["cy"]

        cand_idxs = [j for j in remaining if abs(words[j]["cy"] - sy) <= band_dy]
        if not cand_idxs:
            # Isolated word: keep it as a one-word line when allowed.
            if ALLOW_SINGLETON:
                m, b = refit_line([(sx, sy)])
                lines.append(_build_line_result(words, {seed_idx}, m, b))
            continue

        # Hypothesize lines through the seed and its 10 nearest-x candidates;
        # keep the hypothesis with the largest inlier set.
        cand_idxs.sort(key=lambda j: abs(words[j]["cx"] - sx))
        best_inliers = None; best_mb = None
        for j in cand_idxs[:min(10, len(cand_idxs))]:
            m, b = line_from_points((sx, sy), (words[j]["cx"], words[j]["cy"]))
            inliers = {seed_idx, j}
            for k in remaining:
                xk, yk = words[k]["cx"], words[k]["cy"]
                if perp_distance(m, b, xk, yk) <= perp_tol:
                    inliers.add(k)
            if best_inliers is None or len(inliers) > len(best_inliers):
                best_inliers, best_mb = inliers, (m, b)

        # Refit by least squares over the winning inliers for a stabler slope.
        m, b = best_mb
        pts = [(words[i]["cx"], words[i]["cy"]) for i in best_inliers]
        m, b = refit_line(pts)

        # Absorb any additional words close to the refitted line.
        expanded = set(best_inliers)
        for idx in list(remaining):
            xk, yk = words[idx]["cx"], words[idx]["cy"]
            if perp_distance(m, b, xk, yk) <= perp_tol:
                expanded.add(idx)

        for idx in expanded:
            if idx in remaining:
                remaining.remove(idx)
        lines.append(_build_line_result(words, expanded, m, b))

    lines.sort(key=lambda L: L["center_y"])
    return lines
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Post-rotation grouping (simple horizontal lines)
|
||||||
|
# ============================================================
|
||||||
|
def group_horizontal_lines(rotated_words: List[Dict]) -> List[Dict]:
    """
    Group words into horizontal lines after deskewing.

    Words (carrying cx_rot/cy_rot from transform_words) are sorted by y then x
    and swept top-to-bottom: a word joins the current line while its y center
    stays within POST_Y_TOL_FACTOR * median-height of the line's FIRST word.
    Returns line dicts (see _build_line_result) sorted by center_y.
    """
    if not rotated_words: return []
    hs = sorted([w["h"] for w in rotated_words if w["h"] > 0])
    h_med = hs[len(hs)//2] if hs else 16.0
    y_tol = POST_Y_TOL_FACTOR * h_med

    idxs = list(range(len(rotated_words)))
    idxs.sort(key=lambda i: (rotated_words[i]["cy_rot"], rotated_words[i]["cx_rot"]))
    lines = []
    cur = []

    def flush():
        # Finalize the current run of indices into a line dict.
        nonlocal cur
        if not cur: return
        xs = [rotated_words[i]["cx_rot"] for i in cur]
        ys = [rotated_words[i]["cy_rot"] for i in cur]
        m, b = refit_line(list(zip(xs, ys)))
        cur_sorted = sorted(cur, key=lambda i: rotated_words[i]["cx_rot"])
        lines.append(_build_line_result(rotated_words, set(cur_sorted), m, b, rotated=True))
        cur = []

    for i in idxs:
        if not cur:
            cur = [i]
        else:
            # Compare against the line's first word, not a running mean —
            # keeps the tolerance anchored to where the line started.
            y0 = rotated_words[cur[0]]["cy_rot"]
            yi = rotated_words[i]["cy_rot"]
            if abs(yi - y0) <= y_tol:
                cur.append(i)
            else:
                flush()
                cur = [i]
    flush()
    lines.sort(key=lambda L: L["center_y"])
    return lines
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Utilities: dump lines to txt (only if DEBUG)
|
||||||
|
# ============================================================
|
||||||
|
def slope_to_deg(m: float) -> float:
    """Convert a line slope to its angle in degrees; an infinite slope maps to 90."""
    return 90.0 if math.isinf(m) else math.degrees(math.atan(m))
|
||||||
|
|
||||||
|
def write_lines_txt(base_path: str, suffix: str, lines: List[Dict]) -> Optional[str]:
    """
    Dump clustered lines to '<base>_<suffix>.txt' next to the source image.

    Debug-only: returns None without writing anything unless the module-level
    DEBUG flag is True. Otherwise returns the path written.
    """
    if not DEBUG:
        return None
    txt_path = f"{os.path.splitext(base_path)[0]}_{suffix}.txt"
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(f"# {os.path.basename(base_path)} ({suffix})\n")
        for i, L in enumerate(lines, 1):
            ang = slope_to_deg(L["slope"])
            f.write(f"[{i:03d}] words={L['count']:>3} slope={ang:+.3f}°\n")
            f.write(L["text"] + "\n\n")
    return txt_path
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Smart deskew + full pipeline (in-memory; returns words + full_text)
|
||||||
|
# ============================================================
|
||||||
|
def smart_deskew_with_lines(image_path: str,
                            out_path: Optional[str] = None,
                            clamp_deg: float = 30.0,
                            use_vision: bool = True) -> Dict:
    """
    Full per-image pass: OCR (optional), skew estimation, deskew, line grouping.

    image_path: image readable by cv2.imread.
    out_path: where to save the deskewed image — only written when DEBUG is on.
    clamp_deg: candidate angles are clamped to [-clamp_deg, clamp_deg].
    use_vision: run Google Vision OCR; when False no words/lines are produced.

    Returns a dict with the chosen "angle_deg", estimator sample counts,
    "words"/"full_text" from OCR, and "pre_lines" (slope-aware, pre-rotation) /
    "post_lines" (horizontal, post-rotation) line groupings.
    Raises FileNotFoundError when the image cannot be read.
    """
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None: raise FileNotFoundError(image_path)

    words, full_text = ([], "")
    if use_vision:
        # Single Vision pass per image; reused for both line groupings below.
        words, full_text = extract_words_and_text(image_path)

    # Two independent skew estimators: image edges (Hough) and OCR word pairs.
    a_h, n_h = estimate_skew_hough(img)
    a_p, n_p = (0.0, 0)
    if words:
        a_p, n_p = estimate_skew_pairs(words, y_band_mult=2.0, min_dx_mult=0.8, max_abs_deg=15.0)

    # Only trust an estimator with >= 10 samples; try both signs of each angle.
    candidates = []
    if n_h >= 10: candidates += [a_h, -a_h]
    if n_p >= 10: candidates += [a_p, -a_p]
    if not candidates: candidates = [0.0]

    # Clamp and dedupe (0.05° resolution).
    cand = []
    for a in candidates:
        a = float(max(-clamp_deg, min(clamp_deg, a)))
        if all(abs(a - b) > 0.05 for b in cand):
            cand.append(a)

    # Fine grid: +/-0.6° around each candidate in 0.2° steps.
    grid = []
    for a in cand:
        for d in (-0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6):
            g = a + d
            if all(abs(g - x) > 0.05 for x in grid):
                grid.append(g)

    # Pick the angle whose rotation leaves the smallest residual Hough skew.
    scored = [(a, preview_score(img, -a)) for a in grid]
    best_angle, best_cost = min(scored, key=lambda t: t[1])

    # Debug print kept as a comment
    # print(f"[smart] hough={a_h:.3f}°(n={n_h}) pairs={a_p:.3f}°(n={n_p}) tried={', '.join(f'{a:+.2f}°' for a,_ in scored)} → chosen {best_angle:+.2f}° (cost={best_cost:.3f})")

    # Rotate in-memory. Save only if DEBUG.
    rotated = rotate_image_keep_bounds(img, -best_angle, border_value=255)
    if DEBUG and out_path:
        cv2.imwrite(out_path, rotated)

    result = {
        "angle_deg": float(best_angle),
        "hough_lines": int(n_h),
        "pair_samples": int(n_p),
        "out_path": out_path if DEBUG else None,
        "pre_txt": None,
        "post_txt": None,
        "pre_lines": [],
        "post_lines": [],
        "words": words,
        "full_text": full_text,
    }

    if words:
        # Slope-aware grouping on the original (pre-rotation) coordinates.
        pre_lines = cluster_tilted_lines(words)
        result["pre_lines"] = pre_lines
        result["pre_txt"] = write_lines_txt(image_path, "lines_pre", pre_lines)  # only if DEBUG

        # Horizontal grouping after mapping word centers through the rotation.
        rot_words = transform_words(words, img.shape[:2], -best_angle)
        post_lines = group_horizontal_lines(rot_words)
        result["post_lines"] = post_lines
        result["post_txt"] = write_lines_txt(image_path, "lines_post", post_lines)  # only if DEBUG

        # More debug prints kept as comments
        # def preview(lines, tag):
        #     print(f"  {tag} ({len(lines)} lines)")
        #     for L in lines[:5]:
        #         ang = slope_to_deg(L["slope"])
        #         print(f"    [{L['count']:>3} w] slope={ang:+.3f}° | {L['text'][:90]}")
        # preview(pre_lines, "pre (slope-aware)")
        # preview(post_lines, "post (horizontal)")
        # if DEBUG:
        #     print(f"  → wrote: {result['pre_txt']} and {result['post_txt']}")

    return result
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Multi-client extraction from post lines (robust)
|
||||||
|
# ============================================================
|
||||||
|
# "MEMBER NAME: <rest of line>" header; group(1) may also contain trailing
# fields that extraction trims off later.
MEMBER_RE = re.compile(r'\bMEMBER NAME\s*:\s*(.+)', re.IGNORECASE)
# "MEMBER ID: <alphanumeric>" anywhere in a line.
MEMBERID_RE = re.compile(r'\bMEMBER ID\s*:\s*([A-Za-z0-9]+)', re.IGNORECASE)
# A line beginning with a 12+ digit number — treated as an ICN.
ICN_LINE_RE = re.compile(r'^\s*\d{12,}\b')

AMOUNT_RE = re.compile(r'(\d{1,3}(?:,\d{3})*\.\d{2})')  # decimals only
DATE6_RE = re.compile(r'\b\d{6}\b')  # 6-digit service date (format MMDDYY presumed — TODO confirm)
PD_ROW_RE = re.compile(r'\bPD\s+(D?\d{4})\b', re.IGNORECASE)  # "PD D1234" / "PD 1234" procedure rows
TOOTH_RE = re.compile(r'^(?:[1-9]|[12][0-9]|3[0-2]|[A-Ta-t])$')  # permanent 1-32 or primary A-T
SURFACE_RE = re.compile(r'^[MDBOILFP]{1,4}$', re.IGNORECASE)  # tooth-surface letter codes
|
||||||
|
|
||||||
|
def _to_float(s: str) -> float:
    """
    Parse a currency-style amount string (e.g. "1,234.56") into a float.

    Returns 0.0 when the value is malformed or not a string — OCR output is
    noisy, so unparseable amounts degrade to zero rather than aborting a page.
    """
    try:
        return float(s.replace(',', ''))
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: non-string input; ValueError: bad number.
        # Narrowed from a bare `except Exception` so programming errors
        # (e.g. NameError) are no longer silently swallowed.
        return 0.0
|
||||||
|
|
||||||
|
def _parse_pd_line(t: str) -> Optional[Tuple[str, Optional[float], Optional[float], Optional[float], Optional[str], Optional[str], Optional[str]]]:
    """
    Parse a single PD line.
    Returns: (CDT, billed, allowed, paid, date6, tooth, surface)
    Returns None when the line contains no "PD <code>" marker.
    """
    m = PD_ROW_RE.search(t)
    if not m:
        return None

    # Normalize the CDT code to the "Dnnnn" form.
    code = m.group(1)
    code = code if code.upper().startswith('D') else f'D{code}'

    # The last three decimal amounts on the line are billed/allowed/paid
    # (earlier decimals, if any, are other columns).
    amts = [_to_float(x) for x in AMOUNT_RE.findall(t)]
    billed = allowed = paid = None
    if len(amts) >= 3:
        billed, allowed, paid = amts[-3:]

    # First 6-digit run is taken as the service date.
    d = None
    md = DATE6_RE.search(t)
    if md:
        d = md.group(0)

    tooth = None
    surface = None

    # Locate the code token so tooth/surface can be searched after it.
    tokens = t.split()
    try:
        code_idx = tokens.index(code)
    except ValueError:
        # OCR may have dropped the leading 'D'; fall back to any token that
        # would match the PD-row pattern.
        code_idx = None
        for i, tok in enumerate(tokens):
            if PD_ROW_RE.match(f'PD {tok}'):
                code_idx = i
                break

    if code_idx is not None:
        # Tooth/surface live between the code and the date (when present).
        date_idx = None
        for i in range(code_idx + 1, len(tokens)):
            if DATE6_RE.fullmatch(tokens[i]):
                date_idx = i
                break

        window = tokens[code_idx + 1: date_idx if date_idx is not None else len(tokens)]

        for tok in window:
            if TOOTH_RE.fullmatch(tok):
                tooth = tok.upper()
                break

        # Surface is only searched after the tooth token, so a leading "M"
        # etc. cannot be misread as a surface before the tooth is seen.
        start_j = 0
        if tooth is not None:
            for j, tok in enumerate(window):
                if tok.upper() == tooth:
                    start_j = j + 1
                    break
        for tok in window[start_j:]:
            if SURFACE_RE.fullmatch(tok):
                surface = tok.upper()
                break

    return code, billed, allowed, paid, d, tooth, surface
|
||||||
|
|
||||||
|
def extract_all_clients_from_lines(post_lines: List[dict]) -> List[dict]:
    """
    Split strictly by MEMBER NAME lines; ignore anything before the first name.
    For each member block, look up ICN from the nearest line above the member header.
    Parse each PD line for CDT, Date SVC, Billed, Allowed, Paid (decimals only).

    post_lines: list of line dicts; only the "text" key is read here.
    Returns one output row per PD line, or one placeholder row per member
    block that contained no parsable PD line.
    """
    texts = [L["text"] for L in post_lines]
    # Indexes of lines that start a member block.
    starts = [i for i,t in enumerate(texts) if MEMBER_RE.search(t)]
    if not starts:
        return []

    out_rows = []

    for si, start in enumerate(starts):
        # Block spans from this member header up to (not including) the next.
        end = starts[si+1] if si+1 < len(starts) else len(texts)

        # header line with MEMBER NAME
        name_line = texts[start]
        raw_name = MEMBER_RE.search(name_line).group(1).strip()
        # Stop at "MEMBER ID" (case-insensitive) and other headers that OCR
        # may have merged onto the name line.
        cut_points = ["MEMBER ID", "OTH INS CD", "PA:", "DIAG:"]
        mname = raw_name
        for cp in cut_points:
            idx = mname.upper().find(cp)
            if idx != -1:
                mname = mname[:idx].strip()
        # Debug
        # print(raw_name); print(mname)

        # member id: search within the block
        mid = ""
        for t in texts[start:end]:
            m = MEMBERID_RE.search(t)
            if m:
                mid = m.group(1).strip()
                break

        # ICN: search a few lines ABOVE the member header (up to 5 lines back).
        icn = ""
        for k in range(start-1, max(-1, start-6), -1):
            if k < 0: break
            mm = ICN_LINE_RE.match(texts[k])
            if mm:
                icn = mm.group(0)
                break

        # PD lines in the block
        had_pd = False
        for t in texts[start:end]:
            # Cheap pre-filter: only lines containing the standalone token "PD".
            if " PD " not in f" {t} ":
                continue
            parsed = _parse_pd_line(t)
            if not parsed:
                continue
            had_pd = True
            code, billed, allowed, paid, dsvc, tooth, surface = parsed
            out_rows.append({
                'Patient Name': mname.title() if mname else "",
                'Patient ID': mid,
                'ICN': icn,
                'CDT Code': code,
                'Tooth': tooth if tooth else "",
                #'Surface': surface if surface else "",
                'Date SVC': dsvc if dsvc else "",
                'Billed Amount': billed if billed is not None else "",
                'Allowed Amount': allowed if allowed is not None else "",
                'Paid Amount': paid if paid is not None else "",
                'Extraction Success': True,
            })

        # No PD rows: still emit one placeholder row so the member appears in
        # the output; success only if we at least recovered a name or id.
        if not had_pd:
            out_rows.append({
                'Patient Name': mname.title() if mname else "",
                'Patient ID': mid,
                'ICN': icn,
                'CDT Code': "",
                'Tooth': "",
                #'Surface': "",
                'Date SVC': "",
                'Billed Amount': "",
                'Allowed Amount': "",
                'Paid Amount': "",
                'Extraction Success': bool(mname or mid),
            })

    return out_rows
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# ExcelGenerator
|
||||||
|
# ============================================================
|
||||||
|
class ExcelGenerator:
    """Render extraction rows into a styled .xlsx workbook via openpyxl.

    Produces a main "Medical Billing Extract" sheet (title banner, styled
    header, bordered data rows, conditional success coloring) plus a
    "Summary" sheet with basic counts.
    """

    def __init__(self):
        # Shared style objects, built once and reused across all sheets.
        self.header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
        self.header_font = Font(color="FFFFFF", bold=True)
        self.border = Border(
            left=Side(style='thin'),
            right=Side(style='thin'),
            top=Side(style='thin'),
            bottom=Side(style='thin')
        )
        self.center_alignment = Alignment(horizontal='center', vertical='center')

    def create_excel_file(self, df: pd.DataFrame) -> bytes:
        """Build the workbook from *df* and return it as xlsx bytes."""
        wb = Workbook()
        ws = wb.active
        ws.title = "Medical Billing Extract"
        # Layout: row 1 = title banner, row 2 = spacer, row 3 = column header,
        # rows 4..len(df)+3 = data. format_worksheet relies on this layout.
        ws['A1'] = f"Medical Billing OCR Extract - Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        ws.merge_cells('A1:H1')
        ws['A1'].font = Font(size=14, bold=True)
        ws['A1'].alignment = self.center_alignment
        ws.append([])

        excel_df = self.prepare_dataframe_for_excel(df)
        for r in dataframe_to_rows(excel_df, index=False, header=True):
            ws.append(r)

        self.format_worksheet(ws, len(excel_df) + 3)
        self.add_summary_sheet(wb, excel_df)

        output = io.BytesIO()
        wb.save(output)
        output.seek(0)
        return output.getvalue()

    def prepare_dataframe_for_excel(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* reordered and formatted for display.

        Only columns present in *df* are kept; amounts become "$x,xxx.xx"
        strings and the success flag becomes 'Yes'/'No'.
        """
        excel_df = df.copy()
        column_order = [
            'Patient Name', 'Patient ID', 'ICN', 'CDT Code', 'Tooth', 'Date SVC', #'Surface',
            'Billed Amount', 'Allowed Amount', 'Paid Amount',
            'Extraction Success', 'Source File'
        ]
        existing = [c for c in column_order if c in excel_df.columns]
        excel_df = excel_df[existing]
        for amount_col in ['Billed Amount', 'Allowed Amount', 'Paid Amount']:
            if amount_col in excel_df.columns:
                excel_df[amount_col] = excel_df[amount_col].apply(self.format_currency)
        if 'Extraction Success' in excel_df.columns:
            excel_df['Extraction Success'] = excel_df['Extraction Success'].apply(lambda x: 'Yes' if x else 'No')
        return excel_df

    def format_currency(self, value):
        """Format *value* as "$1,234.56"; blanks stay blank, junk passes through as str."""
        if pd.isna(value) or value == "":
            return ""
        try:
            if isinstance(value, str):
                # Tolerate already-formatted input like "$1,234.50".
                clean_value = value.replace('$', '').replace(',', '')
                value = float(clean_value)
            return f"${value:,.2f}"
        except (ValueError, TypeError):
            return str(value)

    def format_worksheet(self, ws, data_rows):
        """Style the header row and data rows; *data_rows* is the last data row index."""
        header_row = 3  # see create_excel_file layout
        for cell in ws[header_row]:
            if cell.value:
                cell.fill = self.header_fill
                cell.font = self.header_font
                cell.alignment = self.center_alignment
                cell.border = self.border
        for row in range(header_row + 1, data_rows + 1):
            for cell in ws[row]:
                cell.border = self.border
                cell.alignment = Alignment(horizontal='left', vertical='center')
        self.auto_adjust_columns(ws)
        self.add_conditional_formatting(ws, header_row, data_rows)

    def auto_adjust_columns(self, ws):
        """Size each column to its longest cell value (capped at 50 chars)."""
        max_col = ws.max_column
        max_row = ws.max_row
        for col_idx in range(1, max_col + 1):
            max_len = 0
            for row in range(1, max_row + 1):
                cell = ws.cell(row=row, column=col_idx)
                if isinstance(cell, MergedCell):
                    # Merged cells have no independent width contribution.
                    continue
                try:
                    val = cell.value
                    if val is None:
                        continue
                    max_len = max(max_len, len(str(val)))
                except Exception:
                    pass
            letter = get_column_letter(col_idx)
            ws.column_dimensions[letter].width = min(max_len + 2, 50)

    def add_conditional_formatting(self, ws, header_row, data_rows):
        """Color the 'Extraction Success' column: green for 'Yes', pink for 'No'."""
        success_col = None
        for col, cell in enumerate(ws[header_row], 1):
            if cell.value == 'Extraction Success':
                success_col = col
                break
        if success_col:
            for row in range(header_row + 1, data_rows + 1):
                cell = ws.cell(row=row, column=success_col)
                if cell.value == 'Yes':
                    cell.fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid")
                elif cell.value == 'No':
                    cell.fill = PatternFill(start_color="FFB6C1", end_color="FFB6C1", fill_type="solid")

    def add_summary_sheet(self, wb, df):
        """Append a 'Summary' sheet with total/success/failure counts."""
        ws = wb.create_sheet(title="Summary")
        ws['A1'] = "Extraction Summary"
        ws['A1'].font = Font(size=16, bold=True)
        ws.merge_cells('A1:B1')
        row = 3
        stats = [
            ("Total Rows", len(df)),
            ("Successful", len(df[df['Extraction Success'] == 'Yes']) if 'Extraction Success' in df.columns else 0),
            ("Failed", len(df[df['Extraction Success'] == 'No']) if 'Extraction Success' in df.columns else 0),
        ]
        for name, val in stats:
            ws[f'A{row}'] = name
            ws[f'B{row}'] = val
            ws[f'A{row}'].font = Font(bold=True)
            row += 1
        # Bug fix: reuse this instance instead of constructing a throwaway
        # ExcelGenerator() (which needlessly rebuilt all style objects).
        self.auto_adjust_columns(ws)
        row += 2
        ws[f'A{row}'] = f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        ws[f'A{row}'].font = Font(italic=True)
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Runner: glue everything together
|
||||||
|
# ============================================================
|
||||||
|
def process_images_to_excel(files: List[str], out_excel: str, deskewed_only: bool=False) -> None:
    """Run the full pipeline over *files* and write a styled Excel to *out_excel*.

    deskewed_only=True skips the deskew step and OCRs the images as-is;
    otherwise each image is deskewed first via smart_deskew_with_lines.
    A failed image contributes one 'Extraction Success': False row instead
    of aborting the batch.
    """
    excel_gen = ExcelGenerator()
    records: List[Dict[str, Any]] = []

    for src in files:
        try:
            if deskewed_only:
                # Validate the image is readable before paying for OCR.
                img = cv2.imread(src, cv2.IMREAD_COLOR)
                if img is None:
                    raise FileNotFoundError(src)
                # extract_words_and_text: project OCR helper (Vision); returns
                # word boxes with center coords — semantics defined elsewhere.
                words, _ = extract_words_and_text(src)
                # No rotation applied: copy centers straight into the *_rot
                # fields that group_horizontal_lines expects.
                rot_words = []
                for w in words:
                    ww = dict(w)
                    ww["cx_rot"], ww["cy_rot"] = w["cx"], w["cy"]
                    rot_words.append(ww)
                post_lines = group_horizontal_lines(rot_words)

                post_txt = write_lines_txt(src, "lines_post", post_lines)  # only if DEBUG

                rows = extract_all_clients_from_lines(post_lines)
                for r in rows:
                    r["Source File"] = os.path.basename(src)
                    records.append(r)
                # if DEBUG: print(f"{src} → parsed {len(rows)} PD rows (wrote {post_txt})")

            else:
                # Deskew path; the deskewed copy is only written in DEBUG mode.
                base, ext = os.path.splitext(src)
                dst = f"{base}_deskewed{ext if ext else '.jpg'}" if DEBUG else None
                info = smart_deskew_with_lines(src, dst, clamp_deg=30.0, use_vision=True)
                post_lines = info.get("post_lines", []) if info else []
                rows = extract_all_clients_from_lines(post_lines) if post_lines else []
                for r in rows:
                    r["Source File"] = os.path.basename(src)
                    records.append(r)
                # if DEBUG: print(f"{src} → rotated by {-info['angle_deg']:.3f}° → {dst}")

        except Exception as e:
            # Best-effort batch: record the failure as a row and keep going.
            # if DEBUG: print(f"{src}: {e}")
            records.append({
                'Patient Name': "", 'Patient ID': "", 'ICN': "", 'CDT Code': "",
                'Date SVC': "", 'Billed Amount': "", 'Allowed Amount': "", 'Paid Amount': "",
                'Extraction Success': False, 'Source File': os.path.basename(src),
            })

    df = pd.DataFrame.from_records(records)
    data = excel_gen.create_excel_file(df)
    with open(out_excel, "wb") as f:
        f.write(data)
    # if DEBUG:
    #     print(f"\n✅ Wrote Excel → {out_excel}")
    #     print("   (and per-image: *_lines_pre.txt, *_lines_post.txt, *_deskewed.* when DEBUG=True)")
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# CLI
|
||||||
|
# ============================================================
|
||||||
|
def main():
    """CLI entry point: collect image paths from --files/--input and run the pipeline."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", help="Folder of images (jpg/png/tif).", default=None)
    ap.add_argument("--files", nargs="*", help="Specific image files.", default=None)
    ap.add_argument("--out", help="Output Excel path.", required=True)
    ap.add_argument("--deskewed-only", action="store_true",
                    help="Only process files whose name contains '_deskewed'; skip deskew step.")
    args = ap.parse_args()

    # Explicit files first, then anything matching the image globs in --input.
    paths: List[str] = []
    if args.files:
        paths.extend(f for f in args.files if os.path.isfile(f))
    if args.input and os.path.isdir(args.input):
        for pattern in ("*.jpg", "*.jpeg", "*.png", "*.tif", "*.tiff", "*.bmp"):
            paths.extend(glob.glob(os.path.join(args.input, pattern)))

    if args.deskewed_only:
        paths = [p for p in paths if "_deskewed" in os.path.basename(p).lower()]

    if not paths:
        raise SystemExit("No input images found. Use --files or --input (and --deskewed-only if desired).")

    if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
        # Missing Vision credentials: kept silent by design; the client will
        # surface its own error later.
        pass

    process_images_to_excel(paths, args.out, deskewed_only=args.deskewed_only)


if __name__ == "__main__":
    main()
|
||||||
8
apps/PaymentOCRService/package.json
Normal file
8
apps/PaymentOCRService/package.json
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"name": "pdfservice",
|
||||||
|
"private": true,
|
||||||
|
"scripts": {
|
||||||
|
"postinstall": "pip install -r requirements.txt",
|
||||||
|
"dev": "python main.py"
|
||||||
|
}
|
||||||
|
}
|
||||||
10
apps/PaymentOCRService/requirements.txt
Normal file
10
apps/PaymentOCRService/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
fastapi
|
||||||
|
uvicorn[standard]
|
||||||
|
google-cloud-vision
|
||||||
|
opencv-python-headless
|
||||||
|
pytesseract
|
||||||
|
pillow
|
||||||
|
pandas
|
||||||
|
openpyxl
|
||||||
|
numpy
|
||||||
|
python-multipart
|
||||||
BIN
apps/ProcedureCodeFromMhPdf/MH.pdf
Normal file
BIN
apps/ProcedureCodeFromMhPdf/MH.pdf
Normal file
Binary file not shown.
5
apps/ProcedureCodeFromMhPdf/Readme.md
Normal file
5
apps/ProcedureCodeFromMhPdf/Readme.md
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
This code was written solely to extract procedure-code data from the MassHealth PDF, to make that process easier.
|
||||||
|
|
||||||
|
It was only a one-time process and is not used as core functionality anywhere in this app.
|
||||||
|
|
||||||
|
Keeping it around, since we might need to run the extraction again in the future.
|
||||||
96
apps/ProcedureCodeFromMhPdf/compareJson.py
Normal file
96
apps/ProcedureCodeFromMhPdf/compareJson.py
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Compare a main dental JSON file with one or more other JSON files and
|
||||||
|
return all records whose 'Procedure Code' is NOT present in the main file.
|
||||||
|
|
||||||
|
- Matching key: 'Procedure Code' (case-insensitive, trimmed).
|
||||||
|
- Keeps the full record from the other files (including extra fields like 'Full Price').
|
||||||
|
- Deduplicates by Procedure Code across the collected "missing" results.
|
||||||
|
|
||||||
|
CONFIG: set MAIN_PATH, OTHER_PATHS, OUT_PATH below.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
|
||||||
|
# =========================
|
||||||
|
# CONFIG — EDIT THESE ONLY
|
||||||
|
# =========================
|
||||||
|
MAIN_PATH = "procedureCodesMain.json" # your main JSON (with PriceLTEQ21/PriceGT21)
|
||||||
|
OTHER_PATHS = [
|
||||||
|
"procedureCodesOld.json", # one or more other JSON files to compare against the main
|
||||||
|
# "other2.json",
|
||||||
|
]
|
||||||
|
OUT_PATH = "not_in_main.json" # where to write the results
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
|
||||||
|
def _load_json_any(path: str) -> List[Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Load JSON. Accept:
|
||||||
|
- a list of objects
|
||||||
|
- a single object (wraps into a list)
|
||||||
|
"""
|
||||||
|
with open(path, "r", encoding="utf-8") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
if isinstance(data, dict):
|
||||||
|
return [data]
|
||||||
|
if isinstance(data, list):
|
||||||
|
# filter out non-dict items defensively
|
||||||
|
return [x for x in data if isinstance(x, dict)]
|
||||||
|
raise ValueError(f"Unsupported JSON top-level type in {path}: {type(data)}")
|
||||||
|
|
||||||
|
|
||||||
|
def _norm_code(record: Dict[str, Any]) -> str:
|
||||||
|
# Normalize the 'Procedure Code' for matching
|
||||||
|
code = str(record.get("Procedure Code", "")).strip().upper()
|
||||||
|
# Some PDFs might have stray spaces, tabs, or zero-width chars
|
||||||
|
code = "".join(ch for ch in code if not ch.isspace())
|
||||||
|
return code
|
||||||
|
|
||||||
|
|
||||||
|
def collect_main_codes(main_path: str) -> set:
    """Return the set of non-empty normalized procedure codes in the main file.

    Fix: the original set comprehension evaluated _norm_code(rec) twice per
    record (once for the value, once for the filter); normalize once instead.
    """
    codes = set()
    for rec in _load_json_any(main_path):
        code = _norm_code(rec)
        if code:
            codes.add(code)
    return codes
|
||||||
|
|
||||||
|
|
||||||
|
def collect_missing_records(other_paths: List[str], main_codes: set) -> List[Dict[str, Any]]:
    """Collect records from *other_paths* whose code is absent from *main_codes*.

    The first full record seen per code wins (deduplication across files);
    the result is returned sorted by normalized code for stable output.
    """
    missing: Dict[str, Dict[str, Any]] = {}
    for path in other_paths:
        for record in _load_json_any(path):
            code_norm = _norm_code(record)
            if not code_norm:
                continue
            if code_norm in main_codes or code_norm in missing:
                continue
            # Keep the full original record (including extra fields).
            missing[code_norm] = record
    return [missing[code] for code in sorted(missing)]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the comparison configured at the top of this file and report results."""
    # Fail fast if any configured input is absent.
    if not Path(MAIN_PATH).exists():
        raise FileNotFoundError(f"Main file not found: {MAIN_PATH}")
    for other in OTHER_PATHS:
        if not Path(other).exists():
            raise FileNotFoundError(f"Other file not found: {other}")

    main_codes = collect_main_codes(MAIN_PATH)
    missing_records = collect_missing_records(OTHER_PATHS, main_codes)

    payload = json.dumps(missing_records, ensure_ascii=False, indent=2)
    with open(OUT_PATH, "w", encoding="utf-8") as out_file:
        out_file.write(payload)

    print(f"Main codes: {len(main_codes)}")
    print(f"Missing from main: {len(missing_records)}")
    print(f"Wrote results to {OUT_PATH}")
    # Also echo to stdout
    print(payload)


if __name__ == "__main__":
    main()
|
||||||
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Normal file
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
import re
|
||||||
|
import json
|
||||||
|
from typing import List, Dict
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
|
||||||
|
|
||||||
|
# =========================
|
||||||
|
# CONFIG — EDIT THESE ONLY
|
||||||
|
# =========================
|
||||||
|
PDF_PATH = "MH.pdf" # path to your PDF
|
||||||
|
PAGES = [2] # 0-based page indexes to parse, e.g., [2] for the page you showed
|
||||||
|
OUT_PATH = "output.json" # where to write JSON
|
||||||
|
FIRST_PRICE_IS_LTE21 = True # True => first price line is <=21; False => first price is >21
|
||||||
|
PRINT_PAGE_TEXT = False # set True if you want to print the raw page text for sanity check
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
|
||||||
|
# --- patterns ---
|
||||||
|
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
|
||||||
|
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
|
||||||
|
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
|
||||||
|
# lines that definitely start a notes block we should ignore once prices are done
|
||||||
|
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_ws(s: str) -> str:
    """Collapse all whitespace (incl. NBSP and newlines) to single spaces,
    then trim surrounding spaces and stray punctuation."""
    out = s.replace("\u00a0", " ")
    # Apply the collapses in order: runs of spaces/tabs, newline joins, leftovers.
    for pattern in (r"[ \t]+", r"\s*\n\s*", r"\s{2,}"):
        out = re.sub(pattern, " ", out)
    return out.strip(" ,.;:-•·\n\t")
|
||||||
|
|
||||||
|
|
||||||
|
def clean_money(token: str) -> str:
    """Normalize a price token: 'NC' (any case) stays 'NC'; otherwise strip
    '$', thousands commas, and surrounding whitespace."""
    if token.upper() == "NC":
        return "NC"
    without_commas = token.replace(",", "")
    return without_commas.lstrip("$").strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_page_lines(pdf_path: str, pages: List[int]) -> List[str]:
    """Return the raw text lines of the given 0-based *pages* of *pdf_path*.

    Raises ValueError if any page index is out of range.
    """
    doc = fitz.open(pdf_path)
    try:
        max_idx = len(doc) - 1
        for p in pages:
            if not 0 <= p <= max_idx:
                raise ValueError(f"Invalid page index {p}. Valid range is 0..{max_idx}.")
        collected: List[str] = []
        for p in pages:
            text = doc.load_page(p).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {p} ---\n{text}")
            # Keep line boundaries; the parser works line-by-line later.
            collected.extend(text.splitlines())
        return collected
    finally:
        doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """Scan *lines* for code/description/price groups and return one record each.

    Expected layout per record (as emitted by PyMuPDF text extraction):
    a lone D-code line, 1-3 description lines, then two price lines.
    Records without two reliable prices are skipped entirely.
    """
    out: List[Dict[str, str]] = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue

        code = mcode.group(1)
        i += 1

        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line inside description — consider description ended if the next is a price
                # but we don't advance here; break and let price parsing handle it
                break
            if code_line_re.match(s):
                # next code — no prices found; abandon this broken record
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1

        # advance i to where we left off
        i = j

        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until the next code/blank
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                # skip this block quickly
                i += 1
                # keep skipping subsequent non-empty, non-code lines until a blank or next code
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                # now let the outer loop proceed
                continue
            # unrecognized line: if prices already found, we can break; else skip
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        # Map the two prices onto the <=21 / >21 columns per config flag.
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                # next record will pick this up
                break
            i += 1

    return out
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_to_json(pdf_path: str, pages: List[int], out_path: str) -> List[Dict[str, str]]:
    """Parse the given pages of *pdf_path*, write the records to *out_path* as
    JSON, and return them."""
    records = extract_records(get_page_lines(pdf_path, pages))
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_to_json(PDF_PATH, PAGES, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))
|
||||||
208
apps/ProcedureCodeFromMhPdf/extract_byrange.py
Normal file
208
apps/ProcedureCodeFromMhPdf/extract_byrange.py
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
MassHealth dental PDF parser (PyMuPDF / fitz) — PAGE RANGE VERSION
|
||||||
|
|
||||||
|
Parses rows like:
|
||||||
|
|
||||||
|
D2160
|
||||||
|
Amalgam-three surfaces,
|
||||||
|
primary or permanent
|
||||||
|
$110
|
||||||
|
$92
|
||||||
|
Y
|
||||||
|
Y
|
||||||
|
...
|
||||||
|
|
||||||
|
Outputs a single JSON with records from the chosen page range (inclusive).
|
||||||
|
|
||||||
|
Config:
|
||||||
|
- PDF_PATH: path to the PDF
|
||||||
|
- PAGE_START, PAGE_END: 1-based page numbers (inclusive)
|
||||||
|
- FIRST_PRICE_IS_LTE21: True => first price line is <=21; False => first price is >21
|
||||||
|
- OUT_PATH: output JSON path
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
from typing import List, Dict
|
||||||
|
import fitz # PyMuPDF
|
||||||
|
|
||||||
|
|
||||||
|
# =========================
|
||||||
|
# CONFIG — EDIT THESE ONLY
|
||||||
|
# =========================
|
||||||
|
PDF_PATH = "MH.pdf" # path to your PDF
|
||||||
|
PAGE_START = 1 # 1-based inclusive start page (e.g., 1)
|
||||||
|
PAGE_END = 12 # 1-based inclusive end page (e.g., 5)
|
||||||
|
OUT_PATH = "output.json" # single JSON file containing all parsed rows
|
||||||
|
FIRST_PRICE_IS_LTE21 = True # True => first price line is <=21; False => first price is >21
|
||||||
|
PRINT_PAGE_TEXT = False # set True to print raw text for each page
|
||||||
|
# =========================
|
||||||
|
|
||||||
|
|
||||||
|
# --- patterns ---
|
||||||
|
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
|
||||||
|
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
|
||||||
|
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
|
||||||
|
# lines that definitely start a notes block to ignore once prices are done
|
||||||
|
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_ws(s: str) -> str:
    """Flatten all whitespace in *s* to single spaces and trim edge punctuation."""
    flattened = s.replace("\u00a0", " ")
    flattened = re.sub(r"[ \t]+", " ", flattened)      # runs of spaces/tabs
    flattened = re.sub(r"\s*\n\s*", " ", flattened)    # join wrapped lines
    flattened = re.sub(r"\s{2,}", " ", flattened)      # any leftovers
    return flattened.strip(" ,.;:-•·\n\t")
|
||||||
|
|
||||||
|
|
||||||
|
def clean_money(token: str) -> str:
    """Return a bare numeric price string, or the literal 'NC' for no-charge."""
    return "NC" if token.upper() == "NC" else token.replace(",", "").lstrip("$").strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_page_lines(pdf_path: str, page_start_1b: int, page_end_1b: int) -> List[str]:
    """Return the text lines of pages *page_start_1b*..*page_end_1b* (1-based,
    inclusive) of *pdf_path*. Raises ValueError for an invalid range."""
    if page_start_1b <= 0 or page_end_1b <= 0:
        raise ValueError("PAGE_START and PAGE_END must be >= 1 (1-based).")
    if page_start_1b > page_end_1b:
        raise ValueError("PAGE_START cannot be greater than PAGE_END.")

    doc = fitz.open(pdf_path)
    try:
        last_idx_0b = len(doc) - 1
        # convert the 1-based inclusive bounds to 0-based
        start_0b, end_0b = page_start_1b - 1, page_end_1b - 1
        if start_0b < 0 or end_0b > last_idx_0b:
            raise ValueError(f"Page range out of bounds. Valid 1-based range is 1..{last_idx_0b + 1}.")
        collected: List[str] = []
        for page_no in range(start_0b, end_0b + 1):
            page_text = doc.load_page(page_no).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {page_no} (0-based; shown as {page_no+1} 1-based) ---\n{page_text}")
            collected.extend(page_text.splitlines())
        return collected
    finally:
        doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """Scan *lines* for code/description/price groups and return one record each.

    Same scanner as extract_bypage.py: a lone D-code line, 1-3 description
    lines, then two price lines. Records without two reliable prices are
    skipped entirely.
    """
    out: List[Dict[str, str]] = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue

        code = mcode.group(1)
        i += 1

        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                break
            if code_line_re.match(s):
                # next code — description ended abruptly (malformed)
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1

        # advance i to where we left off
        i = j

        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until a blank or next code
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                i += 1
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                continue
            # unrecognized line: if we already captured some prices, break; else skip
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        # Map the two prices onto the <=21 / >21 columns per config flag.
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                break
            i += 1

    return out
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pdf_range_to_json(pdf_path: str, page_start_1b: int, page_end_1b: int, out_path: str) -> List[Dict[str, str]]:
    """Extract procedure records from a 1-based page range of *pdf_path*.

    The lines of the requested pages are pulled via ``get_page_lines``,
    parsed into records by ``extract_records``, written to *out_path* as
    pretty-printed UTF-8 JSON, and also returned to the caller.

    Args:
        pdf_path: Path of the source PDF.
        page_start_1b: First page to read (1-based, inclusive).
        page_end_1b: Last page to read (1-based, inclusive).
        out_path: Destination file for the JSON dump.

    Returns:
        The list of extracted record dicts.
    """
    page_lines = get_page_lines(pdf_path, page_start_1b, page_end_1b)
    records = extract_records(page_lines)
    # Persist alongside returning, so callers can use the data immediately.
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(records, out_file, ensure_ascii=False, indent=2)
    return records
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the configured extraction and echo the result.
    rows = extract_pdf_range_to_json(PDF_PATH, PAGE_START, PAGE_END, OUT_PATH)
    print(f"Wrote {len(rows)} rows to {OUT_PATH}")
    print(json.dumps(rows, ensure_ascii=False, indent=2))
|
||||||
192
apps/ProcedureCodeFromMhPdf/not_in_main.json
Normal file
192
apps/ProcedureCodeFromMhPdf/not_in_main.json
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0120",
|
||||||
|
"Description": "perio exam",
|
||||||
|
"Price": "105"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0140",
|
||||||
|
"Description": "limited exam",
|
||||||
|
"Price": "90"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0150",
|
||||||
|
"Description": "comprehensive exam",
|
||||||
|
"Price": "120"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0210",
|
||||||
|
"Description": "Fmx.",
|
||||||
|
"Price": "120"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0220",
|
||||||
|
"Description": "first PA.",
|
||||||
|
"Price": "60"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0230",
|
||||||
|
"Description": "2nd PA.",
|
||||||
|
"Price": "50"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0272",
|
||||||
|
"Description": "2 BW",
|
||||||
|
"Price": "80"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0274",
|
||||||
|
"Description": "4BW",
|
||||||
|
"Price": "160"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0330",
|
||||||
|
"Description": "pano",
|
||||||
|
"Price": "150"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0364",
|
||||||
|
"Description": "Less than one jaw",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0365",
|
||||||
|
"Description": "Mand",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0366",
|
||||||
|
"Description": "Max",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0367",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0368",
|
||||||
|
"Description": "include TMJ",
|
||||||
|
"Price": "375"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0380",
|
||||||
|
"Description": "Less than one jaw",
|
||||||
|
"Price": "300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0381",
|
||||||
|
"Description": "Mand",
|
||||||
|
"Price": "300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0382",
|
||||||
|
"Description": "Max",
|
||||||
|
"Price": "300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0383",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1110",
|
||||||
|
"Description": "adult prophy",
|
||||||
|
"Price": "150"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1120",
|
||||||
|
"Description": "child prophy",
|
||||||
|
"Price": "120"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1208",
|
||||||
|
"Description": "FL",
|
||||||
|
"Price": "90"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1351",
|
||||||
|
"Description": "sealant",
|
||||||
|
"Price": "80"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1999",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "50"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2140",
|
||||||
|
"Description": "amalgam, one surface",
|
||||||
|
"Price": "150"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2150",
|
||||||
|
"Description": "amalgam, two surface",
|
||||||
|
"Price": "200"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2955",
|
||||||
|
"Description": "post renoval",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D4910",
|
||||||
|
"Description": "perio maintains",
|
||||||
|
"Price": "250"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5510",
|
||||||
|
"Description": "Repair broken complete denture base (QUAD)",
|
||||||
|
"Price": "400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6056",
|
||||||
|
"Description": "pre fab abut",
|
||||||
|
"Price": "750"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6057",
|
||||||
|
"Description": "custom abut",
|
||||||
|
"Price": "800"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6058",
|
||||||
|
"Description": "porcelain, implant crown, ceramic crown",
|
||||||
|
"Price": "1400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6059",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "1400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6100",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "320"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6110",
|
||||||
|
"Description": "implant",
|
||||||
|
"Price": "1600"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6242",
|
||||||
|
"Description": "noble metal. For united",
|
||||||
|
"Price": "1400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6245",
|
||||||
|
"Description": "porcelain, not for united",
|
||||||
|
"Price": "1400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7910",
|
||||||
|
"Description": "suture, small wound up to 5 mm",
|
||||||
|
"Price": "400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7950",
|
||||||
|
"Description": "max",
|
||||||
|
"Price": "800"
|
||||||
|
}
|
||||||
|
]
|
||||||
1026
apps/ProcedureCodeFromMhPdf/procedureCodes.json
Normal file
1026
apps/ProcedureCodeFromMhPdf/procedureCodes.json
Normal file
File diff suppressed because it is too large
Load Diff
344
apps/ProcedureCodeFromMhPdf/procedureCodesOld.json
Normal file
344
apps/ProcedureCodeFromMhPdf/procedureCodesOld.json
Normal file
@@ -0,0 +1,344 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1999",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "50"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0120",
|
||||||
|
"Description": "perio exam",
|
||||||
|
"Price": "105"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0140",
|
||||||
|
"Description": "limited exam",
|
||||||
|
"Price": "90"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0150",
|
||||||
|
"Description": "comprehensive exam",
|
||||||
|
"Price": "120"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0210",
|
||||||
|
"Description": "Fmx.",
|
||||||
|
"Price": "120"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0220",
|
||||||
|
"Description": "first PA.",
|
||||||
|
"Price": "60"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0230",
|
||||||
|
"Description": "2nd PA.",
|
||||||
|
"Price": "50"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0330",
|
||||||
|
"Description": "pano",
|
||||||
|
"Price": "150"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0272",
|
||||||
|
"Description": "2 BW",
|
||||||
|
"Price": "80"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0274",
|
||||||
|
"Description": "4BW",
|
||||||
|
"Price": "160"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1110",
|
||||||
|
"Description": "adult prophy",
|
||||||
|
"Price": "150"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1120",
|
||||||
|
"Description": "child prophy",
|
||||||
|
"Price": "120"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1351",
|
||||||
|
"Description": "sealant",
|
||||||
|
"Price": "80"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D4341",
|
||||||
|
"Description": "srp",
|
||||||
|
"Price": "250"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D4910",
|
||||||
|
"Description": "perio maintains",
|
||||||
|
"Price": "250"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D1208",
|
||||||
|
"Description": "FL",
|
||||||
|
"Price": "90"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2330",
|
||||||
|
"Description": "front composite. 1 s.",
|
||||||
|
"Price": "180"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2331",
|
||||||
|
"Description": "2s",
|
||||||
|
"Price": "220"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2332",
|
||||||
|
"Description": "3s",
|
||||||
|
"Price": "280"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2335",
|
||||||
|
"Description": "4s or more",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2391",
|
||||||
|
"Description": "back. 1s",
|
||||||
|
"Price": "200"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2392",
|
||||||
|
"Description": "2s",
|
||||||
|
"Price": "250"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2393",
|
||||||
|
"Description": "3s",
|
||||||
|
"Price": "280"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2394",
|
||||||
|
"Description": "4s",
|
||||||
|
"Price": "320"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2140",
|
||||||
|
"Description": "amalgam, one surface",
|
||||||
|
"Price": "150"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2150",
|
||||||
|
"Description": "amalgam, two surface",
|
||||||
|
"Price": "200"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2750",
|
||||||
|
"Description": "high noble",
|
||||||
|
"Price": "1300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2751",
|
||||||
|
"Description": "base metal",
|
||||||
|
"Price": "1200"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2740",
|
||||||
|
"Description": "crown porcelain",
|
||||||
|
"Price": "1300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2954",
|
||||||
|
"Description": "p/c",
|
||||||
|
"Price": "450"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7910",
|
||||||
|
"Description": "suture, small wound up to 5 mm",
|
||||||
|
"Price": "400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5110",
|
||||||
|
"Description": "FU",
|
||||||
|
"Price": "1200",
|
||||||
|
"Full Price": "1700"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5120",
|
||||||
|
"Description": "FL",
|
||||||
|
"Price": "1700",
|
||||||
|
"Full Price": "1700"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5211",
|
||||||
|
"Description": "pu",
|
||||||
|
"Price": "1300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5212",
|
||||||
|
"Description": "pl",
|
||||||
|
"Price": "1300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5213",
|
||||||
|
"Description": "cast pu.",
|
||||||
|
"Price": "1700"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5214",
|
||||||
|
"Description": "cast pl",
|
||||||
|
"Price": "1700"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5510",
|
||||||
|
"Description": "Repair broken complete denture base (QUAD)",
|
||||||
|
"Price": "400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5520",
|
||||||
|
"Description": "Replace missing or broken teeth - complete denture (each tooth) (TOOTH)",
|
||||||
|
"Price": "200"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5750",
|
||||||
|
"Description": "lab reline",
|
||||||
|
"Price": "600"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D5730",
|
||||||
|
"Description": "chairside reline",
|
||||||
|
"Price": "500"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2920",
|
||||||
|
"Description": "re cement crown",
|
||||||
|
"Price": "120"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2950",
|
||||||
|
"Description": "core buildup",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D2955",
|
||||||
|
"Description": "post renoval",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6100",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "320"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6110",
|
||||||
|
"Description": "implant",
|
||||||
|
"Price": "1600"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6056",
|
||||||
|
"Description": "pre fab abut",
|
||||||
|
"Price": "750"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6057",
|
||||||
|
"Description": "custom abut",
|
||||||
|
"Price": "800"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6058",
|
||||||
|
"Description": "porcelain, implant crown, ceramic crown",
|
||||||
|
"Price": "1400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6059",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "1400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6242",
|
||||||
|
"Description": "noble metal. For united",
|
||||||
|
"Price": "1400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D6245",
|
||||||
|
"Description": "porcelain, not for united",
|
||||||
|
"Price": "1400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0367",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "400"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0364",
|
||||||
|
"Description": "Less than one jaw",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0365",
|
||||||
|
"Description": "Mand",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0366",
|
||||||
|
"Description": "Max",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0368",
|
||||||
|
"Description": "include TMJ",
|
||||||
|
"Price": "375"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0383",
|
||||||
|
"Description": "",
|
||||||
|
"Price": "350"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0380",
|
||||||
|
"Description": "Less than one jaw",
|
||||||
|
"Price": "300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0381",
|
||||||
|
"Description": "Mand",
|
||||||
|
"Price": "300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D0382",
|
||||||
|
"Description": "Max",
|
||||||
|
"Price": "300"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7950",
|
||||||
|
"Description": "max",
|
||||||
|
"Price": "800"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7140",
|
||||||
|
"Description": "simple ext",
|
||||||
|
"Price": "150"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7210",
|
||||||
|
"Description": "surgical ext",
|
||||||
|
"Price": "280"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7220",
|
||||||
|
"Description": "soft impacted",
|
||||||
|
"Price": "380"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7230",
|
||||||
|
"Description": "partial bony",
|
||||||
|
"Price": "450"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D7240",
|
||||||
|
"Description": "fully bony",
|
||||||
|
"Price": "550"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Procedure Code": "D3320",
|
||||||
|
"Description": "pre M RCT",
|
||||||
|
"Price": "1050"
|
||||||
|
}
|
||||||
|
]
|
||||||
Reference in New Issue
Block a user