structured well

This commit is contained in:
2025-08-29 18:16:51 +05:30
parent c9ad84c3a8
commit d89bee4f07
16 changed files with 3080 additions and 0 deletions

View File

View File

@@ -0,0 +1,13 @@
# Medical Billing OCR API (FastAPI)
## 1) Prereqs
- Google Cloud Vision service-account JSON.
- `GOOGLE_APPLICATION_CREDENTIALS` env var pointing to that JSON.
- Tesseract installed (for fallback OCR), and on PATH.
## 2) Install & run (local)
```bash
python -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt
export GOOGLE_APPLICATION_CREDENTIALS=/absolute/path/to/service-account.json
uvicorn app.main:app --reload --port 8080

View File

View File

@@ -0,0 +1,81 @@
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse
from typing import List, Optional
import io
import os
from app.pipeline_adapter import (
process_images_to_rows,
rows_to_csv_bytes,
)
app = FastAPI(
title="Medical Billing OCR API",
description="FastAPI wrapper around the complete OCR pipeline (Google Vision + deskew + line clustering + extraction).",
version="1.0.0",
)
ALLOWED_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}
@app.get("/health", response_class=PlainTextResponse)
def health():
    """Liveness probe; also reports whether GCP credentials appear configured."""
    creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "")
    return f"OK | GOOGLE_APPLICATION_CREDENTIALS set: {bool(creds_path)}"
@app.post("/extract/json")
async def extract_json(files: List[UploadFile] = File(...)):
    """Run the OCR pipeline on the uploaded images and return rows as JSON."""
    if not files:
        raise HTTPException(status_code=400, detail="No files provided.")
    # Cheap extension screen before reading any upload bytes.
    bad = [
        f.filename for f in files
        if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS
    ]
    if bad:
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
        )
    # Buffer every upload in memory before processing.
    blobs = []
    filenames = []
    for upload in files:
        blobs.append(await upload.read())
        filenames.append(upload.filename or "upload.bin")
    try:
        # rows is a list[dict] with the standard extraction columns.
        rows = process_images_to_rows(blobs, filenames)
        return JSONResponse(content={"rows": rows})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing error: {e}")
@app.post("/extract/csv")
async def extract_csv(files: List[UploadFile] = File(...), filename: Optional[str] = None):
    """Run the OCR pipeline on the uploaded images and stream the rows as a CSV download.

    `filename` optionally overrides the download name; it is sanitized so a
    client-chosen value cannot break or inject into the response header.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files provided.")
    bad = [f.filename for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
    if bad:
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
        )
    blobs = []
    filenames = []
    for f in files:
        blobs.append(await f.read())
        filenames.append(f.filename or "upload.bin")
    try:
        rows = process_images_to_rows(blobs, filenames)
        csv_bytes = rows_to_csv_bytes(rows)
        out_name = filename or "medical_billing_extract.csv"
        # Security: strip quotes and CR/LF so a client-supplied filename cannot
        # terminate the quoted-string or inject extra headers (header injection).
        out_name = "".join(ch for ch in out_name if ch not in '"\r\n') or "medical_billing_extract.csv"
        return StreamingResponse(
            io.BytesIO(csv_bytes),
            media_type="text/csv",
            headers={"Content-Disposition": f'attachment; filename="{out_name}"'}
        )
    except Exception as e:
        # Chain the original cause so server logs keep the real traceback.
        raise HTTPException(status_code=500, detail=f"Processing error: {e}") from e

View File

@@ -0,0 +1,77 @@
import os
import tempfile
from typing import List, Dict
import pandas as pd
# Import your existing functions directly from complete_pipeline.py
from complete_pipeline import (
smart_deskew_with_lines,
extract_all_clients_from_lines,
)
def _process_single_image_bytes(blob: bytes, display_name: str) -> List[Dict]:
    """
    Persist the upload to a temp file (OpenCV and Google Vision need a real
    path), run the existing deskew + extraction pipeline, and return rows.
    """
    suffix = os.path.splitext(display_name)[1] or ".jpg"
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(blob)
            tmp_path = tmp.name
        # Google Vision + deskew + post-rotation line grouping.
        info = smart_deskew_with_lines(tmp_path, None, clamp_deg=30.0, use_vision=True)
        post_lines = info.get("post_lines", []) if info else []
        rows = extract_all_clients_from_lines(post_lines) if post_lines else []
        # Tag every row with the originating upload (mirrors the Streamlit app).
        for row in rows:
            row["Source File"] = display_name
        if not rows:
            # Emit a placeholder row so callers can see which upload yielded nothing.
            rows.append({
                'Patient Name': "", 'Patient ID': "", 'ICN': "", 'CDT Code': "",
                'Tooth': "", 'Date SVC': "",
                'Billed Amount': "", 'Allowed Amount': "", 'Paid Amount': "",
                'Extraction Success': False, 'Source File': display_name,
            })
        return rows
    finally:
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except Exception:
                pass  # best-effort temp-file cleanup
def process_images_to_rows(blobs: List[bytes], filenames: List[str]) -> List[Dict]:
    """
    Public API used by FastAPI routes.
    blobs: list of image bytes; filenames: matching names for the
    display / 'Source File' column.
    """
    collected: List[Dict] = []
    for payload, name in zip(blobs, filenames):
        collected.extend(_process_single_image_bytes(payload, name))
    return collected
def rows_to_csv_bytes(rows: List[Dict]) -> bytes:
    """
    Serialize pipeline rows to UTF-8 CSV bytes (for the frontend table),
    keeping a stable, Excel-mirroring column order for known columns and
    appending any extras afterwards.
    """
    frame = pd.DataFrame(rows)
    preferred = [
        'Patient Name', 'Patient ID', 'ICN', 'CDT Code', 'Tooth', 'Date SVC',
        'Billed Amount', 'Allowed Amount', 'Paid Amount',
        'Extraction Success', 'Source File'
    ]
    ordered = [c for c in preferred if c in frame.columns]
    ordered += [c for c in frame.columns if c not in preferred]
    return frame[ordered].to_csv(index=False).encode("utf-8")

View File

@@ -0,0 +1,837 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
End-to-end local pipeline (single script)
- One Google Vision pass per image (DOCUMENT_TEXT_DETECTION)
- Smart deskew (Hough + OCR pairs) with fine grid search (in-memory)
- Build slope-aware (pre) and horizontal (post) line dumps (in-memory)
- Extract all clients & PD rows per page (robust to headers/EOBS)
- Export nicely formatted Excel via ExcelGenerator
Usage:
python ocr_pipeline.py --input "C:\\imgs" --out "results.xlsx"
python ocr_pipeline.py --files s1.jpg s2.jpg --out results.xlsx
python ocr_pipeline.py --input "C:\\imgs" --out results.xlsx --deskewed-only
"""
import os
import re
import io
import cv2
import math
import glob
import argparse
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Any, Optional
from datetime import datetime
# ========= Debug switch =========
# Set to True to re-enable saving deskewed images, writing *_lines_*.txt,
# and printing progress messages.
DEBUG = False
# ---------- Google Vision ----------
from google.cloud import vision
# ---------- openpyxl helpers ----------
from openpyxl.utils import get_column_letter
from openpyxl.cell.cell import MergedCell
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
# ============================================================
# Config (tuning)
# ============================================================
PERP_TOL_FACTOR = 0.6
SEED_BAND_H = 3.0
ALLOW_SINGLETON = True
POST_Y_TOL_FACTOR = 0.55
# ============================================================
# Vision OCR (ONE pass per image)
# ============================================================
def _open_bytes(path: str) -> bytes:
    """Read the file at *path* fully into memory as raw bytes."""
    with open(path, "rb") as handle:
        return handle.read()
def extract_words_and_text(image_path: str) -> Tuple[List[Dict], str]:
    """Run ONE Google Vision DOCUMENT_TEXT_DETECTION pass on the image.

    Returns (words, full_text): each word dict carries its text plus an
    axis-aligned bounding box (left/top/w/h) and its center (cx/cy) in pixels.
    Raises RuntimeError if the Vision API response carries an error message.
    """
    client = vision.ImageAnnotatorClient()
    resp = client.document_text_detection(image=vision.Image(content=_open_bytes(image_path)))
    if resp.error.message:
        raise RuntimeError(resp.error.message)
    full_text = resp.full_text_annotation.text or ""
    words: List[Dict] = []
    # Flatten Vision's page -> block -> paragraph -> word hierarchy into one flat list.
    for page in resp.full_text_annotation.pages:
        for block in page.blocks:
            for para in block.paragraphs:
                for word in para.words:
                    text = "".join(s.text for s in word.symbols)
                    vs = word.bounding_box.vertices
                    # Axis-aligned box from the (possibly rotated) quad vertices.
                    xs = [v.x for v in vs]; ys = [v.y for v in vs]
                    left, top = min(xs), min(ys)
                    w, h = max(xs) - left, max(ys) - top
                    cx, cy = left + w/2.0, top + h/2.0
                    words.append({"text": text, "left": left, "top": top,
                                  "w": w, "h": h, "cx": cx, "cy": cy})
    return words, full_text
# ============================================================
# Skew estimation (Hough + OCR pairs)
# ============================================================
def weighted_median(pairs: List[Tuple[float, float]]) -> float:
    """Weighted median of (value, weight) pairs; returns 0.0 for empty input."""
    if not pairs:
        return 0.0
    ordered = sorted(pairs, key=lambda item: item[0])
    half_weight = sum(weight for _, weight in ordered) / 2.0
    running = 0.0
    for value, weight in ordered:
        running += weight
        if running >= half_weight:
            return value
    # Numerical fallback: the largest value.
    return ordered[-1][0]
def estimate_skew_pairs(words: List[Dict],
                        y_band_mult: float = 2.0,
                        min_dx_mult: float = 0.8,
                        max_abs_deg: float = 15.0) -> Tuple[float, int]:
    """Estimate page skew (degrees) from OCR word-pair angles.

    For each word, find its nearest right-hand neighbour within a vertical
    band and take the angle of the connecting segment; the distance-weighted
    median of the IQR-trimmed angles is the estimate.
    Returns (angle_deg, n_pairs); (0.0, 0) when no usable pairs exist.
    """
    if not words: return 0.0, 0
    widths = [w["w"] for w in words if w["w"]>0]
    heights = [w["h"] for w in words if w["h"]>0]
    w_med = float(np.median(widths) if widths else 10.0)
    h_med = float(np.median(heights) if heights else 16.0)
    # Thresholds scale with the page's typical glyph size.
    y_band = y_band_mult * h_med
    min_dx = max(4.0, min_dx_mult * w_med)
    words_sorted = sorted(words, key=lambda w: (w["cy"], w["cx"]))
    pairs: List[Tuple[float,float]] = []
    for i, wi in enumerate(words_sorted):
        best_j = None; best_dx = None
        for j in range(i+1, len(words_sorted)):
            wj = words_sorted[j]
            dy = wj["cy"] - wi["cy"]
            # Sorted by cy, so once dy exceeds the band no later word qualifies.
            if dy > y_band: break
            if abs(dy) <= y_band:
                dx = wj["cx"] - wi["cx"]
                # Neighbour must be to the right and at least min_dx away.
                if dx <= 0 or dx < min_dx: continue
                if best_dx is None or dx < best_dx:
                    best_dx, best_j = dx, j
        if best_j is None: continue
        wj = words_sorted[best_j]
        dx = wj["cx"] - wi["cx"]; dy = wj["cy"] - wi["cy"]
        ang = math.degrees(math.atan2(dy, dx))
        if abs(ang) <= max_abs_deg:
            # Weight by horizontal span: longer baselines are more reliable.
            pairs.append((ang, max(1.0, dx)))
    if not pairs: return 0.0, 0
    # IQR-trim outlier angles before taking the weighted median.
    vals = np.array([v for v,_ in pairs], dtype=float)
    q1, q3 = np.percentile(vals, [25,75]); iqr = q3-q1
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    trimmed = [(v,w) for v,w in pairs if lo <= v <= hi] or pairs
    return float(weighted_median(trimmed)), len(trimmed)
def estimate_skew_hough(img: np.ndarray, thr: int = 180) -> Tuple[float, int]:
    """Estimate page skew (degrees) from Hough line angles.

    Returns (angle_deg, n_lines); (0.0, 0) when no lines are detected.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    edges = cv2.Canny(gray, 60, 160, apertureSize=3)
    detected = cv2.HoughLines(edges, 1, np.pi / 180, threshold=thr)
    if detected is None:
        return 0.0, 0
    angles = []
    for rho, theta in detected[:, 0, :]:
        deg = (theta - np.pi / 2.0) * 180.0 / np.pi
        # Fold into (-45, 45] so near-vertical rules read as small skews too.
        while deg > 45:
            deg -= 90
        while deg < -45:
            deg += 90
        angles.append(deg)
    angles = np.array(angles, dtype=float)
    center = float(np.median(angles))
    # Refine on lines within 10 degrees of the initial median.
    inliers = angles[np.abs(angles - center) <= 10.0]
    refined = float(np.median(inliers)) if inliers.size else center
    return refined, int(angles.size)
# ============================================================
# Rotation (image + coordinates) and scoring
# ============================================================
def rotation_matrix_keep_bounds(shape_hw: Tuple[int,int], angle_deg: float) -> Tuple[np.ndarray, Tuple[int,int]]:
    """Affine matrix rotating about the image center, with the canvas expanded
    so no content is cropped. Returns (M, (new_h, new_w))."""
    height, width = shape_hw
    center = (width / 2.0, height / 2.0)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    abs_cos = abs(M[0, 0])
    abs_sin = abs(M[0, 1])
    new_w = int(height * abs_sin + width * abs_cos)
    new_h = int(height * abs_cos + width * abs_sin)
    # Translate so the rotated content sits centered in the enlarged canvas.
    M[0, 2] += (new_w / 2) - center[0]
    M[1, 2] += (new_h / 2) - center[1]
    return M, (new_h, new_w)
def rotate_image_keep_bounds(img: np.ndarray, angle_deg: float, border_value=255) -> np.ndarray:
    """Rotate *img* by angle_deg without cropping; new border pixels get border_value."""
    matrix, (new_h, new_w) = rotation_matrix_keep_bounds(img.shape[:2], angle_deg)
    return cv2.warpAffine(
        img, matrix, (new_w, new_h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=border_value,
    )
def transform_words(words: List[Dict], shape_hw: Tuple[int,int], angle_deg: float) -> List[Dict]:
    """Map each word center through the keep-bounds rotation.

    Returns copies of the word dicts with cx_rot/cy_rot keys added.
    """
    matrix, _ = rotation_matrix_keep_bounds(shape_hw, angle_deg)
    rotated = []
    for word in words:
        x_new, y_new = (matrix @ np.array([word["cx"], word["cy"], 1.0])).tolist()
        updated = dict(word)
        updated["cx_rot"] = float(x_new)
        updated["cy_rot"] = float(y_new)
        rotated.append(updated)
    return rotated
def preview_score(img: np.ndarray, deskew_angle: float) -> float:
    """Cost of a candidate deskew angle: residual Hough skew measured on a
    downscaled rotated preview. Lower is better; 90.0 when no lines found."""
    h, w = img.shape[:2]
    scale = 1200.0 / max(h, w)
    if scale < 1:
        preview = cv2.resize(img, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_AREA)
    else:
        preview = img
    rotated = rotate_image_keep_bounds(preview, deskew_angle, border_value=255)
    residual, n_lines = estimate_skew_hough(rotated, thr=140)
    return abs(residual) if n_lines > 0 else 90.0
# ============================================================
# Slope-based clustering (pre-rotation)
# ============================================================
def line_from_points(p0, p1):
    """Fit y = m*x + b through two points; vertical lines return (inf, x0)."""
    (ax, ay), (bx, by) = p0, p1
    run = bx - ax
    if abs(run) < 1e-9:
        return float("inf"), ax
    slope = (by - ay) / run
    return slope, ay - slope * ax
def perp_distance(m, b, x, y):
    """Perpendicular distance from (x, y) to line y = m*x + b (x = b if m is inf)."""
    if math.isinf(m):
        return abs(x - b)
    return abs(m * x - y + b) / math.hypot(m, 1.0)
def refit_line(points: List[Tuple[float,float]]) -> Tuple[float,float]:
    """Least-squares (m, b) fit. A single point yields a horizontal line at
    its y; zero x-variance (vertical data) yields (inf, x_mean)."""
    if len(points) == 1:
        return 0.0, points[0][1]
    xs = [px for px, _ in points]
    ys = [py for _, py in points]
    x_mean = sum(xs) / len(xs)
    y_mean = sum(ys) / len(ys)
    covariance = sum((px - x_mean) * (py - y_mean) for px, py in points)
    variance = sum((px - x_mean) ** 2 for px in xs)
    if abs(variance) < 1e-12:
        return float("inf"), x_mean
    slope = covariance / variance
    return slope, y_mean - slope * x_mean
def project_t(m, b, x0, y0, x, y):
    """Scalar position of (x, y) along the line's direction, measured from (x0, y0)."""
    if math.isinf(m):
        return y - y0
    return ((x - x0) + m * (y - y0)) / math.sqrt(1 + m * m)
def _build_line_result(words, idxs, m, b, rotated=False):
    """Assemble a line dict from the word indices lying on fitted line y = m*x + b.

    Words are ordered by their projection along the line direction so text
    reads left-to-right even on tilted lines. With rotated=True the
    cx_rot/cy_rot coordinates are used instead of cx/cy.
    """
    # Use the left-most word as the projection origin.
    origin_idx = min(idxs, key=lambda i: (words[i]["cx_rot"] if rotated else words[i]["cx"]))
    x0 = words[origin_idx]["cx_rot"] if rotated else words[origin_idx]["cx"]
    y0 = words[origin_idx]["cy_rot"] if rotated else words[origin_idx]["cy"]
    ordered = sorted(
        idxs,
        key=lambda i: project_t(
            m, b, x0, y0,
            words[i]["cx_rot"] if rotated else words[i]["cx"],
            words[i]["cy_rot"] if rotated else words[i]["cy"]
        )
    )
    line_words = [words[i] for i in ordered]
    text = " ".join(w["text"] for w in line_words)
    xs = [(w["cx_rot"] if rotated else w["cx"]) for w in line_words]
    ys = [(w["cy_rot"] if rotated else w["cy"]) for w in line_words]
    return {
        "text": text,
        "words": line_words,
        "slope": m,
        "center_x": float(sum(xs)/len(xs)),
        "center_y": float(sum(ys)/len(ys)),
        "count": len(line_words),
    }
def cluster_tilted_lines(words: List[Dict]) -> List[Dict]:
    """Group words into slope-aware text lines (works on un-deskewed pages).

    Greedy RANSAC-like procedure: take the top-most unassigned word as a seed,
    try lines through each of its 10 horizontally-nearest neighbours, keep the
    candidate with the most inliers (within a perpendicular tolerance scaled
    by the median glyph height), refit on those inliers, then absorb any
    remaining words close to the refined line. Lines are returned top-to-bottom.
    """
    if not words: return []
    hs = sorted([w["h"] for w in words if w["h"]>0])
    h_med = hs[len(hs)//2] if hs else 16.0
    perp_tol = PERP_TOL_FACTOR * h_med   # max perpendicular distance to count as inlier
    band_dy = SEED_BAND_H * h_med        # vertical window for seed candidates
    remaining = set(range(len(words)))
    order = sorted(remaining, key=lambda i: (words[i]["cy"], words[i]["cx"]))
    lines = []
    while remaining:
        # Seed: top-most (then left-most) unassigned word.
        seed_idx = next(i for i in order if i in remaining)
        remaining.remove(seed_idx)
        sx, sy = words[seed_idx]["cx"], words[seed_idx]["cy"]
        cand_idxs = [j for j in remaining if abs(words[j]["cy"] - sy) <= band_dy]
        if not cand_idxs:
            if ALLOW_SINGLETON:
                # Lone word: emit a horizontal one-word "line".
                m,b = refit_line([(sx,sy)])
                lines.append(_build_line_result(words, {seed_idx}, m, b))
            continue
        cand_idxs.sort(key=lambda j: abs(words[j]["cx"] - sx))
        best_inliers = None; best_mb = None
        # Hypothesize a line through each of the nearest candidates; keep the best.
        for j in cand_idxs[:min(10, len(cand_idxs))]:
            m,b = line_from_points((sx,sy), (words[j]["cx"], words[j]["cy"]))
            inliers = {seed_idx, j}
            for k in remaining:
                xk, yk = words[k]["cx"], words[k]["cy"]
                if perp_distance(m,b,xk,yk) <= perp_tol:
                    inliers.add(k)
            if best_inliers is None or len(inliers) > len(best_inliers):
                best_inliers, best_mb = inliers, (m,b)
        m,b = best_mb
        # Refit on all inliers, then expand once more with the refined line.
        pts = [(words[i]["cx"], words[i]["cy"]) for i in best_inliers]
        m,b = refit_line(pts)
        expanded = set(best_inliers)
        for idx in list(remaining):
            xk, yk = words[idx]["cx"], words[idx]["cy"]
            if perp_distance(m,b,xk,yk) <= perp_tol:
                expanded.add(idx)
        for idx in expanded:
            if idx in remaining:
                remaining.remove(idx)
        lines.append(_build_line_result(words, expanded, m, b))
    lines.sort(key=lambda L: L["center_y"])
    return lines
# ============================================================
# Post-rotation grouping (simple horizontal lines)
# ============================================================
def group_horizontal_lines(rotated_words: List[Dict]) -> List[Dict]:
    """Group deskewed words into lines by simple y-proximity.

    Words sorted by (cy_rot, cx_rot) are accumulated while they stay within a
    tolerance (POST_Y_TOL_FACTOR x median glyph height) of the group's FIRST
    word; each flushed group becomes one line dict, returned top-to-bottom.
    """
    if not rotated_words: return []
    hs = sorted([w["h"] for w in rotated_words if w["h"]>0])
    h_med = hs[len(hs)//2] if hs else 16.0
    y_tol = POST_Y_TOL_FACTOR * h_med
    idxs = list(range(len(rotated_words)))
    idxs.sort(key=lambda i: (rotated_words[i]["cy_rot"], rotated_words[i]["cx_rot"]))
    lines = []
    cur = []
    def flush():
        # Emit the current group as one line (no-op when empty).
        nonlocal cur
        if not cur: return
        xs = [rotated_words[i]["cx_rot"] for i in cur]
        ys = [rotated_words[i]["cy_rot"] for i in cur]
        m,b = refit_line(list(zip(xs,ys)))
        cur_sorted = sorted(cur, key=lambda i: rotated_words[i]["cx_rot"])
        lines.append(_build_line_result(rotated_words, set(cur_sorted), m, b, rotated=True))
        cur = []
    for i in idxs:
        if not cur:
            cur = [i]
        else:
            # Compare against the group's first word, not the previous word,
            # so a slowly drifting baseline cannot chain groups together.
            y0 = rotated_words[cur[0]]["cy_rot"]
            yi = rotated_words[i]["cy_rot"]
            if abs(yi - y0) <= y_tol:
                cur.append(i)
            else:
                flush()
                cur = [i]
    flush()
    lines.sort(key=lambda L: L["center_y"])
    return lines
# ============================================================
# Utilities: dump lines to txt (only if DEBUG)
# ============================================================
def slope_to_deg(m: float) -> float:
    """Convert a line slope to degrees; infinite slope maps to 90 degrees."""
    return 90.0 if math.isinf(m) else math.degrees(math.atan(m))
def write_lines_txt(base_path: str, suffix: str, lines: List[Dict]) -> Optional[str]:
    """Dump clustered lines to '<base>_<suffix>.txt' for debugging.

    No-op (returns None) unless the module-level DEBUG flag is enabled;
    otherwise returns the path written.
    """
    if not DEBUG:
        return None
    txt_path = f"{os.path.splitext(base_path)[0]}_{suffix}.txt"
    with open(txt_path, "w", encoding="utf-8") as out:
        out.write(f"# {os.path.basename(base_path)} ({suffix})\n")
        for idx, line in enumerate(lines, 1):
            angle = slope_to_deg(line["slope"])
            out.write(f"[{idx:03d}] words={line['count']:>3} slope={angle:+.3f}°\n")
            out.write(line["text"] + "\n\n")
    return txt_path
# ============================================================
# Smart deskew + full pipeline (in-memory; returns words + full_text)
# ============================================================
def smart_deskew_with_lines(image_path: str,
                            out_path: Optional[str] = None,
                            clamp_deg: float = 30.0,
                            use_vision: bool = True) -> Dict:
    """One-stop deskew + OCR line extraction for a single image.

    Estimates skew with two independent methods (Hough lines and OCR word
    pairs), scores a fine grid of candidate angles on a downscaled preview,
    rotates in-memory by the best angle, and builds both slope-aware (pre)
    and horizontal (post) line groupings from the SAME single Vision pass.

    Returns a dict with angle_deg, pre_lines/post_lines, words, full_text,
    and debug artifact paths (populated only when DEBUG is True).
    Raises FileNotFoundError if the image cannot be read.
    """
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None: raise FileNotFoundError(image_path)
    words, full_text = ([], "")
    if use_vision:
        # Single Vision OCR pass; reused for skew estimation AND extraction.
        words, full_text = extract_words_and_text(image_path)
    a_h, n_h = estimate_skew_hough(img)
    a_p, n_p = (0.0, 0)
    if words:
        a_p, n_p = estimate_skew_pairs(words, y_band_mult=2.0, min_dx_mult=0.8, max_abs_deg=15.0)
    # Candidate angles (both signs) from estimators with enough samples.
    candidates = []
    if n_h >= 10: candidates += [a_h, -a_h]
    if n_p >= 10: candidates += [a_p, -a_p]
    if not candidates: candidates = [0.0]
    # Clamp to +/- clamp_deg and drop near-duplicate candidates (< 0.05 deg apart).
    cand = []
    for a in candidates:
        a = float(max(-clamp_deg, min(clamp_deg, a)))
        if all(abs(a - b) > 0.05 for b in cand):
            cand.append(a)
    # Fine grid of +/- 0.6 deg around each surviving candidate.
    grid = []
    for a in cand:
        for d in (-0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6):
            g = a + d
            if all(abs(g - x) > 0.05 for x in grid):
                grid.append(g)
    # Pick the grid angle whose rotated preview has the smallest residual skew.
    scored = [(a, preview_score(img, -a)) for a in grid]
    best_angle, best_cost = min(scored, key=lambda t: t[1])
    # Debug print kept as a comment
    # print(f"[smart] hough={a_h:.3f}°(n={n_h}) pairs={a_p:.3f}°(n={n_p}) tried={', '.join(f'{a:+.2f}°' for a,_ in scored)} → chosen {best_angle:+.2f}° (cost={best_cost:.3f})")
    # Rotate in-memory. Save only if DEBUG.
    rotated = rotate_image_keep_bounds(img, -best_angle, border_value=255)
    if DEBUG and out_path:
        cv2.imwrite(out_path, rotated)
    result = {
        "angle_deg": float(best_angle),
        "hough_lines": int(n_h),
        "pair_samples": int(n_p),
        "out_path": out_path if DEBUG else None,
        "pre_txt": None,
        "post_txt": None,
        "pre_lines": [],
        "post_lines": [],
        "words": words,
        "full_text": full_text,
    }
    if words:
        # Pre: slope-aware clustering on original coords; Post: horizontal
        # grouping on coords mapped through the chosen rotation.
        pre_lines = cluster_tilted_lines(words)
        result["pre_lines"] = pre_lines
        result["pre_txt"] = write_lines_txt(image_path, "lines_pre", pre_lines)  # only if DEBUG
        rot_words = transform_words(words, img.shape[:2], -best_angle)
        post_lines = group_horizontal_lines(rot_words)
        result["post_lines"] = post_lines
        result["post_txt"] = write_lines_txt(image_path, "lines_post", post_lines)  # only if DEBUG
        # More debug prints kept as comments
        # def preview(lines, tag):
        #     print(f"  {tag} ({len(lines)} lines)")
        #     for L in lines[:5]:
        #         ang = slope_to_deg(L["slope"])
        #         print(f"    [{L['count']:>3} w] slope={ang:+.3f}° | {L['text'][:90]}")
        # preview(pre_lines, "pre (slope-aware)")
        # preview(post_lines, "post (horizontal)")
        # if DEBUG:
        #     print(f"  → wrote: {result['pre_txt']} and {result['post_txt']}")
    return result
# ============================================================
# Multi-client extraction from post lines (robust)
# ============================================================
MEMBER_RE = re.compile(r'\bMEMBER NAME\s*:\s*(.+)', re.IGNORECASE)
MEMBERID_RE = re.compile(r'\bMEMBER ID\s*:\s*([A-Za-z0-9]+)', re.IGNORECASE)
ICN_LINE_RE = re.compile(r'^\s*\d{12,}\b')
AMOUNT_RE = re.compile(r'(\d{1,3}(?:,\d{3})*\.\d{2})') # decimals only
DATE6_RE = re.compile(r'\b\d{6}\b')
PD_ROW_RE = re.compile(r'\bPD\s+(D?\d{4})\b', re.IGNORECASE)
TOOTH_RE = re.compile(r'^(?:[1-9]|[12][0-9]|3[0-2]|[A-Ta-t])$')
SURFACE_RE = re.compile(r'^[MDBOILFP]{1,4}$', re.IGNORECASE)
def _to_float(s: str) -> float:
    """Parse a comma-grouped amount like '1,234.56'; returns 0.0 on any failure."""
    try:
        return float(s.replace(',', ''))
    except Exception:
        # Malformed/non-string tokens deliberately fall back to 0.0.
        return 0.0
def _parse_pd_line(t: str) -> Optional[Tuple[str, Optional[float], Optional[float], Optional[float], Optional[str], Optional[str], Optional[str]]]:
    """
    Parse a single PD line.
    Returns: (CDT, billed, allowed, paid, date6, tooth, surface),
    or None when the line carries no 'PD <code>' marker.
    Amounts are taken as the LAST three decimal numbers on the line;
    tooth/surface are searched between the CDT code and the 6-digit date.
    """
    m = PD_ROW_RE.search(t)
    if not m:
        return None
    code = m.group(1)
    # Normalize to a D-prefixed CDT code.
    code = code if code.upper().startswith('D') else f'D{code}'
    amts = [_to_float(x) for x in AMOUNT_RE.findall(t)]
    billed = allowed = paid = None
    if len(amts) >= 3:
        billed, allowed, paid = amts[-3:]
    d = None
    md = DATE6_RE.search(t)
    if md:
        d = md.group(0)
    tooth = None
    surface = None
    tokens = t.split()
    # Locate the code token; fall back to a scan, since the raw token may
    # lack the 'D' prefix that was normalized above.
    try:
        code_idx = tokens.index(code)
    except ValueError:
        code_idx = None
        for i, tok in enumerate(tokens):
            if PD_ROW_RE.match(f'PD {tok}'):
                code_idx = i
                break
    if code_idx is not None:
        # Window between the code and the service date holds tooth/surface.
        date_idx = None
        for i in range(code_idx + 1, len(tokens)):
            if DATE6_RE.fullmatch(tokens[i]):
                date_idx = i
                break
        window = tokens[code_idx + 1: date_idx if date_idx is not None else len(tokens)]
        for tok in window:
            if TOOTH_RE.fullmatch(tok):
                tooth = tok.upper()
                break
        start_j = 0
        if tooth is not None:
            # Surface is only searched AFTER the tooth token.
            for j, tok in enumerate(window):
                if tok.upper() == tooth:
                    start_j = j + 1
                    break
        for tok in window[start_j:]:
            if SURFACE_RE.fullmatch(tok):
                surface = tok.upper()
                break
    return code, billed, allowed, paid, d, tooth, surface
def extract_all_clients_from_lines(post_lines: List[dict]) -> List[dict]:
    """
    Split strictly by MEMBER NAME lines; ignore anything before the first name.
    For each member block, look up ICN from the nearest line above the member header.
    Parse each PD line for CDT, Date SVC, Billed, Allowed, Paid (decimals only).
    Members with no PD lines still yield one (mostly empty) row so they are
    visible in the output.
    """
    texts = [L["text"] for L in post_lines]
    starts = [i for i,t in enumerate(texts) if MEMBER_RE.search(t)]
    if not starts:
        return []
    out_rows = []
    for si, start in enumerate(starts):
        # Block spans from this member header to the next one (or EOF).
        end = starts[si+1] if si+1 < len(starts) else len(texts)
        # header line with MEMBER NAME
        name_line = texts[start]
        raw_name = MEMBER_RE.search(name_line).group(1).strip()
        # Stop at "MEMBER ID" (case-insensitive) and other headers that can
        # share the same OCR line as the name.
        cut_points = ["MEMBER ID", "OTH INS CD", "PA:", "DIAG:"]
        mname = raw_name
        for cp in cut_points:
            idx = mname.upper().find(cp)
            if idx != -1:
                mname = mname[:idx].strip()
        # Debug
        # print(raw_name); print(mname)
        # member id: first match anywhere within the block
        mid = ""
        for t in texts[start:end]:
            m = MEMBERID_RE.search(t)
            if m:
                mid = m.group(1).strip()
                break
        # ICN: search up to 5 lines ABOVE the member header (nearest first)
        icn = ""
        for k in range(start-1, max(-1, start-6), -1):
            if k < 0: break
            mm = ICN_LINE_RE.match(texts[k])
            if mm:
                icn = mm.group(0)
                break
        # PD lines in the block: one output row per parsed PD line
        had_pd = False
        for t in texts[start:end]:
            # Quick pre-filter: require a standalone " PD " token.
            if " PD " not in f" {t} ":
                continue
            parsed = _parse_pd_line(t)
            if not parsed:
                continue
            had_pd = True
            code, billed, allowed, paid, dsvc, tooth, surface = parsed
            out_rows.append({
                'Patient Name': mname.title() if mname else "",
                'Patient ID': mid,
                'ICN': icn,
                'CDT Code': code,
                'Tooth': tooth if tooth else "",
                #'Surface': surface if surface else "",
                'Date SVC': dsvc if dsvc else "",
                'Billed Amount': billed if billed is not None else "",
                'Allowed Amount': allowed if allowed is not None else "",
                'Paid Amount': paid if paid is not None else "",
                'Extraction Success': True,
            })
        if not had_pd:
            # Member with no billable lines: emit a stub row so the member
            # still appears; success only if we at least got a name or ID.
            out_rows.append({
                'Patient Name': mname.title() if mname else "",
                'Patient ID': mid,
                'ICN': icn,
                'CDT Code': "",
                'Tooth': "",
                #'Surface': "",
                'Date SVC': "",
                'Billed Amount': "",
                'Allowed Amount': "",
                'Paid Amount': "",
                'Extraction Success': bool(mname or mid),
            })
    return out_rows
# ============================================================
# ExcelGenerator
# ============================================================
class ExcelGenerator:
    """Renders extraction rows into a styled Excel workbook via openpyxl."""

    def __init__(self):
        # Shared styles reused across sheets.
        self.header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
        self.header_font = Font(color="FFFFFF", bold=True)
        self.border = Border(
            left=Side(style='thin'),
            right=Side(style='thin'),
            top=Side(style='thin'),
            bottom=Side(style='thin')
        )
        self.center_alignment = Alignment(horizontal='center', vertical='center')

    def create_excel_file(self, df: pd.DataFrame) -> bytes:
        """Build the workbook (data sheet + summary sheet) and return it as bytes."""
        wb = Workbook()
        ws = wb.active
        ws.title = "Medical Billing Extract"
        # Title banner on row 1; row 2 stays blank, so the header lands on row 3.
        ws['A1'] = f"Medical Billing OCR Extract - Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        ws.merge_cells('A1:H1')
        ws['A1'].font = Font(size=14, bold=True)
        ws['A1'].alignment = self.center_alignment
        ws.append([])
        excel_df = self.prepare_dataframe_for_excel(df)
        for r in dataframe_to_rows(excel_df, index=False, header=True):
            ws.append(r)
        # +3 accounts for the title, blank, and header rows above the data.
        self.format_worksheet(ws, len(excel_df) + 3)
        self.add_summary_sheet(wb, excel_df)
        output = io.BytesIO()
        wb.save(output)
        output.seek(0)
        return output.getvalue()

    def prepare_dataframe_for_excel(self, df: pd.DataFrame) -> pd.DataFrame:
        """Reorder known columns, format amounts as currency, map success to Yes/No."""
        excel_df = df.copy()
        column_order = [
            'Patient Name', 'Patient ID', 'ICN', 'CDT Code', 'Tooth', 'Date SVC', #'Surface',
            'Billed Amount', 'Allowed Amount', 'Paid Amount',
            'Extraction Success', 'Source File'
        ]
        existing = [c for c in column_order if c in excel_df.columns]
        excel_df = excel_df[existing]
        for amount_col in ['Billed Amount', 'Allowed Amount', 'Paid Amount']:
            if amount_col in excel_df.columns:
                excel_df[amount_col] = excel_df[amount_col].apply(self.format_currency)
        if 'Extraction Success' in excel_df.columns:
            excel_df['Extraction Success'] = excel_df['Extraction Success'].apply(lambda x: 'Yes' if x else 'No')
        return excel_df

    def format_currency(self, value):
        """Render a numeric/str amount as '$1,234.56'; empty/NaN stays empty,
        unparseable values pass through as strings."""
        if pd.isna(value) or value == "":
            return ""
        try:
            if isinstance(value, str):
                clean_value = value.replace('$', '').replace(',', '')
                value = float(clean_value)
            return f"${value:,.2f}"
        except (ValueError, TypeError):
            return str(value)

    def format_worksheet(self, ws, data_rows):
        """Style the header row (row 3) and the data rows below it."""
        header_row = 3
        for cell in ws[header_row]:
            if cell.value:
                cell.fill = self.header_fill
                cell.font = self.header_font
                cell.alignment = self.center_alignment
                cell.border = self.border
        for row in range(header_row + 1, data_rows + 1):
            for cell in ws[row]:
                cell.border = self.border
                cell.alignment = Alignment(horizontal='left', vertical='center')
        self.auto_adjust_columns(ws)
        self.add_conditional_formatting(ws, header_row, data_rows)

    def auto_adjust_columns(self, ws):
        """Size each column to its longest cell value, capped at width 50."""
        max_col = ws.max_column
        max_row = ws.max_row
        for col_idx in range(1, max_col + 1):
            max_len = 0
            for row in range(1, max_row + 1):
                cell = ws.cell(row=row, column=col_idx)
                # Merged cells have no addressable value; skip them.
                if isinstance(cell, MergedCell):
                    continue
                try:
                    val = cell.value
                    if val is None:
                        continue
                    max_len = max(max_len, len(str(val)))
                except Exception:
                    pass
            letter = get_column_letter(col_idx)
            ws.column_dimensions[letter].width = min(max_len + 2, 50)

    def add_conditional_formatting(self, ws, header_row, data_rows):
        """Color the 'Extraction Success' column green for Yes, pink for No."""
        success_col = None
        for col, cell in enumerate(ws[header_row], 1):
            if cell.value == 'Extraction Success':
                success_col = col
                break
        if success_col:
            for row in range(header_row + 1, data_rows + 1):
                cell = ws.cell(row=row, column=success_col)
                if cell.value == 'Yes':
                    cell.fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid")
                elif cell.value == 'No':
                    cell.fill = PatternFill(start_color="FFB6C1", end_color="FFB6C1", fill_type="solid")

    def add_summary_sheet(self, wb, df):
        """Append a 'Summary' sheet with row counts and a generation timestamp."""
        ws = wb.create_sheet(title="Summary")
        ws['A1'] = "Extraction Summary"
        ws['A1'].font = Font(size=16, bold=True)
        ws.merge_cells('A1:B1')
        row = 3
        stats = [
            ("Total Rows", len(df)),
            ("Successful", len(df[df['Extraction Success'] == 'Yes']) if 'Extraction Success' in df.columns else 0),
            ("Failed", len(df[df['Extraction Success'] == 'No']) if 'Extraction Success' in df.columns else 0),
        ]
        for name, val in stats:
            ws[f'A{row}'] = name
            ws[f'B{row}'] = val
            ws[f'A{row}'].font = Font(bold=True)
            row += 1
        # NOTE(review): constructs a fresh ExcelGenerator instead of using
        # self.auto_adjust_columns(ws) — result is identical but wasteful;
        # confirm intent before changing.
        ExcelGenerator().auto_adjust_columns(ws)
        row += 2
        ws[f'A{row}'] = f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        ws[f'A{row}'].font = Font(italic=True)
# ============================================================
# Runner: glue everything together
# ============================================================
def process_images_to_excel(files: List[str], out_excel: str, deskewed_only: bool=False) -> None:
    """Run the full pipeline over *files* and write a formatted Excel workbook.

    With deskewed_only=True the deskew step is skipped and Vision words are
    grouped directly (cx/cy copied to cx_rot/cy_rot). A per-image failure is
    recorded as an 'Extraction Success': False row instead of aborting the batch.
    """
    excel_gen = ExcelGenerator()
    records: List[Dict[str, Any]] = []
    for src in files:
        try:
            if deskewed_only:
                img = cv2.imread(src, cv2.IMREAD_COLOR)
                if img is None:
                    raise FileNotFoundError(src)
                words, _ = extract_words_and_text(src)
                # Image is assumed already deskewed: rotated coords == original.
                rot_words = []
                for w in words:
                    ww = dict(w)
                    ww["cx_rot"], ww["cy_rot"] = w["cx"], w["cy"]
                    rot_words.append(ww)
                post_lines = group_horizontal_lines(rot_words)
                post_txt = write_lines_txt(src, "lines_post", post_lines)  # only if DEBUG
                rows = extract_all_clients_from_lines(post_lines)
                for r in rows:
                    r["Source File"] = os.path.basename(src)
                    records.append(r)
                # if DEBUG: print(f"{src} → parsed {len(rows)} PD rows (wrote {post_txt})")
            else:
                base, ext = os.path.splitext(src)
                dst = f"{base}_deskewed{ext if ext else '.jpg'}" if DEBUG else None
                info = smart_deskew_with_lines(src, dst, clamp_deg=30.0, use_vision=True)
                post_lines = info.get("post_lines", []) if info else []
                rows = extract_all_clients_from_lines(post_lines) if post_lines else []
                for r in rows:
                    r["Source File"] = os.path.basename(src)
                    records.append(r)
                # if DEBUG: print(f"{src} → rotated by {-info['angle_deg']:.3f}° → {dst}")
        except Exception as e:
            # if DEBUG: print(f"{src}: {e}")
            # Failure placeholder row for this image (keeps the batch going).
            records.append({
                'Patient Name': "", 'Patient ID': "", 'ICN': "", 'CDT Code': "",
                'Date SVC': "", 'Billed Amount': "", 'Allowed Amount': "", 'Paid Amount': "",
                'Extraction Success': False, 'Source File': os.path.basename(src),
            })
    df = pd.DataFrame.from_records(records)
    data = excel_gen.create_excel_file(df)
    with open(out_excel, "wb") as f:
        f.write(data)
    # if DEBUG:
    #     print(f"\n✅ Wrote Excel → {out_excel}")
    #     print("   (and per-image: *_lines_pre.txt, *_lines_post.txt, *_deskewed.* when DEBUG=True)")
# ============================================================
# CLI
# ============================================================
def main():
    """CLI entry point: gather image paths from --files/--input, then run the pipeline."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", help="Folder of images (jpg/png/tif).", default=None)
    ap.add_argument("--files", nargs="*", help="Specific image files.", default=None)
    ap.add_argument("--out", help="Output Excel path.", required=True)
    ap.add_argument("--deskewed-only", action="store_true",
                    help="Only process files whose name contains '_deskewed'; skip deskew step.")
    args = ap.parse_args()
    paths: List[str] = []
    if args.files:
        paths.extend(candidate for candidate in args.files if os.path.isfile(candidate))
    if args.input and os.path.isdir(args.input):
        for pattern in ("*.jpg", "*.jpeg", "*.png", "*.tif", "*.tiff", "*.bmp"):
            paths.extend(glob.glob(os.path.join(args.input, pattern)))
    if args.deskewed_only:
        paths = [p for p in paths if "_deskewed" in os.path.basename(p).lower()]
    if not paths:
        raise SystemExit("No input images found. Use --files or --input (and --deskewed-only if desired).")
    if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
        # Deliberately silent; the Vision client itself fails loudly if creds are missing.
        pass
    process_images_to_excel(paths, args.out, deskewed_only=args.deskewed_only)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,8 @@
{
"name": "pdfservice",
"private": true,
"scripts": {
"postinstall": "pip install -r requirements.txt",
"dev": "python main.py"
}
}

View File

@@ -0,0 +1,10 @@
fastapi
uvicorn[standard]
google-cloud-vision
opencv-python-headless
pytesseract
pillow
pandas
openpyxl
numpy
python-multipart

Binary file not shown.

View File

@@ -0,0 +1,5 @@
This code was written solely to extract procedure-code data from the MassHealth PDF, to make that process easier.
It was a one-time process and is not part of the app's core functionality.
It is kept here in case the data needs to be extracted again in the future.

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
Compare a main dental JSON file with one or more other JSON files and
return all records whose 'Procedure Code' is NOT present in the main file.
- Matching key: 'Procedure Code' (case-insensitive, trimmed).
- Keeps the full record from the other files (including extra fields like 'Full Price').
- Deduplicates by Procedure Code across the collected "missing" results.
CONFIG: set MAIN_PATH, OTHER_PATHS, OUT_PATH below.
"""
import json
from pathlib import Path
from typing import List, Dict, Any
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
MAIN_PATH: str = "procedureCodesMain.json"  # your main JSON (with PriceLTEQ21/PriceGT21)
OTHER_PATHS: List[str] = [
    "procedureCodesOld.json",  # one or more other JSON files to compare against the main
    # "other2.json",
]
OUT_PATH: str = "not_in_main.json"  # where to write the results (records absent from MAIN_PATH)
# =========================
def _load_json_any(path: str) -> List[Dict[str, Any]]:
"""
Load JSON. Accept:
- a list of objects
- a single object (wraps into a list)
"""
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, dict):
return [data]
if isinstance(data, list):
# filter out non-dict items defensively
return [x for x in data if isinstance(x, dict)]
raise ValueError(f"Unsupported JSON top-level type in {path}: {type(data)}")
def _norm_code(record: Dict[str, Any]) -> str:
# Normalize the 'Procedure Code' for matching
code = str(record.get("Procedure Code", "")).strip().upper()
# Some PDFs might have stray spaces, tabs, or zero-width chars
code = "".join(ch for ch in code if not ch.isspace())
return code
def collect_main_codes(main_path: str) -> set:
    """
    Return the set of non-empty normalized procedure codes found in *main_path*.

    Fix: the old comprehension called _norm_code twice per record (once to
    build the value, once in the filter); normalize each record exactly once.
    """
    normalized = (_norm_code(rec) for rec in _load_json_any(main_path))
    return {code for code in normalized if code}
def collect_missing_records(other_paths: List[str], main_codes: set) -> List[Dict[str, Any]]:
    """
    Gather records from *other_paths* whose normalized 'Procedure Code' is
    absent from *main_codes*, keeping only the first record seen per code.

    Returns the full original records, sorted by normalized code.
    """
    missing: Dict[str, Dict[str, Any]] = {}
    for path in other_paths:
        for record in _load_json_any(path):
            code = _norm_code(record)
            # Skip blank codes, codes already present in main, and duplicates.
            if code and code not in main_codes and code not in missing:
                missing[code] = record
    return [record for _, record in sorted(missing.items())]
def main():
    """Validate configured files, compute records missing from main, write and echo them."""
    # Fail fast with a clear message if any configured file is absent.
    if not Path(MAIN_PATH).exists():
        raise FileNotFoundError(f"Main file not found: {MAIN_PATH}")
    for other in OTHER_PATHS:
        if not Path(other).exists():
            raise FileNotFoundError(f"Other file not found: {other}")

    main_codes = collect_main_codes(MAIN_PATH)
    missing_records = collect_missing_records(OTHER_PATHS, main_codes)

    with open(OUT_PATH, "w", encoding="utf-8") as out_file:
        json.dump(missing_records, out_file, ensure_ascii=False, indent=2)

    print(f"Main codes: {len(main_codes)}")
    print(f"Missing from main: {len(missing_records)}")
    print(f"Wrote results to {OUT_PATH}")
    # Echo the payload so results can be piped without opening the file.
    print(json.dumps(missing_records, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,183 @@
import re
import json
from typing import List, Dict
import fitz # PyMuPDF
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH: str = "MH.pdf"  # path to your PDF
PAGES: List[int] = [2]  # 0-based page indexes to parse, e.g., [2] for the page you showed
OUT_PATH: str = "output.json"  # where to write JSON
FIRST_PRICE_IS_LTE21: bool = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT: bool = False  # set True if you want to print the raw page text for sanity check
# =========================
# --- patterns ---
# A code line is exactly one CDT procedure code, e.g. 'D2160'.
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block we should ignore once prices are done
# FIX: the previous pattern contained an empty alternation branch ('…|—||Age…'),
# which matches the empty string, so .match() succeeded on EVERY line —
# descriptions were truncated immediately and note-skipping could swallow
# price lines. The empty branch is removed (en dash '–' accepted alongside '—').
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
    """Collapse every run of whitespace in *s* to a single space and trim stray punctuation."""
    # Non-breaking spaces come through PDF extraction; treat them as plain spaces.
    collapsed = s.replace("\u00a0", " ")
    for pattern in (r"[ \t]+", r"\s*\n\s*", r"\s{2,}"):
        collapsed = re.sub(pattern, " ", collapsed)
    return collapsed.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
    """Normalize a price token: 'NC' passes through; numbers lose '$' and thousands commas."""
    if token.upper() != "NC":
        return token.replace(",", "").lstrip("$").strip()
    return "NC"
def get_page_lines(pdf_path: str, pages: List[int]) -> List[str]:
    """
    Extract raw text lines from the given 0-based page indexes of *pdf_path*.

    Raises ValueError for any out-of-range index; the document handle is
    always closed, even on error.
    """
    doc = fitz.open(pdf_path)
    try:
        max_idx = len(doc) - 1
        # Validate every requested index up front so we fail before partial work.
        for page_no in pages:
            if not 0 <= page_no <= max_idx:
                raise ValueError(f"Invalid page index {page_no}. Valid range is 0..{max_idx}.")
        collected: List[str] = []
        for page_no in pages:
            text = doc.load_page(page_no).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {page_no} ---\n{text}")
            # Preserve line boundaries; the record parser works line-by-line.
            collected.extend(text.splitlines())
        return collected
    finally:
        doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """
    Assemble one record per procedure code from a flat list of text lines.

    Per record: find a 'Dxxxx' code line, gather description lines until the
    first price/note line, then collect up to two price tokens (order decided
    by FIRST_PRICE_IS_LTE21). Records without two parsable prices are skipped.
    """
    out: List[Dict[str, str]] = []
    i = 0  # cursor into `lines`; advanced manually by every phase below
    n = len(lines)
    while i < n:
        line = lines[i].strip()
        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue
        code = mcode.group(1)
        i += 1
        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1
        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line inside description — consider description ended if the next is a price
                # but we don't advance here; break and let price parsing handle it
                break
            if code_line_re.match(s):
                # next code — no prices found; abandon this broken record
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1
        # advance i to where we left off
        i = j
        description = normalize_ws(" ".join(desc_lines))
        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until the next code/blank
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                # skip this block quickly
                i += 1
                # keep skipping subsequent non-empty, non-code lines until a blank or next code
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                # now let the outer loop proceed
                continue
            # unrecognized line: if prices already found, we can break; else skip
            if prices:
                break
            i += 1
        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue
        # Map the two prices onto the age columns per the configured ordering.
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]
        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )
        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                # next record will pick this up
                break
            i += 1
    return out
def extract_pdf_to_json(pdf_path: str, pages: List[int], out_path: str) -> List[Dict[str, str]]:
    """Parse the given 0-based pages of *pdf_path* into records and dump them to *out_path* as JSON."""
    records = extract_records(get_page_lines(pdf_path, pages))
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_to_json(PDF_PATH, PAGES, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
MassHealth dental PDF parser (PyMuPDF / fitz) — PAGE RANGE VERSION
Parses rows like:
D2160
Amalgam-three surfaces,
primary or permanent
$110
$92
Y
Y
...
Outputs a single JSON with records from the chosen page range (inclusive).
Config:
- PDF_PATH: path to the PDF
- PAGE_START, PAGE_END: 1-based page numbers (inclusive)
- FIRST_PRICE_IS_LTE21: True => first price line is <=21; False => first price is >21
- OUT_PATH: output JSON path
"""
import re
import json
from typing import List, Dict
import fitz # PyMuPDF
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH: str = "MH.pdf"  # path to your PDF
PAGE_START: int = 1  # 1-based inclusive start page (e.g., 1)
PAGE_END: int = 12  # 1-based inclusive end page (e.g., 5)
OUT_PATH: str = "output.json"  # single JSON file containing all parsed rows
FIRST_PRICE_IS_LTE21: bool = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT: bool = False  # set True to print raw text for each page
# =========================
# --- patterns ---
# A code line is exactly one CDT procedure code, e.g. 'D2160'.
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block to ignore once prices are done
# FIX: the previous pattern contained an empty alternation branch ('…|—||Age…'),
# which matches the empty string, so .match() succeeded on EVERY line —
# descriptions were truncated immediately and note-skipping could swallow
# price lines. The empty branch is removed (en dash '–' accepted alongside '—').
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
    """Flatten NBSPs, tabs and newlines in *s* to single spaces and trim edge punctuation."""
    cleaned = s.replace("\u00a0", " ")  # NBSPs leak in from PDF text extraction
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = re.sub(r"\s*\n\s*", " ", cleaned)
    cleaned = re.sub(r"\s{2,}", " ", cleaned)
    return cleaned.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
    """Return a bare numeric string (no '$' or thousands commas); 'NC' passes through unchanged."""
    is_no_charge = token.upper() == "NC"
    return "NC" if is_no_charge else token.replace(",", "").lstrip("$").strip()
def get_page_lines(pdf_path: str, page_start_1b: int, page_end_1b: int) -> List[str]:
    """
    Extract raw text lines from an inclusive 1-based page range of *pdf_path*.

    Args:
        pdf_path: Path to the PDF file to read.
        page_start_1b: First page (1-based, inclusive).
        page_end_1b: Last page (1-based, inclusive).

    Returns:
        All text lines of the selected pages, concatenated in page order.

    Raises:
        ValueError: If the range is non-positive, inverted, or out of bounds.
    """
    if page_start_1b <= 0 or page_end_1b <= 0:
        raise ValueError("PAGE_START and PAGE_END must be >= 1 (1-based).")
    if page_start_1b > page_end_1b:
        raise ValueError("PAGE_START cannot be greater than PAGE_END.")
    doc = fitz.open(pdf_path)
    try:
        last_idx_0b = len(doc) - 1
        # convert to 0-based inclusive range
        start_0b = page_start_1b - 1
        end_0b = page_end_1b - 1
        if start_0b < 0 or end_0b > last_idx_0b:
            raise ValueError(f"Page range out of bounds. Valid 1-based range is 1..{last_idx_0b + 1}.")
        lines: List[str] = []
        for p in range(start_0b, end_0b + 1):
            text = doc.load_page(p).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {p} (0-based; shown as {p+1} 1-based) ---\n{text}")
            # Preserve line boundaries; the record parser works line-by-line.
            lines.extend(text.splitlines())
        return lines
    finally:
        # Always release the document handle, even when validation raises.
        doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """
    Assemble one record per procedure code from a flat list of text lines.

    Per record: find a 'Dxxxx' code line, gather description lines until the
    first price/note line, then collect up to two price tokens (order decided
    by FIRST_PRICE_IS_LTE21). Records without two parsable prices are skipped.
    """
    out: List[Dict[str, str]] = []
    i = 0  # cursor into `lines`; advanced manually by every phase below
    n = len(lines)
    while i < n:
        line = lines[i].strip()
        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue
        code = mcode.group(1)
        i += 1
        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1
        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                break
            if code_line_re.match(s):
                # next code — description ended abruptly (malformed)
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1
        # advance i to where we left off
        i = j
        description = normalize_ws(" ".join(desc_lines))
        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until a blank or next code
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                i += 1
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                continue
            # unrecognized line: if we already captured some prices, break; else skip
            if prices:
                break
            i += 1
        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue
        # Map the two prices onto the age columns per the configured ordering.
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]
        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )
        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                break
            i += 1
    return out
def extract_pdf_range_to_json(pdf_path: str, page_start_1b: int, page_end_1b: int, out_path: str) -> List[Dict[str, str]]:
    """Parse the inclusive 1-based page range of *pdf_path* and dump the records to *out_path* as JSON."""
    records = extract_records(get_page_lines(pdf_path, page_start_1b, page_end_1b))
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_range_to_json(PDF_PATH, PAGE_START, PAGE_END, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))

View File

@@ -0,0 +1,192 @@
[
{
"Procedure Code": "D0120",
"Description": "perio exam",
"Price": "105"
},
{
"Procedure Code": "D0140",
"Description": "limited exam",
"Price": "90"
},
{
"Procedure Code": "D0150",
"Description": "comprehensive exam",
"Price": "120"
},
{
"Procedure Code": "D0210",
"Description": "Fmx.",
"Price": "120"
},
{
"Procedure Code": "D0220",
"Description": "first PA.",
"Price": "60"
},
{
"Procedure Code": "D0230",
"Description": "2nd PA.",
"Price": "50"
},
{
"Procedure Code": "D0272",
"Description": "2 BW",
"Price": "80"
},
{
"Procedure Code": "D0274",
"Description": "4BW",
"Price": "160"
},
{
"Procedure Code": "D0330",
"Description": "pano",
"Price": "150"
},
{
"Procedure Code": "D0364",
"Description": "Less than one jaw",
"Price": "350"
},
{
"Procedure Code": "D0365",
"Description": "Mand",
"Price": "350"
},
{
"Procedure Code": "D0366",
"Description": "Max",
"Price": "350"
},
{
"Procedure Code": "D0367",
"Description": "",
"Price": "400"
},
{
"Procedure Code": "D0368",
"Description": "include TMJ",
"Price": "375"
},
{
"Procedure Code": "D0380",
"Description": "Less than one jaw",
"Price": "300"
},
{
"Procedure Code": "D0381",
"Description": "Mand",
"Price": "300"
},
{
"Procedure Code": "D0382",
"Description": "Max",
"Price": "300"
},
{
"Procedure Code": "D0383",
"Description": "",
"Price": "350"
},
{
"Procedure Code": "D1110",
"Description": "adult prophy",
"Price": "150"
},
{
"Procedure Code": "D1120",
"Description": "child prophy",
"Price": "120"
},
{
"Procedure Code": "D1208",
"Description": "FL",
"Price": "90"
},
{
"Procedure Code": "D1351",
"Description": "sealant",
"Price": "80"
},
{
"Procedure Code": "D1999",
"Description": "",
"Price": "50"
},
{
"Procedure Code": "D2140",
"Description": "amalgam, one surface",
"Price": "150"
},
{
"Procedure Code": "D2150",
"Description": "amalgam, two surface",
"Price": "200"
},
{
"Procedure Code": "D2955",
"Description": "post renoval",
"Price": "350"
},
{
"Procedure Code": "D4910",
"Description": "perio maintains",
"Price": "250"
},
{
"Procedure Code": "D5510",
"Description": "Repair broken complete denture base (QUAD)",
"Price": "400"
},
{
"Procedure Code": "D6056",
"Description": "pre fab abut",
"Price": "750"
},
{
"Procedure Code": "D6057",
"Description": "custom abut",
"Price": "800"
},
{
"Procedure Code": "D6058",
"Description": "porcelain, implant crown, ceramic crown",
"Price": "1400"
},
{
"Procedure Code": "D6059",
"Description": "",
"Price": "1400"
},
{
"Procedure Code": "D6100",
"Description": "",
"Price": "320"
},
{
"Procedure Code": "D6110",
"Description": "implant",
"Price": "1600"
},
{
"Procedure Code": "D6242",
"Description": "noble metal. For united",
"Price": "1400"
},
{
"Procedure Code": "D6245",
"Description": "porcelain, not for united",
"Price": "1400"
},
{
"Procedure Code": "D7910",
"Description": "suture, small wound up to 5 mm",
"Price": "400"
},
{
"Procedure Code": "D7950",
"Description": "max",
"Price": "800"
}
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,344 @@
[
{
"Procedure Code": "D1999",
"Description": "",
"Price": "50"
},
{
"Procedure Code": "D0120",
"Description": "perio exam",
"Price": "105"
},
{
"Procedure Code": "D0140",
"Description": "limited exam",
"Price": "90"
},
{
"Procedure Code": "D0150",
"Description": "comprehensive exam",
"Price": "120"
},
{
"Procedure Code": "D0210",
"Description": "Fmx.",
"Price": "120"
},
{
"Procedure Code": "D0220",
"Description": "first PA.",
"Price": "60"
},
{
"Procedure Code": "D0230",
"Description": "2nd PA.",
"Price": "50"
},
{
"Procedure Code": "D0330",
"Description": "pano",
"Price": "150"
},
{
"Procedure Code": "D0272",
"Description": "2 BW",
"Price": "80"
},
{
"Procedure Code": "D0274",
"Description": "4BW",
"Price": "160"
},
{
"Procedure Code": "D1110",
"Description": "adult prophy",
"Price": "150"
},
{
"Procedure Code": "D1120",
"Description": "child prophy",
"Price": "120"
},
{
"Procedure Code": "D1351",
"Description": "sealant",
"Price": "80"
},
{
"Procedure Code": "D4341",
"Description": "srp",
"Price": "250"
},
{
"Procedure Code": "D4910",
"Description": "perio maintains",
"Price": "250"
},
{
"Procedure Code": "D1208",
"Description": "FL",
"Price": "90"
},
{
"Procedure Code": "D2330",
"Description": "front composite. 1 s.",
"Price": "180"
},
{
"Procedure Code": "D2331",
"Description": "2s",
"Price": "220"
},
{
"Procedure Code": "D2332",
"Description": "3s",
"Price": "280"
},
{
"Procedure Code": "D2335",
"Description": "4s or more",
"Price": "350"
},
{
"Procedure Code": "D2391",
"Description": "back. 1s",
"Price": "200"
},
{
"Procedure Code": "D2392",
"Description": "2s",
"Price": "250"
},
{
"Procedure Code": "D2393",
"Description": "3s",
"Price": "280"
},
{
"Procedure Code": "D2394",
"Description": "4s",
"Price": "320"
},
{
"Procedure Code": "D2140",
"Description": "amalgam, one surface",
"Price": "150"
},
{
"Procedure Code": "D2150",
"Description": "amalgam, two surface",
"Price": "200"
},
{
"Procedure Code": "D2750",
"Description": "high noble",
"Price": "1300"
},
{
"Procedure Code": "D2751",
"Description": "base metal",
"Price": "1200"
},
{
"Procedure Code": "D2740",
"Description": "crown porcelain",
"Price": "1300"
},
{
"Procedure Code": "D2954",
"Description": "p/c",
"Price": "450"
},
{
"Procedure Code": "D7910",
"Description": "suture, small wound up to 5 mm",
"Price": "400"
},
{
"Procedure Code": "D5110",
"Description": "FU",
"Price": "1200",
"Full Price": "1700"
},
{
"Procedure Code": "D5120",
"Description": "FL",
"Price": "1700",
"Full Price": "1700"
},
{
"Procedure Code": "D5211",
"Description": "pu",
"Price": "1300"
},
{
"Procedure Code": "D5212",
"Description": "pl",
"Price": "1300"
},
{
"Procedure Code": "D5213",
"Description": "cast pu.",
"Price": "1700"
},
{
"Procedure Code": "D5214",
"Description": "cast pl",
"Price": "1700"
},
{
"Procedure Code": "D5510",
"Description": "Repair broken complete denture base (QUAD)",
"Price": "400"
},
{
"Procedure Code": "D5520",
"Description": "Replace missing or broken teeth - complete denture (each tooth) (TOOTH)",
"Price": "200"
},
{
"Procedure Code": "D5750",
"Description": "lab reline",
"Price": "600"
},
{
"Procedure Code": "D5730",
"Description": "chairside reline",
"Price": "500"
},
{
"Procedure Code": "D2920",
"Description": "re cement crown",
"Price": "120"
},
{
"Procedure Code": "D2950",
"Description": "core buildup",
"Price": "350"
},
{
"Procedure Code": "D2955",
"Description": "post renoval",
"Price": "350"
},
{
"Procedure Code": "D6100",
"Description": "",
"Price": "320"
},
{
"Procedure Code": "D6110",
"Description": "implant",
"Price": "1600"
},
{
"Procedure Code": "D6056",
"Description": "pre fab abut",
"Price": "750"
},
{
"Procedure Code": "D6057",
"Description": "custom abut",
"Price": "800"
},
{
"Procedure Code": "D6058",
"Description": "porcelain, implant crown, ceramic crown",
"Price": "1400"
},
{
"Procedure Code": "D6059",
"Description": "",
"Price": "1400"
},
{
"Procedure Code": "D6242",
"Description": "noble metal. For united",
"Price": "1400"
},
{
"Procedure Code": "D6245",
"Description": "porcelain, not for united",
"Price": "1400"
},
{
"Procedure Code": "D0367",
"Description": "",
"Price": "400"
},
{
"Procedure Code": "D0364",
"Description": "Less than one jaw",
"Price": "350"
},
{
"Procedure Code": "D0365",
"Description": "Mand",
"Price": "350"
},
{
"Procedure Code": "D0366",
"Description": "Max",
"Price": "350"
},
{
"Procedure Code": "D0368",
"Description": "include TMJ",
"Price": "375"
},
{
"Procedure Code": "D0383",
"Description": "",
"Price": "350"
},
{
"Procedure Code": "D0380",
"Description": "Less than one jaw",
"Price": "300"
},
{
"Procedure Code": "D0381",
"Description": "Mand",
"Price": "300"
},
{
"Procedure Code": "D0382",
"Description": "Max",
"Price": "300"
},
{
"Procedure Code": "D7950",
"Description": "max",
"Price": "800"
},
{
"Procedure Code": "D7140",
"Description": "simple ext",
"Price": "150"
},
{
"Procedure Code": "D7210",
"Description": "surgical ext",
"Price": "280"
},
{
"Procedure Code": "D7220",
"Description": "soft impacted",
"Price": "380"
},
{
"Procedure Code": "D7230",
"Description": "partial bony",
"Price": "450"
},
{
"Procedure Code": "D7240",
"Description": "fully bony",
"Price": "550"
},
{
"Procedure Code": "D3320",
"Description": "pre M RCT",
"Price": "1050"
}
]