structured well

This commit is contained in:
2025-08-29 18:16:51 +05:30
parent c9ad84c3a8
commit d89bee4f07
16 changed files with 3080 additions and 0 deletions

View File

View File

@@ -0,0 +1,13 @@
# Medical Billing OCR API (FastAPI)
## 1) Prereqs
- Google Cloud Vision service-account JSON.
- `GOOGLE_APPLICATION_CREDENTIALS` env var pointing to that JSON.
- Tesseract installed (for fallback OCR), and on PATH.
## 2) Install & run (local)
```bash
python -m venv .venv && source .venv/bin/activate
pip install -r requirements.txt
export GOOGLE_APPLICATION_CREDENTIALS=/absolute/path/to/service-account.json
uvicorn app.main:app --reload --port 8080

View File

View File

@@ -0,0 +1,81 @@
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import StreamingResponse, JSONResponse, PlainTextResponse
from typing import List, Optional
import io
import os
from app.pipeline_adapter import (
process_images_to_rows,
rows_to_csv_bytes,
)
app = FastAPI(
title="Medical Billing OCR API",
description="FastAPI wrapper around the complete OCR pipeline (Google Vision + deskew + line clustering + extraction).",
version="1.0.0",
)
ALLOWED_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp"}
@app.get("/health", response_class=PlainTextResponse)
def health():
    """Liveness probe; also reports whether GCP credentials appear configured."""
    creds_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS", "")
    return f"OK | GOOGLE_APPLICATION_CREDENTIALS set: {bool(creds_path)}"
@app.post("/extract/json")
async def extract_json(files: List[UploadFile] = File(...)):
    """Run the OCR pipeline on the uploaded images and return rows as JSON."""
    if not files:
        raise HTTPException(status_code=400, detail="No files provided.")
    # Cheap extension screen before reading any upload bytes.
    bad = [
        f.filename for f in files
        if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS
    ]
    if bad:
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
        )
    # Buffer every upload in memory before processing.
    blobs = []
    filenames = []
    for upload in files:
        blobs.append(await upload.read())
        filenames.append(upload.filename or "upload.bin")
    try:
        # rows is a list[dict] with the standard extraction columns.
        rows = process_images_to_rows(blobs, filenames)
        return JSONResponse(content={"rows": rows})
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Processing error: {e}")
@app.post("/extract/csv")
async def extract_csv(files: List[UploadFile] = File(...), filename: Optional[str] = None):
    """Run the OCR pipeline on the uploaded images and stream the rows as a CSV download.

    `filename` optionally overrides the download name; it is sanitized so a
    client-chosen value cannot break or inject into the response header.
    """
    if not files:
        raise HTTPException(status_code=400, detail="No files provided.")
    bad = [f.filename for f in files if os.path.splitext(f.filename or "")[1].lower() not in ALLOWED_EXTS]
    if bad:
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file types: {', '.join(bad)}. Allowed: {', '.join(sorted(ALLOWED_EXTS))}"
        )
    blobs = []
    filenames = []
    for f in files:
        blobs.append(await f.read())
        filenames.append(f.filename or "upload.bin")
    try:
        rows = process_images_to_rows(blobs, filenames)
        csv_bytes = rows_to_csv_bytes(rows)
        out_name = filename or "medical_billing_extract.csv"
        # Security: strip quotes and CR/LF so a client-supplied filename cannot
        # terminate the quoted-string or inject extra headers (header injection).
        out_name = "".join(ch for ch in out_name if ch not in '"\r\n') or "medical_billing_extract.csv"
        return StreamingResponse(
            io.BytesIO(csv_bytes),
            media_type="text/csv",
            headers={"Content-Disposition": f'attachment; filename="{out_name}"'}
        )
    except Exception as e:
        # Chain the original cause so server logs keep the real traceback.
        raise HTTPException(status_code=500, detail=f"Processing error: {e}") from e

View File

@@ -0,0 +1,77 @@
import os
import tempfile
from typing import List, Dict
import pandas as pd
# Import your existing functions directly from complete_pipeline.py
from complete_pipeline import (
smart_deskew_with_lines,
extract_all_clients_from_lines,
)
def _process_single_image_bytes(blob: bytes, display_name: str) -> List[Dict]:
    """
    Persist the upload to a temp file (OpenCV and Google Vision need a real
    path), run the existing deskew + extraction pipeline, and return rows.
    """
    suffix = os.path.splitext(display_name)[1] or ".jpg"
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            tmp.write(blob)
            tmp_path = tmp.name
        # Google Vision + deskew + post-rotation line grouping.
        info = smart_deskew_with_lines(tmp_path, None, clamp_deg=30.0, use_vision=True)
        post_lines = info.get("post_lines", []) if info else []
        rows = extract_all_clients_from_lines(post_lines) if post_lines else []
        # Tag every row with the originating upload (mirrors the Streamlit app).
        for row in rows:
            row["Source File"] = display_name
        if not rows:
            # Emit a placeholder row so callers can see which upload yielded nothing.
            rows.append({
                'Patient Name': "", 'Patient ID': "", 'ICN': "", 'CDT Code': "",
                'Tooth': "", 'Date SVC': "",
                'Billed Amount': "", 'Allowed Amount': "", 'Paid Amount': "",
                'Extraction Success': False, 'Source File': display_name,
            })
        return rows
    finally:
        if tmp_path:
            try:
                os.unlink(tmp_path)
            except Exception:
                pass  # best-effort temp-file cleanup
def process_images_to_rows(blobs: List[bytes], filenames: List[str]) -> List[Dict]:
    """
    Public API used by FastAPI routes.
    blobs: list of image bytes; filenames: matching names for the
    display / 'Source File' column.
    """
    collected: List[Dict] = []
    for payload, name in zip(blobs, filenames):
        collected.extend(_process_single_image_bytes(payload, name))
    return collected
def rows_to_csv_bytes(rows: List[Dict]) -> bytes:
    """
    Serialize pipeline rows to UTF-8 CSV bytes (for the frontend table),
    keeping a stable, Excel-mirroring column order for known columns and
    appending any extras afterwards.
    """
    frame = pd.DataFrame(rows)
    preferred = [
        'Patient Name', 'Patient ID', 'ICN', 'CDT Code', 'Tooth', 'Date SVC',
        'Billed Amount', 'Allowed Amount', 'Paid Amount',
        'Extraction Success', 'Source File'
    ]
    ordered = [c for c in preferred if c in frame.columns]
    ordered += [c for c in frame.columns if c not in preferred]
    return frame[ordered].to_csv(index=False).encode("utf-8")

View File

@@ -0,0 +1,837 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
End-to-end local pipeline (single script)
- One Google Vision pass per image (DOCUMENT_TEXT_DETECTION)
- Smart deskew (Hough + OCR pairs) with fine grid search (in-memory)
- Build slope-aware (pre) and horizontal (post) line dumps (in-memory)
- Extract all clients & PD rows per page (robust to headers/EOBS)
- Export nicely formatted Excel via ExcelGenerator
Usage:
python ocr_pipeline.py --input "C:\\imgs" --out "results.xlsx"
python ocr_pipeline.py --files s1.jpg s2.jpg --out results.xlsx
python ocr_pipeline.py --input "C:\\imgs" --out results.xlsx --deskewed-only
"""
import os
import re
import io
import cv2
import math
import glob
import argparse
import numpy as np
import pandas as pd
from typing import List, Dict, Tuple, Any, Optional
from datetime import datetime
# ========= Debug switch =========
# Set to True to re-enable saving deskewed images, writing *_lines_*.txt,
# and printing progress messages.
DEBUG = False
# ---------- Google Vision ----------
from google.cloud import vision
# ---------- openpyxl helpers ----------
from openpyxl.utils import get_column_letter
from openpyxl.cell.cell import MergedCell
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from openpyxl.utils.dataframe import dataframe_to_rows
# ============================================================
# Config (tuning)
# ============================================================
PERP_TOL_FACTOR = 0.6
SEED_BAND_H = 3.0
ALLOW_SINGLETON = True
POST_Y_TOL_FACTOR = 0.55
# ============================================================
# Vision OCR (ONE pass per image)
# ============================================================
def _open_bytes(path: str) -> bytes:
    """Read the file at *path* fully into memory as raw bytes."""
    with open(path, "rb") as handle:
        return handle.read()
def extract_words_and_text(image_path: str) -> Tuple[List[Dict], str]:
    """Run ONE Google Vision DOCUMENT_TEXT_DETECTION pass on the image.

    Returns (words, full_text): each word dict carries its text plus an
    axis-aligned bounding box (left/top/w/h) and its center (cx/cy) in pixels.
    Raises RuntimeError if the Vision API response carries an error message.
    """
    client = vision.ImageAnnotatorClient()
    resp = client.document_text_detection(image=vision.Image(content=_open_bytes(image_path)))
    if resp.error.message:
        raise RuntimeError(resp.error.message)
    full_text = resp.full_text_annotation.text or ""
    words: List[Dict] = []
    # Flatten Vision's page -> block -> paragraph -> word hierarchy into one flat list.
    for page in resp.full_text_annotation.pages:
        for block in page.blocks:
            for para in block.paragraphs:
                for word in para.words:
                    text = "".join(s.text for s in word.symbols)
                    vs = word.bounding_box.vertices
                    # Axis-aligned box from the (possibly rotated) quad vertices.
                    xs = [v.x for v in vs]; ys = [v.y for v in vs]
                    left, top = min(xs), min(ys)
                    w, h = max(xs) - left, max(ys) - top
                    cx, cy = left + w/2.0, top + h/2.0
                    words.append({"text": text, "left": left, "top": top,
                                  "w": w, "h": h, "cx": cx, "cy": cy})
    return words, full_text
# ============================================================
# Skew estimation (Hough + OCR pairs)
# ============================================================
def weighted_median(pairs: List[Tuple[float, float]]) -> float:
    """Weighted median of (value, weight) pairs; returns 0.0 for empty input."""
    if not pairs:
        return 0.0
    ordered = sorted(pairs, key=lambda item: item[0])
    half_weight = sum(weight for _, weight in ordered) / 2.0
    running = 0.0
    for value, weight in ordered:
        running += weight
        if running >= half_weight:
            return value
    # Numerical fallback: the largest value.
    return ordered[-1][0]
def estimate_skew_pairs(words: List[Dict],
                        y_band_mult: float = 2.0,
                        min_dx_mult: float = 0.8,
                        max_abs_deg: float = 15.0) -> Tuple[float, int]:
    """Estimate page skew (degrees) from OCR word-pair angles.

    For each word, find its nearest right-hand neighbour within a vertical
    band and take the angle of the connecting segment; the distance-weighted
    median of the IQR-trimmed angles is the estimate.
    Returns (angle_deg, n_pairs); (0.0, 0) when no usable pairs exist.
    """
    if not words: return 0.0, 0
    widths = [w["w"] for w in words if w["w"]>0]
    heights = [w["h"] for w in words if w["h"]>0]
    w_med = float(np.median(widths) if widths else 10.0)
    h_med = float(np.median(heights) if heights else 16.0)
    # Thresholds scale with the page's typical glyph size.
    y_band = y_band_mult * h_med
    min_dx = max(4.0, min_dx_mult * w_med)
    words_sorted = sorted(words, key=lambda w: (w["cy"], w["cx"]))
    pairs: List[Tuple[float,float]] = []
    for i, wi in enumerate(words_sorted):
        best_j = None; best_dx = None
        for j in range(i+1, len(words_sorted)):
            wj = words_sorted[j]
            dy = wj["cy"] - wi["cy"]
            # Sorted by cy, so once dy exceeds the band no later word qualifies.
            if dy > y_band: break
            if abs(dy) <= y_band:
                dx = wj["cx"] - wi["cx"]
                # Neighbour must be to the right and at least min_dx away.
                if dx <= 0 or dx < min_dx: continue
                if best_dx is None or dx < best_dx:
                    best_dx, best_j = dx, j
        if best_j is None: continue
        wj = words_sorted[best_j]
        dx = wj["cx"] - wi["cx"]; dy = wj["cy"] - wi["cy"]
        ang = math.degrees(math.atan2(dy, dx))
        if abs(ang) <= max_abs_deg:
            # Weight by horizontal span: longer baselines are more reliable.
            pairs.append((ang, max(1.0, dx)))
    if not pairs: return 0.0, 0
    # IQR-trim outlier angles before taking the weighted median.
    vals = np.array([v for v,_ in pairs], dtype=float)
    q1, q3 = np.percentile(vals, [25,75]); iqr = q3-q1
    lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
    trimmed = [(v,w) for v,w in pairs if lo <= v <= hi] or pairs
    return float(weighted_median(trimmed)), len(trimmed)
def estimate_skew_hough(img: np.ndarray, thr: int = 180) -> Tuple[float, int]:
    """Estimate page skew (degrees) from Hough line angles.

    Returns (angle_deg, n_lines); (0.0, 0) when no lines are detected.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (3, 3), 0)
    edges = cv2.Canny(gray, 60, 160, apertureSize=3)
    detected = cv2.HoughLines(edges, 1, np.pi / 180, threshold=thr)
    if detected is None:
        return 0.0, 0
    angles = []
    for rho, theta in detected[:, 0, :]:
        deg = (theta - np.pi / 2.0) * 180.0 / np.pi
        # Fold into (-45, 45] so near-vertical rules read as small skews too.
        while deg > 45:
            deg -= 90
        while deg < -45:
            deg += 90
        angles.append(deg)
    angles = np.array(angles, dtype=float)
    center = float(np.median(angles))
    # Refine on lines within 10 degrees of the initial median.
    inliers = angles[np.abs(angles - center) <= 10.0]
    refined = float(np.median(inliers)) if inliers.size else center
    return refined, int(angles.size)
# ============================================================
# Rotation (image + coordinates) and scoring
# ============================================================
def rotation_matrix_keep_bounds(shape_hw: Tuple[int,int], angle_deg: float) -> Tuple[np.ndarray, Tuple[int,int]]:
    """Affine matrix rotating about the image center, with the canvas expanded
    so no content is cropped. Returns (M, (new_h, new_w))."""
    height, width = shape_hw
    center = (width / 2.0, height / 2.0)
    M = cv2.getRotationMatrix2D(center, angle_deg, 1.0)
    abs_cos = abs(M[0, 0])
    abs_sin = abs(M[0, 1])
    new_w = int(height * abs_sin + width * abs_cos)
    new_h = int(height * abs_cos + width * abs_sin)
    # Translate so the rotated content sits centered in the enlarged canvas.
    M[0, 2] += (new_w / 2) - center[0]
    M[1, 2] += (new_h / 2) - center[1]
    return M, (new_h, new_w)
def rotate_image_keep_bounds(img: np.ndarray, angle_deg: float, border_value=255) -> np.ndarray:
    """Rotate *img* by angle_deg without cropping; new border pixels get border_value."""
    matrix, (new_h, new_w) = rotation_matrix_keep_bounds(img.shape[:2], angle_deg)
    return cv2.warpAffine(
        img, matrix, (new_w, new_h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_CONSTANT,
        borderValue=border_value,
    )
def transform_words(words: List[Dict], shape_hw: Tuple[int,int], angle_deg: float) -> List[Dict]:
    """Map each word center through the keep-bounds rotation.

    Returns copies of the word dicts with cx_rot/cy_rot keys added.
    """
    matrix, _ = rotation_matrix_keep_bounds(shape_hw, angle_deg)
    rotated = []
    for word in words:
        x_new, y_new = (matrix @ np.array([word["cx"], word["cy"], 1.0])).tolist()
        updated = dict(word)
        updated["cx_rot"] = float(x_new)
        updated["cy_rot"] = float(y_new)
        rotated.append(updated)
    return rotated
def preview_score(img: np.ndarray, deskew_angle: float) -> float:
    """Cost of a candidate deskew angle: residual Hough skew measured on a
    downscaled rotated preview. Lower is better; 90.0 when no lines found."""
    h, w = img.shape[:2]
    scale = 1200.0 / max(h, w)
    if scale < 1:
        preview = cv2.resize(img, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_AREA)
    else:
        preview = img
    rotated = rotate_image_keep_bounds(preview, deskew_angle, border_value=255)
    residual, n_lines = estimate_skew_hough(rotated, thr=140)
    return abs(residual) if n_lines > 0 else 90.0
# ============================================================
# Slope-based clustering (pre-rotation)
# ============================================================
def line_from_points(p0, p1):
    """Fit y = m*x + b through two points; vertical lines return (inf, x0)."""
    (ax, ay), (bx, by) = p0, p1
    run = bx - ax
    if abs(run) < 1e-9:
        return float("inf"), ax
    slope = (by - ay) / run
    return slope, ay - slope * ax
def perp_distance(m, b, x, y):
    """Perpendicular distance from (x, y) to line y = m*x + b (x = b if m is inf)."""
    if math.isinf(m):
        return abs(x - b)
    return abs(m * x - y + b) / math.hypot(m, 1.0)
def refit_line(points: List[Tuple[float,float]]) -> Tuple[float,float]:
    """Least-squares (m, b) fit. A single point yields a horizontal line at
    its y; zero x-variance (vertical data) yields (inf, x_mean)."""
    if len(points) == 1:
        return 0.0, points[0][1]
    xs = [px for px, _ in points]
    ys = [py for _, py in points]
    x_mean = sum(xs) / len(xs)
    y_mean = sum(ys) / len(ys)
    covariance = sum((px - x_mean) * (py - y_mean) for px, py in points)
    variance = sum((px - x_mean) ** 2 for px in xs)
    if abs(variance) < 1e-12:
        return float("inf"), x_mean
    slope = covariance / variance
    return slope, y_mean - slope * x_mean
def project_t(m, b, x0, y0, x, y):
    """Scalar position of (x, y) along the line's direction, measured from (x0, y0)."""
    if math.isinf(m):
        return y - y0
    return ((x - x0) + m * (y - y0)) / math.sqrt(1 + m * m)
def _build_line_result(words, idxs, m, b, rotated=False):
    """Assemble a line dict from the word indices lying on fitted line y = m*x + b.

    Words are ordered by their projection along the line direction so text
    reads left-to-right even on tilted lines. With rotated=True the
    cx_rot/cy_rot coordinates are used instead of cx/cy.
    """
    # Use the left-most word as the projection origin.
    origin_idx = min(idxs, key=lambda i: (words[i]["cx_rot"] if rotated else words[i]["cx"]))
    x0 = words[origin_idx]["cx_rot"] if rotated else words[origin_idx]["cx"]
    y0 = words[origin_idx]["cy_rot"] if rotated else words[origin_idx]["cy"]
    ordered = sorted(
        idxs,
        key=lambda i: project_t(
            m, b, x0, y0,
            words[i]["cx_rot"] if rotated else words[i]["cx"],
            words[i]["cy_rot"] if rotated else words[i]["cy"]
        )
    )
    line_words = [words[i] for i in ordered]
    text = " ".join(w["text"] for w in line_words)
    xs = [(w["cx_rot"] if rotated else w["cx"]) for w in line_words]
    ys = [(w["cy_rot"] if rotated else w["cy"]) for w in line_words]
    return {
        "text": text,
        "words": line_words,
        "slope": m,
        "center_x": float(sum(xs)/len(xs)),
        "center_y": float(sum(ys)/len(ys)),
        "count": len(line_words),
    }
def cluster_tilted_lines(words: List[Dict]) -> List[Dict]:
    """Group words into slope-aware text lines (works on un-deskewed pages).

    Greedy RANSAC-like procedure: take the top-most unassigned word as a seed,
    try lines through each of its 10 horizontally-nearest neighbours, keep the
    candidate with the most inliers (within a perpendicular tolerance scaled
    by the median glyph height), refit on those inliers, then absorb any
    remaining words close to the refined line. Lines are returned top-to-bottom.
    """
    if not words: return []
    hs = sorted([w["h"] for w in words if w["h"]>0])
    h_med = hs[len(hs)//2] if hs else 16.0
    perp_tol = PERP_TOL_FACTOR * h_med   # max perpendicular distance to count as inlier
    band_dy = SEED_BAND_H * h_med        # vertical window for seed candidates
    remaining = set(range(len(words)))
    order = sorted(remaining, key=lambda i: (words[i]["cy"], words[i]["cx"]))
    lines = []
    while remaining:
        # Seed: top-most (then left-most) unassigned word.
        seed_idx = next(i for i in order if i in remaining)
        remaining.remove(seed_idx)
        sx, sy = words[seed_idx]["cx"], words[seed_idx]["cy"]
        cand_idxs = [j for j in remaining if abs(words[j]["cy"] - sy) <= band_dy]
        if not cand_idxs:
            if ALLOW_SINGLETON:
                # Lone word: emit a horizontal one-word "line".
                m,b = refit_line([(sx,sy)])
                lines.append(_build_line_result(words, {seed_idx}, m, b))
            continue
        cand_idxs.sort(key=lambda j: abs(words[j]["cx"] - sx))
        best_inliers = None; best_mb = None
        # Hypothesize a line through each of the nearest candidates; keep the best.
        for j in cand_idxs[:min(10, len(cand_idxs))]:
            m,b = line_from_points((sx,sy), (words[j]["cx"], words[j]["cy"]))
            inliers = {seed_idx, j}
            for k in remaining:
                xk, yk = words[k]["cx"], words[k]["cy"]
                if perp_distance(m,b,xk,yk) <= perp_tol:
                    inliers.add(k)
            if best_inliers is None or len(inliers) > len(best_inliers):
                best_inliers, best_mb = inliers, (m,b)
        m,b = best_mb
        # Refit on all inliers, then expand once more with the refined line.
        pts = [(words[i]["cx"], words[i]["cy"]) for i in best_inliers]
        m,b = refit_line(pts)
        expanded = set(best_inliers)
        for idx in list(remaining):
            xk, yk = words[idx]["cx"], words[idx]["cy"]
            if perp_distance(m,b,xk,yk) <= perp_tol:
                expanded.add(idx)
        for idx in expanded:
            if idx in remaining:
                remaining.remove(idx)
        lines.append(_build_line_result(words, expanded, m, b))
    lines.sort(key=lambda L: L["center_y"])
    return lines
# ============================================================
# Post-rotation grouping (simple horizontal lines)
# ============================================================
def group_horizontal_lines(rotated_words: List[Dict]) -> List[Dict]:
    """Group deskewed words into lines by simple y-proximity.

    Words sorted by (cy_rot, cx_rot) are accumulated while they stay within a
    tolerance (POST_Y_TOL_FACTOR x median glyph height) of the group's FIRST
    word; each flushed group becomes one line dict, returned top-to-bottom.
    """
    if not rotated_words: return []
    hs = sorted([w["h"] for w in rotated_words if w["h"]>0])
    h_med = hs[len(hs)//2] if hs else 16.0
    y_tol = POST_Y_TOL_FACTOR * h_med
    idxs = list(range(len(rotated_words)))
    idxs.sort(key=lambda i: (rotated_words[i]["cy_rot"], rotated_words[i]["cx_rot"]))
    lines = []
    cur = []
    def flush():
        # Emit the current group as one line (no-op when empty).
        nonlocal cur
        if not cur: return
        xs = [rotated_words[i]["cx_rot"] for i in cur]
        ys = [rotated_words[i]["cy_rot"] for i in cur]
        m,b = refit_line(list(zip(xs,ys)))
        cur_sorted = sorted(cur, key=lambda i: rotated_words[i]["cx_rot"])
        lines.append(_build_line_result(rotated_words, set(cur_sorted), m, b, rotated=True))
        cur = []
    for i in idxs:
        if not cur:
            cur = [i]
        else:
            # Compare against the group's first word, not the previous word,
            # so a slowly drifting baseline cannot chain groups together.
            y0 = rotated_words[cur[0]]["cy_rot"]
            yi = rotated_words[i]["cy_rot"]
            if abs(yi - y0) <= y_tol:
                cur.append(i)
            else:
                flush()
                cur = [i]
    flush()
    lines.sort(key=lambda L: L["center_y"])
    return lines
# ============================================================
# Utilities: dump lines to txt (only if DEBUG)
# ============================================================
def slope_to_deg(m: float) -> float:
    """Convert a line slope to degrees; infinite slope maps to 90 degrees."""
    return 90.0 if math.isinf(m) else math.degrees(math.atan(m))
def write_lines_txt(base_path: str, suffix: str, lines: List[Dict]) -> Optional[str]:
    """Dump clustered lines to '<base>_<suffix>.txt' for debugging.

    No-op (returns None) unless the module-level DEBUG flag is enabled;
    otherwise returns the path written.
    """
    if not DEBUG:
        return None
    txt_path = f"{os.path.splitext(base_path)[0]}_{suffix}.txt"
    with open(txt_path, "w", encoding="utf-8") as out:
        out.write(f"# {os.path.basename(base_path)} ({suffix})\n")
        for idx, line in enumerate(lines, 1):
            angle = slope_to_deg(line["slope"])
            out.write(f"[{idx:03d}] words={line['count']:>3} slope={angle:+.3f}°\n")
            out.write(line["text"] + "\n\n")
    return txt_path
# ============================================================
# Smart deskew + full pipeline (in-memory; returns words + full_text)
# ============================================================
def smart_deskew_with_lines(image_path: str,
                            out_path: Optional[str] = None,
                            clamp_deg: float = 30.0,
                            use_vision: bool = True) -> Dict:
    """One-stop deskew + OCR line extraction for a single image.

    Estimates skew with two independent methods (Hough lines and OCR word
    pairs), scores a fine grid of candidate angles on a downscaled preview,
    rotates in-memory by the best angle, and builds both slope-aware (pre)
    and horizontal (post) line groupings from the SAME single Vision pass.

    Returns a dict with angle_deg, pre_lines/post_lines, words, full_text,
    and debug artifact paths (populated only when DEBUG is True).
    Raises FileNotFoundError if the image cannot be read.
    """
    img = cv2.imread(image_path, cv2.IMREAD_COLOR)
    if img is None: raise FileNotFoundError(image_path)
    words, full_text = ([], "")
    if use_vision:
        # Single Vision OCR pass; reused for skew estimation AND extraction.
        words, full_text = extract_words_and_text(image_path)
    a_h, n_h = estimate_skew_hough(img)
    a_p, n_p = (0.0, 0)
    if words:
        a_p, n_p = estimate_skew_pairs(words, y_band_mult=2.0, min_dx_mult=0.8, max_abs_deg=15.0)
    # Candidate angles (both signs) from estimators with enough samples.
    candidates = []
    if n_h >= 10: candidates += [a_h, -a_h]
    if n_p >= 10: candidates += [a_p, -a_p]
    if not candidates: candidates = [0.0]
    # Clamp to +/- clamp_deg and drop near-duplicate candidates (< 0.05 deg apart).
    cand = []
    for a in candidates:
        a = float(max(-clamp_deg, min(clamp_deg, a)))
        if all(abs(a - b) > 0.05 for b in cand):
            cand.append(a)
    # Fine grid of +/- 0.6 deg around each surviving candidate.
    grid = []
    for a in cand:
        for d in (-0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6):
            g = a + d
            if all(abs(g - x) > 0.05 for x in grid):
                grid.append(g)
    # Pick the grid angle whose rotated preview has the smallest residual skew.
    scored = [(a, preview_score(img, -a)) for a in grid]
    best_angle, best_cost = min(scored, key=lambda t: t[1])
    # Debug print kept as a comment
    # print(f"[smart] hough={a_h:.3f}°(n={n_h}) pairs={a_p:.3f}°(n={n_p}) tried={', '.join(f'{a:+.2f}°' for a,_ in scored)} → chosen {best_angle:+.2f}° (cost={best_cost:.3f})")
    # Rotate in-memory. Save only if DEBUG.
    rotated = rotate_image_keep_bounds(img, -best_angle, border_value=255)
    if DEBUG and out_path:
        cv2.imwrite(out_path, rotated)
    result = {
        "angle_deg": float(best_angle),
        "hough_lines": int(n_h),
        "pair_samples": int(n_p),
        "out_path": out_path if DEBUG else None,
        "pre_txt": None,
        "post_txt": None,
        "pre_lines": [],
        "post_lines": [],
        "words": words,
        "full_text": full_text,
    }
    if words:
        # Pre: slope-aware clustering on original coords; Post: horizontal
        # grouping on coords mapped through the chosen rotation.
        pre_lines = cluster_tilted_lines(words)
        result["pre_lines"] = pre_lines
        result["pre_txt"] = write_lines_txt(image_path, "lines_pre", pre_lines)  # only if DEBUG
        rot_words = transform_words(words, img.shape[:2], -best_angle)
        post_lines = group_horizontal_lines(rot_words)
        result["post_lines"] = post_lines
        result["post_txt"] = write_lines_txt(image_path, "lines_post", post_lines)  # only if DEBUG
        # More debug prints kept as comments
        # def preview(lines, tag):
        #     print(f"  {tag} ({len(lines)} lines)")
        #     for L in lines[:5]:
        #         ang = slope_to_deg(L["slope"])
        #         print(f"    [{L['count']:>3} w] slope={ang:+.3f}° | {L['text'][:90]}")
        # preview(pre_lines, "pre (slope-aware)")
        # preview(post_lines, "post (horizontal)")
        # if DEBUG:
        #     print(f"  → wrote: {result['pre_txt']} and {result['post_txt']}")
    return result
# ============================================================
# Multi-client extraction from post lines (robust)
# ============================================================
MEMBER_RE = re.compile(r'\bMEMBER NAME\s*:\s*(.+)', re.IGNORECASE)
MEMBERID_RE = re.compile(r'\bMEMBER ID\s*:\s*([A-Za-z0-9]+)', re.IGNORECASE)
ICN_LINE_RE = re.compile(r'^\s*\d{12,}\b')
AMOUNT_RE = re.compile(r'(\d{1,3}(?:,\d{3})*\.\d{2})') # decimals only
DATE6_RE = re.compile(r'\b\d{6}\b')
PD_ROW_RE = re.compile(r'\bPD\s+(D?\d{4})\b', re.IGNORECASE)
TOOTH_RE = re.compile(r'^(?:[1-9]|[12][0-9]|3[0-2]|[A-Ta-t])$')
SURFACE_RE = re.compile(r'^[MDBOILFP]{1,4}$', re.IGNORECASE)
def _to_float(s: str) -> float:
    """Parse a comma-grouped amount like '1,234.56'; returns 0.0 on any failure."""
    try:
        return float(s.replace(',', ''))
    except Exception:
        # Malformed/non-string tokens deliberately fall back to 0.0.
        return 0.0
def _parse_pd_line(t: str) -> Optional[Tuple[str, Optional[float], Optional[float], Optional[float], Optional[str], Optional[str], Optional[str]]]:
    """
    Parse a single PD line.
    Returns: (CDT, billed, allowed, paid, date6, tooth, surface),
    or None when the line carries no 'PD <code>' marker.
    Amounts are taken as the LAST three decimal numbers on the line;
    tooth/surface are searched between the CDT code and the 6-digit date.
    """
    m = PD_ROW_RE.search(t)
    if not m:
        return None
    code = m.group(1)
    # Normalize to a D-prefixed CDT code.
    code = code if code.upper().startswith('D') else f'D{code}'
    amts = [_to_float(x) for x in AMOUNT_RE.findall(t)]
    billed = allowed = paid = None
    if len(amts) >= 3:
        billed, allowed, paid = amts[-3:]
    d = None
    md = DATE6_RE.search(t)
    if md:
        d = md.group(0)
    tooth = None
    surface = None
    tokens = t.split()
    # Locate the code token; fall back to a scan, since the raw token may
    # lack the 'D' prefix that was normalized above.
    try:
        code_idx = tokens.index(code)
    except ValueError:
        code_idx = None
        for i, tok in enumerate(tokens):
            if PD_ROW_RE.match(f'PD {tok}'):
                code_idx = i
                break
    if code_idx is not None:
        # Window between the code and the service date holds tooth/surface.
        date_idx = None
        for i in range(code_idx + 1, len(tokens)):
            if DATE6_RE.fullmatch(tokens[i]):
                date_idx = i
                break
        window = tokens[code_idx + 1: date_idx if date_idx is not None else len(tokens)]
        for tok in window:
            if TOOTH_RE.fullmatch(tok):
                tooth = tok.upper()
                break
        start_j = 0
        if tooth is not None:
            # Surface is only searched AFTER the tooth token.
            for j, tok in enumerate(window):
                if tok.upper() == tooth:
                    start_j = j + 1
                    break
        for tok in window[start_j:]:
            if SURFACE_RE.fullmatch(tok):
                surface = tok.upper()
                break
    return code, billed, allowed, paid, d, tooth, surface
def extract_all_clients_from_lines(post_lines: List[dict]) -> List[dict]:
    """
    Split strictly by MEMBER NAME lines; ignore anything before the first name.
    For each member block, look up ICN from the nearest line above the member header.
    Parse each PD line for CDT, Date SVC, Billed, Allowed, Paid (decimals only).
    Members with no PD lines still yield one (mostly empty) row so they are
    visible in the output.
    """
    texts = [L["text"] for L in post_lines]
    starts = [i for i,t in enumerate(texts) if MEMBER_RE.search(t)]
    if not starts:
        return []
    out_rows = []
    for si, start in enumerate(starts):
        # Block spans from this member header to the next one (or EOF).
        end = starts[si+1] if si+1 < len(starts) else len(texts)
        # header line with MEMBER NAME
        name_line = texts[start]
        raw_name = MEMBER_RE.search(name_line).group(1).strip()
        # Stop at "MEMBER ID" (case-insensitive) and other headers that can
        # share the same OCR line as the name.
        cut_points = ["MEMBER ID", "OTH INS CD", "PA:", "DIAG:"]
        mname = raw_name
        for cp in cut_points:
            idx = mname.upper().find(cp)
            if idx != -1:
                mname = mname[:idx].strip()
        # Debug
        # print(raw_name); print(mname)
        # member id: first match anywhere within the block
        mid = ""
        for t in texts[start:end]:
            m = MEMBERID_RE.search(t)
            if m:
                mid = m.group(1).strip()
                break
        # ICN: search up to 5 lines ABOVE the member header (nearest first)
        icn = ""
        for k in range(start-1, max(-1, start-6), -1):
            if k < 0: break
            mm = ICN_LINE_RE.match(texts[k])
            if mm:
                icn = mm.group(0)
                break
        # PD lines in the block: one output row per parsed PD line
        had_pd = False
        for t in texts[start:end]:
            # Quick pre-filter: require a standalone " PD " token.
            if " PD " not in f" {t} ":
                continue
            parsed = _parse_pd_line(t)
            if not parsed:
                continue
            had_pd = True
            code, billed, allowed, paid, dsvc, tooth, surface = parsed
            out_rows.append({
                'Patient Name': mname.title() if mname else "",
                'Patient ID': mid,
                'ICN': icn,
                'CDT Code': code,
                'Tooth': tooth if tooth else "",
                #'Surface': surface if surface else "",
                'Date SVC': dsvc if dsvc else "",
                'Billed Amount': billed if billed is not None else "",
                'Allowed Amount': allowed if allowed is not None else "",
                'Paid Amount': paid if paid is not None else "",
                'Extraction Success': True,
            })
        if not had_pd:
            # Member with no billable lines: emit a stub row so the member
            # still appears; success only if we at least got a name or ID.
            out_rows.append({
                'Patient Name': mname.title() if mname else "",
                'Patient ID': mid,
                'ICN': icn,
                'CDT Code': "",
                'Tooth': "",
                #'Surface': "",
                'Date SVC': "",
                'Billed Amount': "",
                'Allowed Amount': "",
                'Paid Amount': "",
                'Extraction Success': bool(mname or mid),
            })
    return out_rows
# ============================================================
# ExcelGenerator
# ============================================================
class ExcelGenerator:
    """Renders extraction rows into a styled Excel workbook via openpyxl."""

    def __init__(self):
        # Shared styles reused across sheets.
        self.header_fill = PatternFill(start_color="366092", end_color="366092", fill_type="solid")
        self.header_font = Font(color="FFFFFF", bold=True)
        self.border = Border(
            left=Side(style='thin'),
            right=Side(style='thin'),
            top=Side(style='thin'),
            bottom=Side(style='thin')
        )
        self.center_alignment = Alignment(horizontal='center', vertical='center')

    def create_excel_file(self, df: pd.DataFrame) -> bytes:
        """Build the workbook (data sheet + summary sheet) and return it as bytes."""
        wb = Workbook()
        ws = wb.active
        ws.title = "Medical Billing Extract"
        # Title banner on row 1; row 2 stays blank, so the header lands on row 3.
        ws['A1'] = f"Medical Billing OCR Extract - Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        ws.merge_cells('A1:H1')
        ws['A1'].font = Font(size=14, bold=True)
        ws['A1'].alignment = self.center_alignment
        ws.append([])
        excel_df = self.prepare_dataframe_for_excel(df)
        for r in dataframe_to_rows(excel_df, index=False, header=True):
            ws.append(r)
        # +3 accounts for the title, blank, and header rows above the data.
        self.format_worksheet(ws, len(excel_df) + 3)
        self.add_summary_sheet(wb, excel_df)
        output = io.BytesIO()
        wb.save(output)
        output.seek(0)
        return output.getvalue()

    def prepare_dataframe_for_excel(self, df: pd.DataFrame) -> pd.DataFrame:
        """Reorder known columns, format amounts as currency, map success to Yes/No."""
        excel_df = df.copy()
        column_order = [
            'Patient Name', 'Patient ID', 'ICN', 'CDT Code', 'Tooth', 'Date SVC', #'Surface',
            'Billed Amount', 'Allowed Amount', 'Paid Amount',
            'Extraction Success', 'Source File'
        ]
        existing = [c for c in column_order if c in excel_df.columns]
        excel_df = excel_df[existing]
        for amount_col in ['Billed Amount', 'Allowed Amount', 'Paid Amount']:
            if amount_col in excel_df.columns:
                excel_df[amount_col] = excel_df[amount_col].apply(self.format_currency)
        if 'Extraction Success' in excel_df.columns:
            excel_df['Extraction Success'] = excel_df['Extraction Success'].apply(lambda x: 'Yes' if x else 'No')
        return excel_df

    def format_currency(self, value):
        """Render a numeric/str amount as '$1,234.56'; empty/NaN stays empty,
        unparseable values pass through as strings."""
        if pd.isna(value) or value == "":
            return ""
        try:
            if isinstance(value, str):
                clean_value = value.replace('$', '').replace(',', '')
                value = float(clean_value)
            return f"${value:,.2f}"
        except (ValueError, TypeError):
            return str(value)

    def format_worksheet(self, ws, data_rows):
        """Style the header row (row 3) and the data rows below it."""
        header_row = 3
        for cell in ws[header_row]:
            if cell.value:
                cell.fill = self.header_fill
                cell.font = self.header_font
                cell.alignment = self.center_alignment
                cell.border = self.border
        for row in range(header_row + 1, data_rows + 1):
            for cell in ws[row]:
                cell.border = self.border
                cell.alignment = Alignment(horizontal='left', vertical='center')
        self.auto_adjust_columns(ws)
        self.add_conditional_formatting(ws, header_row, data_rows)

    def auto_adjust_columns(self, ws):
        """Size each column to its longest cell value, capped at width 50."""
        max_col = ws.max_column
        max_row = ws.max_row
        for col_idx in range(1, max_col + 1):
            max_len = 0
            for row in range(1, max_row + 1):
                cell = ws.cell(row=row, column=col_idx)
                # Merged cells have no addressable value; skip them.
                if isinstance(cell, MergedCell):
                    continue
                try:
                    val = cell.value
                    if val is None:
                        continue
                    max_len = max(max_len, len(str(val)))
                except Exception:
                    pass
            letter = get_column_letter(col_idx)
            ws.column_dimensions[letter].width = min(max_len + 2, 50)

    def add_conditional_formatting(self, ws, header_row, data_rows):
        """Color the 'Extraction Success' column green for Yes, pink for No."""
        success_col = None
        for col, cell in enumerate(ws[header_row], 1):
            if cell.value == 'Extraction Success':
                success_col = col
                break
        if success_col:
            for row in range(header_row + 1, data_rows + 1):
                cell = ws.cell(row=row, column=success_col)
                if cell.value == 'Yes':
                    cell.fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid")
                elif cell.value == 'No':
                    cell.fill = PatternFill(start_color="FFB6C1", end_color="FFB6C1", fill_type="solid")

    def add_summary_sheet(self, wb, df):
        """Append a 'Summary' sheet with row counts and a generation timestamp."""
        ws = wb.create_sheet(title="Summary")
        ws['A1'] = "Extraction Summary"
        ws['A1'].font = Font(size=16, bold=True)
        ws.merge_cells('A1:B1')
        row = 3
        stats = [
            ("Total Rows", len(df)),
            ("Successful", len(df[df['Extraction Success'] == 'Yes']) if 'Extraction Success' in df.columns else 0),
            ("Failed", len(df[df['Extraction Success'] == 'No']) if 'Extraction Success' in df.columns else 0),
        ]
        for name, val in stats:
            ws[f'A{row}'] = name
            ws[f'B{row}'] = val
            ws[f'A{row}'].font = Font(bold=True)
            row += 1
        # NOTE(review): constructs a fresh ExcelGenerator instead of using
        # self.auto_adjust_columns(ws) — result is identical but wasteful;
        # confirm intent before changing.
        ExcelGenerator().auto_adjust_columns(ws)
        row += 2
        ws[f'A{row}'] = f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
        ws[f'A{row}'].font = Font(italic=True)
# ============================================================
# Runner: glue everything together
# ============================================================
def process_images_to_excel(files: List[str], out_excel: str, deskewed_only: bool=False) -> None:
    """Run the full pipeline over *files* and write a formatted Excel workbook.

    With deskewed_only=True the deskew step is skipped and Vision words are
    grouped directly (cx/cy copied to cx_rot/cy_rot). A per-image failure is
    recorded as an 'Extraction Success': False row instead of aborting the batch.
    """
    excel_gen = ExcelGenerator()
    records: List[Dict[str, Any]] = []
    for src in files:
        try:
            if deskewed_only:
                img = cv2.imread(src, cv2.IMREAD_COLOR)
                if img is None:
                    raise FileNotFoundError(src)
                words, _ = extract_words_and_text(src)
                # Image is assumed already deskewed: rotated coords == original.
                rot_words = []
                for w in words:
                    ww = dict(w)
                    ww["cx_rot"], ww["cy_rot"] = w["cx"], w["cy"]
                    rot_words.append(ww)
                post_lines = group_horizontal_lines(rot_words)
                post_txt = write_lines_txt(src, "lines_post", post_lines)  # only if DEBUG
                rows = extract_all_clients_from_lines(post_lines)
                for r in rows:
                    r["Source File"] = os.path.basename(src)
                    records.append(r)
                # if DEBUG: print(f"{src} → parsed {len(rows)} PD rows (wrote {post_txt})")
            else:
                base, ext = os.path.splitext(src)
                dst = f"{base}_deskewed{ext if ext else '.jpg'}" if DEBUG else None
                info = smart_deskew_with_lines(src, dst, clamp_deg=30.0, use_vision=True)
                post_lines = info.get("post_lines", []) if info else []
                rows = extract_all_clients_from_lines(post_lines) if post_lines else []
                for r in rows:
                    r["Source File"] = os.path.basename(src)
                    records.append(r)
                # if DEBUG: print(f"{src} → rotated by {-info['angle_deg']:.3f}° → {dst}")
        except Exception as e:
            # if DEBUG: print(f"{src}: {e}")
            # Failure placeholder row for this image (keeps the batch going).
            records.append({
                'Patient Name': "", 'Patient ID': "", 'ICN': "", 'CDT Code': "",
                'Date SVC': "", 'Billed Amount': "", 'Allowed Amount': "", 'Paid Amount': "",
                'Extraction Success': False, 'Source File': os.path.basename(src),
            })
    df = pd.DataFrame.from_records(records)
    data = excel_gen.create_excel_file(df)
    with open(out_excel, "wb") as f:
        f.write(data)
    # if DEBUG:
    #     print(f"\n✅ Wrote Excel → {out_excel}")
    #     print("   (and per-image: *_lines_pre.txt, *_lines_post.txt, *_deskewed.* when DEBUG=True)")
# ============================================================
# CLI
# ============================================================
def main():
    """CLI entry point: gather image paths from --files/--input, then run the pipeline."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", help="Folder of images (jpg/png/tif).", default=None)
    ap.add_argument("--files", nargs="*", help="Specific image files.", default=None)
    ap.add_argument("--out", help="Output Excel path.", required=True)
    ap.add_argument("--deskewed-only", action="store_true",
                    help="Only process files whose name contains '_deskewed'; skip deskew step.")
    args = ap.parse_args()
    paths: List[str] = []
    if args.files:
        paths.extend(candidate for candidate in args.files if os.path.isfile(candidate))
    if args.input and os.path.isdir(args.input):
        for pattern in ("*.jpg", "*.jpeg", "*.png", "*.tif", "*.tiff", "*.bmp"):
            paths.extend(glob.glob(os.path.join(args.input, pattern)))
    if args.deskewed_only:
        paths = [p for p in paths if "_deskewed" in os.path.basename(p).lower()]
    if not paths:
        raise SystemExit("No input images found. Use --files or --input (and --deskewed-only if desired).")
    if not os.environ.get("GOOGLE_APPLICATION_CREDENTIALS"):
        # Deliberately silent; the Vision client itself fails loudly if creds are missing.
        pass
    process_images_to_excel(paths, args.out, deskewed_only=args.deskewed_only)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,8 @@
{
"name": "pdfservice",
"private": true,
"scripts": {
"postinstall": "pip install -r requirements.txt",
"dev": "python main.py"
}
}

View File

@@ -0,0 +1,10 @@
fastapi
uvicorn[standard]
google-cloud-vision
opencv-python-headless
pytesseract
pillow
pandas
openpyxl
numpy
python-multipart

Binary file not shown.

View File

@@ -0,0 +1,5 @@
This code was written solely to extract procedure-code data from the MassHealth PDF, to make that process easier.
It was a one-time process and is not part of the app's core functionality.
It is kept here in case the data needs to be extracted again in the future.

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python3
"""
Compare a main dental JSON file with one or more other JSON files and
return all records whose 'Procedure Code' is NOT present in the main file.
- Matching key: 'Procedure Code' (case-insensitive, trimmed).
- Keeps the full record from the other files (including extra fields like 'Full Price').
- Deduplicates by Procedure Code across the collected "missing" results.
CONFIG: set MAIN_PATH, OTHER_PATHS, OUT_PATH below.
"""
import json
from pathlib import Path
from typing import List, Dict, Any
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
MAIN_PATH: str = "procedureCodesMain.json"  # your main JSON (with PriceLTEQ21/PriceGT21)
OTHER_PATHS: List[str] = [
    "procedureCodesOld.json",  # one or more other JSON files to compare against the main
    # "other2.json",
]
OUT_PATH: str = "not_in_main.json"  # where to write the results (records absent from MAIN_PATH)
# =========================
def _load_json_any(path: str) -> List[Dict[str, Any]]:
"""
Load JSON. Accept:
- a list of objects
- a single object (wraps into a list)
"""
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, dict):
return [data]
if isinstance(data, list):
# filter out non-dict items defensively
return [x for x in data if isinstance(x, dict)]
raise ValueError(f"Unsupported JSON top-level type in {path}: {type(data)}")
def _norm_code(record: Dict[str, Any]) -> str:
# Normalize the 'Procedure Code' for matching
code = str(record.get("Procedure Code", "")).strip().upper()
# Some PDFs might have stray spaces, tabs, or zero-width chars
code = "".join(ch for ch in code if not ch.isspace())
return code
def collect_main_codes(main_path: str) -> set:
    """
    Return the set of non-empty normalized procedure codes found in *main_path*.

    Fix: the old comprehension called _norm_code twice per record (once to
    build the value, once in the filter); normalize each record exactly once.
    """
    normalized = (_norm_code(rec) for rec in _load_json_any(main_path))
    return {code for code in normalized if code}
def collect_missing_records(other_paths: List[str], main_codes: set) -> List[Dict[str, Any]]:
    """
    Gather records from *other_paths* whose normalized 'Procedure Code' is
    absent from *main_codes*, keeping only the first record seen per code.

    Returns the full original records, sorted by normalized code.
    """
    missing: Dict[str, Dict[str, Any]] = {}
    for path in other_paths:
        for record in _load_json_any(path):
            code = _norm_code(record)
            # Skip blank codes, codes already present in main, and duplicates.
            if code and code not in main_codes and code not in missing:
                missing[code] = record
    return [record for _, record in sorted(missing.items())]
def main():
    """Validate configured files, compute records missing from main, write and echo them."""
    # Fail fast with a clear message if any configured file is absent.
    if not Path(MAIN_PATH).exists():
        raise FileNotFoundError(f"Main file not found: {MAIN_PATH}")
    for other in OTHER_PATHS:
        if not Path(other).exists():
            raise FileNotFoundError(f"Other file not found: {other}")

    main_codes = collect_main_codes(MAIN_PATH)
    missing_records = collect_missing_records(OTHER_PATHS, main_codes)

    with open(OUT_PATH, "w", encoding="utf-8") as out_file:
        json.dump(missing_records, out_file, ensure_ascii=False, indent=2)

    print(f"Main codes: {len(main_codes)}")
    print(f"Missing from main: {len(missing_records)}")
    print(f"Wrote results to {OUT_PATH}")
    # Echo the payload so results can be piped without opening the file.
    print(json.dumps(missing_records, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,183 @@
import re
import json
from typing import List, Dict
import fitz # PyMuPDF
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH: str = "MH.pdf"  # path to your PDF
PAGES: List[int] = [2]  # 0-based page indexes to parse, e.g., [2] for the page you showed
OUT_PATH: str = "output.json"  # where to write JSON
FIRST_PRICE_IS_LTE21: bool = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT: bool = False  # set True if you want to print the raw page text for sanity check
# =========================
# --- patterns ---
# A code line is exactly one CDT procedure code, e.g. 'D2160'.
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block we should ignore once prices are done
# FIX: the previous pattern contained an empty alternation branch ('…|—||Age…'),
# which matches the empty string, so .match() succeeded on EVERY line —
# descriptions were truncated immediately and note-skipping could swallow
# price lines. The empty branch is removed (en dash '–' accepted alongside '—').
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
    """Collapse every run of whitespace in *s* to a single space and trim stray punctuation."""
    # Non-breaking spaces come through PDF extraction; treat them as plain spaces.
    collapsed = s.replace("\u00a0", " ")
    for pattern in (r"[ \t]+", r"\s*\n\s*", r"\s{2,}"):
        collapsed = re.sub(pattern, " ", collapsed)
    return collapsed.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
    """Normalize a price token: 'NC' passes through; numbers lose '$' and thousands commas."""
    if token.upper() != "NC":
        return token.replace(",", "").lstrip("$").strip()
    return "NC"
def get_page_lines(pdf_path: str, pages: List[int]) -> List[str]:
    """
    Extract raw text lines from the given 0-based page indexes of *pdf_path*.

    Raises ValueError for any out-of-range index; the document handle is
    always closed, even on error.
    """
    doc = fitz.open(pdf_path)
    try:
        max_idx = len(doc) - 1
        # Validate every requested index up front so we fail before partial work.
        for page_no in pages:
            if not 0 <= page_no <= max_idx:
                raise ValueError(f"Invalid page index {page_no}. Valid range is 0..{max_idx}.")
        collected: List[str] = []
        for page_no in pages:
            text = doc.load_page(page_no).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {page_no} ---\n{text}")
            # Preserve line boundaries; the record parser works line-by-line.
            collected.extend(text.splitlines())
        return collected
    finally:
        doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """
    Assemble one record per procedure code from a flat list of text lines.

    Per record: find a 'Dxxxx' code line, gather description lines until the
    first price/note line, then collect up to two price tokens (order decided
    by FIRST_PRICE_IS_LTE21). Records without two parsable prices are skipped.
    """
    out: List[Dict[str, str]] = []
    i = 0  # cursor into `lines`; advanced manually by every phase below
    n = len(lines)
    while i < n:
        line = lines[i].strip()
        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue
        code = mcode.group(1)
        i += 1
        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1
        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line inside description — consider description ended if the next is a price
                # but we don't advance here; break and let price parsing handle it
                break
            if code_line_re.match(s):
                # next code — no prices found; abandon this broken record
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1
        # advance i to where we left off
        i = j
        description = normalize_ws(" ".join(desc_lines))
        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until the next code/blank
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                # skip this block quickly
                i += 1
                # keep skipping subsequent non-empty, non-code lines until a blank or next code
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                # now let the outer loop proceed
                continue
            # unrecognized line: if prices already found, we can break; else skip
            if prices:
                break
            i += 1
        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue
        # Map the two prices onto the age columns per the configured ordering.
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]
        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )
        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                # next record will pick this up
                break
            i += 1
    return out
def extract_pdf_to_json(pdf_path: str, pages: List[int], out_path: str) -> List[Dict[str, str]]:
    """Parse the given 0-based pages of *pdf_path* into records and dump them to *out_path* as JSON."""
    records = extract_records(get_page_lines(pdf_path, pages))
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_to_json(PDF_PATH, PAGES, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))

View File

@@ -0,0 +1,208 @@
#!/usr/bin/env python3
"""
MassHealth dental PDF parser (PyMuPDF / fitz) — PAGE RANGE VERSION
Parses rows like:
D2160
Amalgam-three surfaces,
primary or permanent
$110
$92
Y
Y
...
Outputs a single JSON with records from the chosen page range (inclusive).
Config:
- PDF_PATH: path to the PDF
- PAGE_START, PAGE_END: 1-based page numbers (inclusive)
- FIRST_PRICE_IS_LTE21: True => first price line is <=21; False => first price is >21
- OUT_PATH: output JSON path
"""
import re
import json
from typing import List, Dict
import fitz # PyMuPDF
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH: str = "MH.pdf"  # path to your PDF
PAGE_START: int = 1  # 1-based inclusive start page (e.g., 1)
PAGE_END: int = 12  # 1-based inclusive end page (e.g., 5)
OUT_PATH: str = "output.json"  # single JSON file containing all parsed rows
FIRST_PRICE_IS_LTE21: bool = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT: bool = False  # set True to print raw text for each page
# =========================
# --- patterns ---
# A code line is exactly one CDT procedure code, e.g. 'D2160'.
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block to ignore once prices are done
# FIX: the previous pattern contained an empty alternation branch ('…|—||Age…'),
# which matches the empty string, so .match() succeeded on EVERY line —
# descriptions were truncated immediately and note-skipping could swallow
# price lines. The empty branch is removed (en dash '–' accepted alongside '—').
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
    """Flatten NBSPs, tabs and newlines in *s* to single spaces and trim edge punctuation."""
    cleaned = s.replace("\u00a0", " ")  # NBSPs leak in from PDF text extraction
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = re.sub(r"\s*\n\s*", " ", cleaned)
    cleaned = re.sub(r"\s{2,}", " ", cleaned)
    return cleaned.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
    """Return a bare numeric string (no '$' or thousands commas); 'NC' passes through unchanged."""
    is_no_charge = token.upper() == "NC"
    return "NC" if is_no_charge else token.replace(",", "").lstrip("$").strip()
def get_page_lines(pdf_path: str, page_start_1b: int, page_end_1b: int) -> List[str]:
    """
    Extract raw text lines from an inclusive 1-based page range of *pdf_path*.

    Args:
        pdf_path: Path to the PDF file to read.
        page_start_1b: First page (1-based, inclusive).
        page_end_1b: Last page (1-based, inclusive).

    Returns:
        All text lines of the selected pages, concatenated in page order.

    Raises:
        ValueError: If the range is non-positive, inverted, or out of bounds.
    """
    if page_start_1b <= 0 or page_end_1b <= 0:
        raise ValueError("PAGE_START and PAGE_END must be >= 1 (1-based).")
    if page_start_1b > page_end_1b:
        raise ValueError("PAGE_START cannot be greater than PAGE_END.")
    doc = fitz.open(pdf_path)
    try:
        last_idx_0b = len(doc) - 1
        # convert to 0-based inclusive range
        start_0b = page_start_1b - 1
        end_0b = page_end_1b - 1
        if start_0b < 0 or end_0b > last_idx_0b:
            raise ValueError(f"Page range out of bounds. Valid 1-based range is 1..{last_idx_0b + 1}.")
        lines: List[str] = []
        for p in range(start_0b, end_0b + 1):
            text = doc.load_page(p).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {p} (0-based; shown as {p+1} 1-based) ---\n{text}")
            # Preserve line boundaries; the record parser works line-by-line.
            lines.extend(text.splitlines())
        return lines
    finally:
        # Always release the document handle, even when validation raises.
        doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """
    Assemble one record per procedure code from a flat list of text lines.

    Per record: find a 'Dxxxx' code line, gather description lines until the
    first price/note line, then collect up to two price tokens (order decided
    by FIRST_PRICE_IS_LTE21). Records without two parsable prices are skipped.
    """
    out: List[Dict[str, str]] = []
    i = 0  # cursor into `lines`; advanced manually by every phase below
    n = len(lines)
    while i < n:
        line = lines[i].strip()
        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue
        code = mcode.group(1)
        i += 1
        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1
        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                break
            if code_line_re.match(s):
                # next code — description ended abruptly (malformed)
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1
        # advance i to where we left off
        i = j
        description = normalize_ws(" ".join(desc_lines))
        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until a blank or next code
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                i += 1
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                continue
            # unrecognized line: if we already captured some prices, break; else skip
            if prices:
                break
            i += 1
        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue
        # Map the two prices onto the age columns per the configured ordering.
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]
        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )
        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                break
            i += 1
    return out
def extract_pdf_range_to_json(pdf_path: str, page_start_1b: int, page_end_1b: int, out_path: str) -> List[Dict[str, str]]:
    """Parse the inclusive 1-based page range of *pdf_path* and dump the records to *out_path* as JSON."""
    records = extract_records(get_page_lines(pdf_path, page_start_1b, page_end_1b))
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_range_to_json(PDF_PATH, PAGE_START, PAGE_END, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))

View File

@@ -0,0 +1,192 @@
[
{
"Procedure Code": "D0120",
"Description": "perio exam",
"Price": "105"
},
{
"Procedure Code": "D0140",
"Description": "limited exam",
"Price": "90"
},
{
"Procedure Code": "D0150",
"Description": "comprehensive exam",
"Price": "120"
},
{
"Procedure Code": "D0210",
"Description": "Fmx.",
"Price": "120"
},
{
"Procedure Code": "D0220",
"Description": "first PA.",
"Price": "60"
},
{
"Procedure Code": "D0230",
"Description": "2nd PA.",
"Price": "50"
},
{
"Procedure Code": "D0272",
"Description": "2 BW",
"Price": "80"
},
{
"Procedure Code": "D0274",
"Description": "4BW",
"Price": "160"
},
{
"Procedure Code": "D0330",
"Description": "pano",
"Price": "150"
},
{
"Procedure Code": "D0364",
"Description": "Less than one jaw",
"Price": "350"
},
{
"Procedure Code": "D0365",
"Description": "Mand",
"Price": "350"
},
{
"Procedure Code": "D0366",
"Description": "Max",
"Price": "350"
},
{
"Procedure Code": "D0367",
"Description": "",
"Price": "400"
},
{
"Procedure Code": "D0368",
"Description": "include TMJ",
"Price": "375"
},
{
"Procedure Code": "D0380",
"Description": "Less than one jaw",
"Price": "300"
},
{
"Procedure Code": "D0381",
"Description": "Mand",
"Price": "300"
},
{
"Procedure Code": "D0382",
"Description": "Max",
"Price": "300"
},
{
"Procedure Code": "D0383",
"Description": "",
"Price": "350"
},
{
"Procedure Code": "D1110",
"Description": "adult prophy",
"Price": "150"
},
{
"Procedure Code": "D1120",
"Description": "child prophy",
"Price": "120"
},
{
"Procedure Code": "D1208",
"Description": "FL",
"Price": "90"
},
{
"Procedure Code": "D1351",
"Description": "sealant",
"Price": "80"
},
{
"Procedure Code": "D1999",
"Description": "",
"Price": "50"
},
{
"Procedure Code": "D2140",
"Description": "amalgam, one surface",
"Price": "150"
},
{
"Procedure Code": "D2150",
"Description": "amalgam, two surface",
"Price": "200"
},
{
"Procedure Code": "D2955",
"Description": "post renoval",
"Price": "350"
},
{
"Procedure Code": "D4910",
"Description": "perio maintains",
"Price": "250"
},
{
"Procedure Code": "D5510",
"Description": "Repair broken complete denture base (QUAD)",
"Price": "400"
},
{
"Procedure Code": "D6056",
"Description": "pre fab abut",
"Price": "750"
},
{
"Procedure Code": "D6057",
"Description": "custom abut",
"Price": "800"
},
{
"Procedure Code": "D6058",
"Description": "porcelain, implant crown, ceramic crown",
"Price": "1400"
},
{
"Procedure Code": "D6059",
"Description": "",
"Price": "1400"
},
{
"Procedure Code": "D6100",
"Description": "",
"Price": "320"
},
{
"Procedure Code": "D6110",
"Description": "implant",
"Price": "1600"
},
{
"Procedure Code": "D6242",
"Description": "noble metal. For united",
"Price": "1400"
},
{
"Procedure Code": "D6245",
"Description": "porcelain, not for united",
"Price": "1400"
},
{
"Procedure Code": "D7910",
"Description": "suture, small wound up to 5 mm",
"Price": "400"
},
{
"Procedure Code": "D7950",
"Description": "max",
"Price": "800"
}
]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,344 @@
[
{
"Procedure Code": "D1999",
"Description": "",
"Price": "50"
},
{
"Procedure Code": "D0120",
"Description": "perio exam",
"Price": "105"
},
{
"Procedure Code": "D0140",
"Description": "limited exam",
"Price": "90"
},
{
"Procedure Code": "D0150",
"Description": "comprehensive exam",
"Price": "120"
},
{
"Procedure Code": "D0210",
"Description": "Fmx.",
"Price": "120"
},
{
"Procedure Code": "D0220",
"Description": "first PA.",
"Price": "60"
},
{
"Procedure Code": "D0230",
"Description": "2nd PA.",
"Price": "50"
},
{
"Procedure Code": "D0330",
"Description": "pano",
"Price": "150"
},
{
"Procedure Code": "D0272",
"Description": "2 BW",
"Price": "80"
},
{
"Procedure Code": "D0274",
"Description": "4BW",
"Price": "160"
},
{
"Procedure Code": "D1110",
"Description": "adult prophy",
"Price": "150"
},
{
"Procedure Code": "D1120",
"Description": "child prophy",
"Price": "120"
},
{
"Procedure Code": "D1351",
"Description": "sealant",
"Price": "80"
},
{
"Procedure Code": "D4341",
"Description": "srp",
"Price": "250"
},
{
"Procedure Code": "D4910",
"Description": "perio maintains",
"Price": "250"
},
{
"Procedure Code": "D1208",
"Description": "FL",
"Price": "90"
},
{
"Procedure Code": "D2330",
"Description": "front composite. 1 s.",
"Price": "180"
},
{
"Procedure Code": "D2331",
"Description": "2s",
"Price": "220"
},
{
"Procedure Code": "D2332",
"Description": "3s",
"Price": "280"
},
{
"Procedure Code": "D2335",
"Description": "4s or more",
"Price": "350"
},
{
"Procedure Code": "D2391",
"Description": "back. 1s",
"Price": "200"
},
{
"Procedure Code": "D2392",
"Description": "2s",
"Price": "250"
},
{
"Procedure Code": "D2393",
"Description": "3s",
"Price": "280"
},
{
"Procedure Code": "D2394",
"Description": "4s",
"Price": "320"
},
{
"Procedure Code": "D2140",
"Description": "amalgam, one surface",
"Price": "150"
},
{
"Procedure Code": "D2150",
"Description": "amalgam, two surface",
"Price": "200"
},
{
"Procedure Code": "D2750",
"Description": "high noble",
"Price": "1300"
},
{
"Procedure Code": "D2751",
"Description": "base metal",
"Price": "1200"
},
{
"Procedure Code": "D2740",
"Description": "crown porcelain",
"Price": "1300"
},
{
"Procedure Code": "D2954",
"Description": "p/c",
"Price": "450"
},
{
"Procedure Code": "D7910",
"Description": "suture, small wound up to 5 mm",
"Price": "400"
},
{
"Procedure Code": "D5110",
"Description": "FU",
"Price": "1200",
"Full Price": "1700"
},
{
"Procedure Code": "D5120",
"Description": "FL",
"Price": "1700",
"Full Price": "1700"
},
{
"Procedure Code": "D5211",
"Description": "pu",
"Price": "1300"
},
{
"Procedure Code": "D5212",
"Description": "pl",
"Price": "1300"
},
{
"Procedure Code": "D5213",
"Description": "cast pu.",
"Price": "1700"
},
{
"Procedure Code": "D5214",
"Description": "cast pl",
"Price": "1700"
},
{
"Procedure Code": "D5510",
"Description": "Repair broken complete denture base (QUAD)",
"Price": "400"
},
{
"Procedure Code": "D5520",
"Description": "Replace missing or broken teeth - complete denture (each tooth) (TOOTH)",
"Price": "200"
},
{
"Procedure Code": "D5750",
"Description": "lab reline",
"Price": "600"
},
{
"Procedure Code": "D5730",
"Description": "chairside reline",
"Price": "500"
},
{
"Procedure Code": "D2920",
"Description": "re cement crown",
"Price": "120"
},
{
"Procedure Code": "D2950",
"Description": "core buildup",
"Price": "350"
},
{
"Procedure Code": "D2955",
"Description": "post renoval",
"Price": "350"
},
{
"Procedure Code": "D6100",
"Description": "",
"Price": "320"
},
{
"Procedure Code": "D6110",
"Description": "implant",
"Price": "1600"
},
{
"Procedure Code": "D6056",
"Description": "pre fab abut",
"Price": "750"
},
{
"Procedure Code": "D6057",
"Description": "custom abut",
"Price": "800"
},
{
"Procedure Code": "D6058",
"Description": "porcelain, implant crown, ceramic crown",
"Price": "1400"
},
{
"Procedure Code": "D6059",
"Description": "",
"Price": "1400"
},
{
"Procedure Code": "D6242",
"Description": "noble metal. For united",
"Price": "1400"
},
{
"Procedure Code": "D6245",
"Description": "porcelain, not for united",
"Price": "1400"
},
{
"Procedure Code": "D0367",
"Description": "",
"Price": "400"
},
{
"Procedure Code": "D0364",
"Description": "Less than one jaw",
"Price": "350"
},
{
"Procedure Code": "D0365",
"Description": "Mand",
"Price": "350"
},
{
"Procedure Code": "D0366",
"Description": "Max",
"Price": "350"
},
{
"Procedure Code": "D0368",
"Description": "include TMJ",
"Price": "375"
},
{
"Procedure Code": "D0383",
"Description": "",
"Price": "350"
},
{
"Procedure Code": "D0380",
"Description": "Less than one jaw",
"Price": "300"
},
{
"Procedure Code": "D0381",
"Description": "Mand",
"Price": "300"
},
{
"Procedure Code": "D0382",
"Description": "Max",
"Price": "300"
},
{
"Procedure Code": "D7950",
"Description": "max",
"Price": "800"
},
{
"Procedure Code": "D7140",
"Description": "simple ext",
"Price": "150"
},
{
"Procedure Code": "D7210",
"Description": "surgical ext",
"Price": "280"
},
{
"Procedure Code": "D7220",
"Description": "soft impacted",
"Price": "380"
},
{
"Procedure Code": "D7230",
"Description": "partial bony",
"Price": "450"
},
{
"Procedure Code": "D7240",
"Description": "fully bony",
"Price": "550"
},
{
"Procedure Code": "D3320",
"Description": "pre M RCT",
"Price": "1050"
}
]