initial commit
This commit is contained in:
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Executable file
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Executable file
@@ -0,0 +1,183 @@
|
||||
import re
|
||||
import json
|
||||
from typing import List, Dict
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH = "MH.pdf"  # path to your PDF
PAGES = [2]  # 0-based page indexes to parse, e.g., [2] for the page you showed
OUT_PATH = "output.json"  # where to write JSON
FIRST_PRICE_IS_LTE21 = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT = False  # set True if you want to print the raw page text for sanity check
# =========================


# --- patterns ---
# a procedure code is the letter 'D' followed by exactly four digits, alone on its line
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block we should ignore once prices are done
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
|
||||
def normalize_ws(s: str) -> str:
    """Collapse all whitespace runs in *s* to single spaces and trim edge punctuation.

    Non-breaking spaces are treated as ordinary spaces; embedded newlines
    are folded into the surrounding text.
    """
    cleaned = s.replace("\u00a0", " ")
    # Apply the collapsing passes in the same order as before: horizontal
    # whitespace, then newlines (with surrounding whitespace), then any
    # remaining multi-space runs.
    for pattern in (r"[ \t]+", r"\s*\n\s*", r"\s{2,}"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip(" ,.;:-•·\n\t")
|
||||
def clean_money(token: str) -> str:
    """Normalize a raw price token.

    'NC' (any case) is canonicalized to 'NC'; any other token has thousands
    commas removed, a leading '$' stripped, and surrounding whitespace trimmed.
    """
    if token.casefold() == "nc":
        return "NC"
    without_commas = token.replace(",", "")
    return without_commas.lstrip("$").strip()
|
||||
def get_page_lines(pdf_path: str, pages: List[int]) -> List[str]:
    """Return the text lines of the requested 0-based pages of *pdf_path*, in order.

    Raises ValueError if any requested page index is outside the document.
    """
    doc = fitz.open(pdf_path)
    try:
        last_idx = len(doc) - 1
        # Validate every requested index up front, failing on the first bad one.
        for p in pages:
            if not 0 <= p <= last_idx:
                raise ValueError(f"Invalid page index {p}. Valid range is 0..{last_idx}.")
        collected: List[str] = []
        for p in pages:
            page_text = doc.load_page(p).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {p} ---\n{page_text}")
            # keep line boundaries; later we parse line-by-line
            collected.extend(page_text.splitlines())
        return collected
    finally:
        doc.close()
|
||||
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """Parse raw page lines into procedure-code records.

    Scans *lines* as a simple state machine: find a code line (Dxxxx), gather
    the following description lines, then collect exactly two price tokens.
    Records without two reliable prices are skipped. The mapping of the two
    prices to the <=21 / >21 columns is controlled by FIRST_PRICE_IS_LTE21.

    Returns a list of dicts with keys: "Procedure Code", "Description",
    "PriceLTEQ21", "PriceGT21".
    """
    out: List[Dict[str, str]] = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue

        code = mcode.group(1)
        i += 1

        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line inside description — consider description ended if the next is a price
                # but we don't advance here; break and let price parsing handle it
                break
            if code_line_re.match(s):
                # next code — no prices found; abandon this broken record
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1

        # advance i to where we left off
        i = j

        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until the next code/blank
            # (NOTE: a bare "NC" line is matched by price_line_re above first,
            # so only "Y" realistically triggers via the set membership here)
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                # skip this block quickly
                i += 1
                # keep skipping subsequent non-empty, non-code lines until a blank or next code
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                # now let the outer loop proceed
                continue
            # unrecognized line: if prices already found, we can break; else skip
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                # next record will pick this up
                break
            i += 1

    return out
|
||||
def extract_pdf_to_json(pdf_path: str, pages: List[int], out_path: str) -> List[Dict[str, str]]:
    """Extract records from the given pages of *pdf_path*, write them as JSON to *out_path*, and return them."""
    records = extract_records(get_page_lines(pdf_path, pages))
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, ensure_ascii=False, indent=2)
    return records
|
||||
if __name__ == "__main__":
    # Run the extraction with the module-level config and echo the result.
    rows = extract_pdf_to_json(PDF_PATH, PAGES, OUT_PATH)
    print(f"Wrote {len(rows)} rows to {OUT_PATH}")
    print(json.dumps(rows, ensure_ascii=False, indent=2))
|
||||
Reference in New Issue
Block a user