initial commit
This commit is contained in:
BIN
apps/ProcedureCodeFromMhPdf/MH.pdf
Executable file
BIN
apps/ProcedureCodeFromMhPdf/MH.pdf
Executable file
Binary file not shown.
BIN
apps/ProcedureCodeFromMhPdf/MHv2.pdf
Executable file
BIN
apps/ProcedureCodeFromMhPdf/MHv2.pdf
Executable file
Binary file not shown.
5
apps/ProcedureCodeFromMhPdf/Readme.md
Executable file
5
apps/ProcedureCodeFromMhPdf/Readme.md
Executable file
@@ -0,0 +1,5 @@
|
||||
This code was written only for extracting procedure-code data from the MassHealth PDF, to make that process easier.

It was a one-time process and is not used as core functionality anywhere in this app.

It is kept here in case the data needs to be extracted again in the future.
|
||||
96
apps/ProcedureCodeFromMhPdf/compareJson.py
Executable file
96
apps/ProcedureCodeFromMhPdf/compareJson.py
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
"""
Diff a main dental-procedure JSON file against one or more other JSON files.

Every record in the other files whose 'Procedure Code' does not appear in
the main file is collected and written to OUT_PATH.

Matching details:
- Key is 'Procedure Code', compared case-insensitively with whitespace removed.
- The full record from the other file is kept (extra fields such as
  'Full Price' survive).
- Results are deduplicated by procedure code.

Edit MAIN_PATH, OTHER_PATHS, and OUT_PATH in the CONFIG block below.
"""

import json
from pathlib import Path
from typing import List, Dict, Any

# =========================
# CONFIG — EDIT THESE ONLY
# =========================
MAIN_PATH = "procedureCodes_v2.json"  # the main JSON (with PriceLTEQ21/PriceGT21)
OTHER_PATHS = [
    # "procedureCodesOld.json",  # one or more other JSON files to compare against the main
    "output.json",
]
OUT_PATH = "not_in_main.json"  # where to write the results
# =========================
def _load_json_any(path: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Load JSON. Accept:
|
||||
- a list of objects
|
||||
- a single object (wraps into a list)
|
||||
"""
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
if isinstance(data, dict):
|
||||
return [data]
|
||||
if isinstance(data, list):
|
||||
# filter out non-dict items defensively
|
||||
return [x for x in data if isinstance(x, dict)]
|
||||
raise ValueError(f"Unsupported JSON top-level type in {path}: {type(data)}")
|
||||
|
||||
|
||||
def _norm_code(record: Dict[str, Any]) -> str:
|
||||
# Normalize the 'Procedure Code' for matching
|
||||
code = str(record.get("Procedure Code", "")).strip().upper()
|
||||
# Some PDFs might have stray spaces, tabs, or zero-width chars
|
||||
code = "".join(ch for ch in code if not ch.isspace())
|
||||
return code
|
||||
|
||||
|
||||
def collect_main_codes(main_path: str) -> set:
    """Load the main file and return the set of normalized, non-empty procedure codes.

    Propagates any error raised by _load_json_any for unreadable or
    invalid input.
    """
    codes: set = set()
    for rec in _load_json_any(main_path):
        # normalize once per record (the old comprehension called _norm_code twice)
        code = _norm_code(rec)
        if code:
            codes.add(code)
    return codes
def collect_missing_records(other_paths: List[str], main_codes: set) -> List[Dict[str, Any]]:
    """Gather records from *other_paths* whose normalized code is absent from *main_codes*.

    The first record seen for a given code wins (deduplication across all
    other files); results come back sorted by normalized code.
    """
    found: Dict[str, Dict[str, Any]] = {}  # normalized code -> full original record
    for path in other_paths:
        for record in _load_json_any(path):
            key = _norm_code(record)
            if not key or key in main_codes or key in found:
                continue
            found[key] = record  # keep the complete record, extra fields included
    # stable, code-sorted output
    return [record for _, record in sorted(found.items())]
def main():
    """Validate configured paths, compute the diff, write OUT_PATH, and print a summary."""
    if not Path(MAIN_PATH).exists():
        raise FileNotFoundError(f"Main file not found: {MAIN_PATH}")
    for other in OTHER_PATHS:
        if not Path(other).exists():
            raise FileNotFoundError(f"Other file not found: {other}")

    main_codes = collect_main_codes(MAIN_PATH)
    results = collect_missing_records(OTHER_PATHS, main_codes)

    with open(OUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=2)

    print(f"Main codes: {len(main_codes)}")
    print(f"Missing from main: {len(results)}")
    print(f"Wrote results to {OUT_PATH}")
    # echo the full result set to stdout as well
    print(json.dumps(results, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
241
apps/ProcedureCodeFromMhPdf/compareJson_matchingPrice.py
Executable file
241
apps/ProcedureCodeFromMhPdf/compareJson_matchingPrice.py
Executable file
@@ -0,0 +1,241 @@
|
||||
#!/usr/bin/env python3
"""
Config-driven price comparison between two JSON files (file1 vs file2).

How it works:
- Both files are loaded as JSON arrays of records and indexed by procedure
  code (common key spellings such as "Procedure Code" and "Code" are tried).
- Money tokens are normalized: '$' and commas are removed and "NC" is kept
  as a literal marker.
- Three fields are compared: Price, PriceLTEQ21, and PriceGT21.

Matching rules:
- Same-named fields present on both sides are compared directly.
- When file1 carries only a single "Price" and file2 carries
  PriceLTEQ21/PriceGT21, file1.Price is checked against BOTH special
  fields; a difference with either is reported.
- "NC" equals only "NC"; numeric tokens are compared within TOLERANCE.

Output (written to OUT_PATH):
- mismatches: per-code details for differing prices
- only_in_file1 / only_in_file2: codes present on one side only
- summary: counts

Edit the CONFIG block below, then run the script.
"""

import json
import re
from typing import List, Dict, Any, Optional

# =========================
# CONFIG — EDIT THESE ONLY
# =========================
FILE1_PATH = "procedureCodes_v2.json"  # path to file 1 (your base/reference file)
FILE2_PATH = "output.json"             # path to file 2 (the file to compare)
OUT_PATH = "price_diffs.json"          # output JSON listing mismatches
TOLERANCE = 0.005                      # numeric tolerance for floats
CODE_KEY_CANDIDATES = ("Procedure Code", "Code", "procedure_code", "procedure code")
# If True: when file1 has a single "Price" and file2 has LTEQ/GT values,
# compare file1.Price against both fields and flag a mismatch if either differs.
COMPARE_SINGLE_PRICE_AGAINST_BOTH = True
# =========================
# Accepts "NC" or a dollar amount with optional '$', thousands commas, and cents.
_money_re = re.compile(r"^\s*(NC|\$?\s*[\d,]+(?:\.\d{1,2})?)\s*$", re.IGNORECASE)


def normalize_money_token(token: Optional[str]) -> Optional[str]:
    """Canonicalize a money token.

    Returns None for missing/blank input, "NC" for the no-charge marker,
    a bare decimal string (no '$', commas, or trailing zeros) for amounts,
    and the trimmed token unchanged when the format is unrecognized (so the
    odd value still shows up in a diff).
    """
    if token is None:
        return None
    text = str(token).strip()
    if not text:
        return None
    matched = _money_re.match(text)
    if matched is None:
        # unknown shape — surface it verbatim so the mismatch stays visible
        return text
    value = matched.group(1)
    if value.upper() == "NC":
        return "NC"
    value = value.replace("$", "").replace(",", "").strip()
    if "." in value:
        # drop trailing zeros, then any dangling decimal point
        value = value.rstrip("0").rstrip(".")
    return value
def numeric_compare(a: Optional[str], b: Optional[str], tol: float = TOLERANCE) -> bool:
    """Return True when two normalized price tokens denote the same value.

    Rules:
    - a missing side never matches;
    - "NC" matches only "NC" (case-insensitively);
    - numeric strings match when within *tol* of each other;
    - non-numeric strings fall back to exact equality.
    """
    if a is None or b is None:
        return False
    if a == b:
        return True
    if a.upper() == "NC" or b.upper() == "NC":
        return a.upper() == b.upper()
    try:
        return abs(float(a) - float(b)) <= tol
    except (TypeError, ValueError):
        # narrow catch: only conversion failures mean "not numeric"
        # (the old bare `except Exception` could hide unrelated bugs)
        return a == b
def load_json(path: str) -> List[Dict[str, Any]]:
    """Read *path* and return its JSON payload, which must be a top-level array."""
    with open(path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)
    if isinstance(payload, list):
        return payload
    raise ValueError(f"Expected JSON array in {path}")
def build_index(records: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
    """Map procedure code -> record; on duplicate codes the FIRST record wins.

    The code comes from the first populated key in CODE_KEY_CANDIDATES;
    failing that, any string field shaped like 'D1234' is accepted.
    Records with no detectable code are skipped entirely.
    """
    index: Dict[str, Dict[str, Any]] = {}
    for record in records:
        code = None
        for key in CODE_KEY_CANDIDATES:
            value = record.get(key)
            if value:
                code = str(value).strip()
                break
        if not code:
            # fall back: scan values for something that looks like a Dxxxx code
            for value in record.values():
                if isinstance(value, str) and re.match(r"^\s*D\d{4}\s*$", value):
                    code = value.strip()
                    break
        if code and code not in index:
            index[code] = record
    return index
def extract_price_fields(rec: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Normalize the three known price fields of *rec*.

    All three keys are always present in the result; a field missing from
    the record maps to None.
    """
    return {
        key: normalize_money_token(rec.get(key))
        for key in ("Price", "PriceLTEQ21", "PriceGT21")
    }
def compare_code_records(code: str, rec1: Dict[str, Any], rec2: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Compare the price fields of one procedure code across the two files.

    Returns None when every comparable field agrees; otherwise returns a
    report dict carrying both descriptions, both normalized price sets,
    and a 'mismatches' list with one entry per differing field.
    Relies on module-level numeric_compare, extract_price_fields, and the
    COMPARE_SINGLE_PRICE_AGAINST_BOTH flag.
    """
    p1 = extract_price_fields(rec1)
    p2 = extract_price_fields(rec2)

    mismatches = []

    # 1) Compare same-named fields if both present
    for key in ("Price", "PriceLTEQ21", "PriceGT21"):
        a = p1.get(key)
        b = p2.get(key)
        if a is None and b is None:
            # neither side has this field: nothing to compare
            continue
        if a is None or b is None:
            # present in one but not the other: count as mismatch
            mismatches.append({"field": key, "file1": a, "file2": b, "reason": "missing_in_one"})
            continue
        if not numeric_compare(a, b):
            mismatches.append({"field": key, "file1": a, "file2": b, "reason": "value_mismatch"})

    # 2) Special-case: if file1 has only single Price, and file2 has LTEQ/GT present,
    #    optionally compare file1.Price against each of them.
    if COMPARE_SINGLE_PRICE_AGAINST_BOTH:
        # Only apply if file1.Price exists and file1 does NOT have LTEQ/GT (both None),
        # but file2 has at least one of LTEQ/GT.
        file1_has_price = p1.get("Price") is not None
        file1_has_any_special = (p1.get("PriceLTEQ21") is not None) or (p1.get("PriceGT21") is not None)
        file2_has_any_special = (p2.get("PriceLTEQ21") is not None) or (p2.get("PriceGT21") is not None)
        if file1_has_price and (not file1_has_any_special) and file2_has_any_special:
            # compare file1.Price to each present file2 special price
            left = p1.get("Price")
            for special_key in ("PriceLTEQ21", "PriceGT21"):
                right = p2.get(special_key)
                if right is None:
                    continue
                # Step 1 already logged a "missing_in_one" for this special_key
                # (file1 lacks it); this extra check still compares the single
                # Price against the special value, which is the point of the flag.
                if not numeric_compare(left, right):
                    mismatches.append({
                        "field": f"Price_vs_{special_key}",
                        "file1": left,
                        "file2": right,
                        "reason": "single_price_vs_special_mismatch"
                    })

    if mismatches:
        return {
            "Procedure Code": code,
            "Description_file1": rec1.get("Description"),
            "Description_file2": rec2.get("Description"),
            "file1_prices": p1,
            "file2_prices": p2,
            "mismatches": mismatches
        }
    return None
def main():
    """Load both configured files, diff every code, write OUT_PATH, print a summary."""
    idx1 = build_index(load_json(FILE1_PATH))
    idx2 = build_index(load_json(FILE2_PATH))

    # union of every code seen on either side, in sorted order
    codes_all = sorted(set(idx1) | set(idx2))

    mismatched: List[Dict[str, Any]] = []
    only_in_file1: List[str] = []
    only_in_file2: List[str] = []

    for code in codes_all:
        rec1 = idx1.get(code)
        rec2 = idx2.get(code)
        if rec1 is None:
            only_in_file2.append(code)
        elif rec2 is None:
            only_in_file1.append(code)
        else:
            diff = compare_code_records(code, rec1, rec2)
            if diff:
                mismatched.append(diff)

    out = {
        "summary": {
            "total_codes_found": len(codes_all),
            "only_in_file1_count": len(only_in_file1),
            "only_in_file2_count": len(only_in_file2),
            "mismatched_count": len(mismatched),
        },
        "only_in_file1": only_in_file1,
        "only_in_file2": only_in_file2,
        "mismatches": mismatched
    }

    with open(OUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, indent=2)

    # brief console summary
    print(f"Compared {len(codes_all)} procedure codes.")
    print(f"Only in {FILE1_PATH}: {len(only_in_file1)} codes.")
    print(f"Only in {FILE2_PATH}: {len(only_in_file2)} codes.")
    print(f"Mismatched prices: {len(mismatched)} codes.")
    print(f"Wrote detailed diffs to {OUT_PATH}")


if __name__ == "__main__":
    main()
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Executable file
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Executable file
@@ -0,0 +1,183 @@
|
||||
import re
|
||||
import json
|
||||
from typing import List, Dict
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH = "MH.pdf"          # path to the source PDF
PAGES = [2]                  # 0-based page indexes to parse, e.g., [2]
OUT_PATH = "output.json"     # where to write the JSON
FIRST_PRICE_IS_LTE21 = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT = False      # True => print each page's raw text for a sanity check
# =========================


# --- line-classification patterns ---
# a standalone procedure code, e.g. "D2160"
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token: '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block we should ignore once prices are done
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
    """Collapse all whitespace runs in *s* to single spaces and trim stray punctuation."""
    text = s.replace("\u00a0", " ")        # NBSP -> plain space
    text = re.sub(r"[ \t]+", " ", text)    # collapse horizontal runs
    text = re.sub(r"\s*\n\s*", " ", text)  # fold line breaks into spaces
    text = re.sub(r"\s{2,}", " ", text)    # squeeze anything left over
    return text.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
    """Return the bare amount for a matched price token ('NC' stays 'NC')."""
    if token.upper() == "NC":
        return "NC"
    # drop thousands separators, then a leading dollar sign
    bare = token.replace(",", "")
    return bare.lstrip("$").strip()
def get_page_lines(pdf_path: str, pages: List[int]) -> List[str]:
    """Extract raw text lines from the given 0-based *pages* of *pdf_path*.

    Raises ValueError when any requested index is outside the document.
    The PDF is always closed, even on error.
    """
    doc = fitz.open(pdf_path)
    try:
        max_idx = len(doc) - 1
        # validate every requested page before reading any of them
        bad = [p for p in pages if p < 0 or p > max_idx]
        if bad:
            raise ValueError(f"Invalid page index {bad[0]}. Valid range is 0..{max_idx}.")
        collected: List[str] = []
        for p in pages:
            text = doc.load_page(p).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {p} ---\n{text}")
            # keep line boundaries; parsing downstream is line-oriented
            collected.extend(text.splitlines())
        return collected
    finally:
        doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """
    Walk *lines* and assemble one record per procedure code.

    A record is emitted only when a code line (Dxxxx) is followed by a
    description (zero or more lines) and exactly two recognizable price
    tokens; the FIRST_PRICE_IS_LTE21 module flag decides which token maps
    to PriceLTEQ21 vs PriceGT21. Codes with fewer than two prices are
    silently dropped. Depends on the module-level regexes
    (code_line_re, price_line_re, note_starters_re), normalize_ws,
    clean_money, and FIRST_PRICE_IS_LTE21.
    """
    out: List[Dict[str, str]] = []
    i = 0  # cursor into lines; advanced manually by every phase below
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue

        code = mcode.group(1)
        i += 1

        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line inside description — consider description ended if the next is a price
                # but we don't advance here; break and let price parsing handle it
                break
            if code_line_re.match(s):
                # next code — no prices found; abandon this broken record
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1

        # advance i to where we left off
        i = j

        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until the next code/blank
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                # skip this block quickly
                i += 1
                # keep skipping subsequent non-empty, non-code lines until a blank or next code
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                # now let the outer loop proceed
                continue
            # unrecognized line: if prices already found, we can break; else skip
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        # map the two tokens onto the <=21 / >21 columns per the config flag
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                # next record will pick this up
                break
            i += 1

    return out
||||
def extract_pdf_to_json(pdf_path: str, pages: List[int], out_path: str) -> List[Dict[str, str]]:
    """Parse *pages* of *pdf_path* into records, write them to *out_path*, and return them."""
    records = extract_records(get_page_lines(pdf_path, pages))
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_to_json(PDF_PATH, PAGES, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))
208
apps/ProcedureCodeFromMhPdf/extract_byrange.py
Executable file
208
apps/ProcedureCodeFromMhPdf/extract_byrange.py
Executable file
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MassHealth dental PDF parser (PyMuPDF / fitz) — PAGE RANGE VERSION
|
||||
|
||||
Parses rows like:
|
||||
|
||||
D2160
|
||||
Amalgam-three surfaces,
|
||||
primary or permanent
|
||||
$110
|
||||
$92
|
||||
Y
|
||||
Y
|
||||
...
|
||||
|
||||
Outputs a single JSON with records from the chosen page range (inclusive).
|
||||
|
||||
Config:
|
||||
- PDF_PATH: path to the PDF
|
||||
- PAGE_START, PAGE_END: 1-based page numbers (inclusive)
|
||||
- FIRST_PRICE_IS_LTE21: True => first price line is <=21; False => first price is >21
|
||||
- OUT_PATH: output JSON path
|
||||
"""
|
||||
|
||||
import re
|
||||
import json
|
||||
from typing import List, Dict
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH = "MHv2.pdf"        # path to the source PDF
PAGE_START = 1               # 1-based inclusive start page (e.g., 1)
PAGE_END = 15                # 1-based inclusive end page (e.g., 5)
OUT_PATH = "output.json"     # single JSON file containing all parsed rows
FIRST_PRICE_IS_LTE21 = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT = False      # True => print each page's raw text
# =========================


# --- line-classification patterns ---
# a standalone procedure code, e.g. "D2160"
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token: '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block to ignore once prices are done
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
    """Collapse all whitespace runs in *s* to single spaces and trim stray punctuation."""
    text = s.replace("\u00a0", " ")        # NBSP -> plain space
    text = re.sub(r"[ \t]+", " ", text)    # collapse horizontal runs
    text = re.sub(r"\s*\n\s*", " ", text)  # fold line breaks into spaces
    text = re.sub(r"\s{2,}", " ", text)    # squeeze anything left over
    return text.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
    """Return the bare amount for a matched price token ('NC' stays 'NC')."""
    if token.upper() == "NC":
        return "NC"
    # drop thousands separators, then a leading dollar sign
    bare = token.replace(",", "")
    return bare.lstrip("$").strip()
def get_page_lines(pdf_path: str, page_start_1b: int, page_end_1b: int) -> List[str]:
    """Extract raw text lines from the inclusive 1-based page range of *pdf_path*.

    Raises ValueError for non-positive page numbers, an inverted range, or
    a range that falls outside the document. The PDF is always closed.
    """
    if page_start_1b <= 0 or page_end_1b <= 0:
        raise ValueError("PAGE_START and PAGE_END must be >= 1 (1-based).")
    if page_start_1b > page_end_1b:
        raise ValueError("PAGE_START cannot be greater than PAGE_END.")

    doc = fitz.open(pdf_path)
    try:
        last_idx_0b = len(doc) - 1
        # convert to a 0-based inclusive range
        start_0b, end_0b = page_start_1b - 1, page_end_1b - 1
        if start_0b < 0 or end_0b > last_idx_0b:
            raise ValueError(f"Page range out of bounds. Valid 1-based range is 1..{last_idx_0b + 1}.")
        collected: List[str] = []
        for page_idx in range(start_0b, end_0b + 1):
            text = doc.load_page(page_idx).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {page_idx} (0-based; shown as {page_idx+1} 1-based) ---\n{text}")
            collected.extend(text.splitlines())
        return collected
    finally:
        doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """
    Walk *lines* and assemble one record per procedure code.

    A record is emitted only when a code line (Dxxxx) is followed by a
    description (zero or more lines) and exactly two recognizable price
    tokens; the FIRST_PRICE_IS_LTE21 module flag decides which token maps
    to PriceLTEQ21 vs PriceGT21. Codes with fewer than two prices are
    silently dropped. Depends on the module-level regexes
    (code_line_re, price_line_re, note_starters_re), normalize_ws,
    clean_money, and FIRST_PRICE_IS_LTE21.
    """
    out: List[Dict[str, str]] = []
    i = 0  # cursor into lines; advanced manually by every phase below
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue

        code = mcode.group(1)
        i += 1

        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line — description ended; price parsing takes over from here
                break
            if code_line_re.match(s):
                # next code — description ended abruptly (malformed)
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1

        # advance i to where we left off
        i = j

        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until a blank or next code
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                i += 1
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                continue
            # unrecognized line: if we already captured some prices, break; else skip
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        # map the two tokens onto the <=21 / >21 columns per the config flag
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                break
            i += 1

    return out
|
||||
|
||||
def extract_pdf_range_to_json(pdf_path: str, page_start_1b: int, page_end_1b: int, out_path: str) -> List[Dict[str, str]]:
    """Parse the 1-based page range of *pdf_path* into records, write them to *out_path*, and return them."""
    records = extract_records(get_page_lines(pdf_path, page_start_1b, page_end_1b))
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_range_to_json(PDF_PATH, PAGE_START, PAGE_END, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))
1004
apps/ProcedureCodeFromMhPdf/output.json
Executable file
1004
apps/ProcedureCodeFromMhPdf/output.json
Executable file
File diff suppressed because it is too large
Load Diff
344
apps/ProcedureCodeFromMhPdf/procedureCodes_v0.json
Executable file
344
apps/ProcedureCodeFromMhPdf/procedureCodes_v0.json
Executable file
@@ -0,0 +1,344 @@
|
||||
[
|
||||
{
|
||||
"Procedure Code": "D1999",
|
||||
"Description": "",
|
||||
"Price": "50"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0120",
|
||||
"Description": "perio exam",
|
||||
"Price": "105"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0140",
|
||||
"Description": "limited exam",
|
||||
"Price": "90"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0150",
|
||||
"Description": "comprehensive exam",
|
||||
"Price": "120"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0210",
|
||||
"Description": "Fmx.",
|
||||
"Price": "120"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0220",
|
||||
"Description": "first PA.",
|
||||
"Price": "60"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0230",
|
||||
"Description": "2nd PA.",
|
||||
"Price": "50"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0330",
|
||||
"Description": "pano",
|
||||
"Price": "150"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0272",
|
||||
"Description": "2 BW",
|
||||
"Price": "80"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0274",
|
||||
"Description": "4BW",
|
||||
"Price": "160"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D1110",
|
||||
"Description": "adult prophy",
|
||||
"Price": "150"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D1120",
|
||||
"Description": "child prophy",
|
||||
"Price": "120"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D1351",
|
||||
"Description": "sealant",
|
||||
"Price": "80"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D4341",
|
||||
"Description": "srp",
|
||||
"Price": "250"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D4910",
|
||||
"Description": "perio maintains",
|
||||
"Price": "250"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D1208",
|
||||
"Description": "FL",
|
||||
"Price": "90"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2330",
|
||||
"Description": "front composite. 1 s.",
|
||||
"Price": "180"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2331",
|
||||
"Description": "2s",
|
||||
"Price": "220"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2332",
|
||||
"Description": "3s",
|
||||
"Price": "280"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2335",
|
||||
"Description": "4s or more",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2391",
|
||||
"Description": "back. 1s",
|
||||
"Price": "200"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2392",
|
||||
"Description": "2s",
|
||||
"Price": "250"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2393",
|
||||
"Description": "3s",
|
||||
"Price": "280"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2394",
|
||||
"Description": "4s",
|
||||
"Price": "320"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2140",
|
||||
"Description": "amalgam, one surface",
|
||||
"Price": "150"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2150",
|
||||
"Description": "amalgam, two surface",
|
||||
"Price": "200"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2750",
|
||||
"Description": "high noble",
|
||||
"Price": "1300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2751",
|
||||
"Description": "base metal",
|
||||
"Price": "1200"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2740",
|
||||
"Description": "crown porcelain",
|
||||
"Price": "1300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2954",
|
||||
"Description": "p/c",
|
||||
"Price": "450"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7910",
|
||||
"Description": "suture, small wound up to 5 mm",
|
||||
"Price": "400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5110",
|
||||
"Description": "FU",
|
||||
"Price": "1200",
|
||||
"Full Price": "1700"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5120",
|
||||
"Description": "FL",
|
||||
"Price": "1700",
|
||||
"Full Price": "1700"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5211",
|
||||
"Description": "pu",
|
||||
"Price": "1300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5212",
|
||||
"Description": "pl",
|
||||
"Price": "1300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5213",
|
||||
"Description": "cast pu.",
|
||||
"Price": "1700"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5214",
|
||||
"Description": "cast pl",
|
||||
"Price": "1700"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5510",
|
||||
"Description": "Repair broken complete denture base (QUAD)",
|
||||
"Price": "400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5520",
|
||||
"Description": "Replace missing or broken teeth - complete denture (each tooth) (TOOTH)",
|
||||
"Price": "200"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5750",
|
||||
"Description": "lab reline",
|
||||
"Price": "600"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5730",
|
||||
"Description": "chairside reline",
|
||||
"Price": "500"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2920",
|
||||
"Description": "re cement crown",
|
||||
"Price": "120"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2950",
|
||||
"Description": "core buildup",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2955",
|
||||
"Description": "post renoval",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6100",
|
||||
"Description": "",
|
||||
"Price": "320"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6110",
|
||||
"Description": "implant",
|
||||
"Price": "1600"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6056",
|
||||
"Description": "pre fab abut",
|
||||
"Price": "750"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6057",
|
||||
"Description": "custom abut",
|
||||
"Price": "800"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6058",
|
||||
"Description": "porcelain, implant crown, ceramic crown",
|
||||
"Price": "1400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6059",
|
||||
"Description": "",
|
||||
"Price": "1400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6242",
|
||||
"Description": "noble metal. For united",
|
||||
"Price": "1400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6245",
|
||||
"Description": "porcelain, not for united",
|
||||
"Price": "1400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0367",
|
||||
"Description": "",
|
||||
"Price": "400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0364",
|
||||
"Description": "Less than one jaw",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0365",
|
||||
"Description": "Mand",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0366",
|
||||
"Description": "Max",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0368",
|
||||
"Description": "include TMJ",
|
||||
"Price": "375"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0383",
|
||||
"Description": "",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0380",
|
||||
"Description": "Less than one jaw",
|
||||
"Price": "300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0381",
|
||||
"Description": "Mand",
|
||||
"Price": "300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0382",
|
||||
"Description": "Max",
|
||||
"Price": "300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7950",
|
||||
"Description": "max",
|
||||
"Price": "800"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7140",
|
||||
"Description": "simple ext",
|
||||
"Price": "150"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7210",
|
||||
"Description": "surgical ext",
|
||||
"Price": "280"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7220",
|
||||
"Description": "soft impacted",
|
||||
"Price": "380"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7230",
|
||||
"Description": "partial bony",
|
||||
"Price": "450"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7240",
|
||||
"Description": "fully bony",
|
||||
"Price": "550"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D3320",
|
||||
"Description": "pre M RCT",
|
||||
"Price": "1050"
|
||||
}
|
||||
]
|
||||
1026
apps/ProcedureCodeFromMhPdf/procedureCodes_v1.json
Executable file
1026
apps/ProcedureCodeFromMhPdf/procedureCodes_v1.json
Executable file
File diff suppressed because it is too large
Load Diff
1192
apps/ProcedureCodeFromMhPdf/procedureCodes_v2.json
Executable file
1192
apps/ProcedureCodeFromMhPdf/procedureCodes_v2.json
Executable file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user