#!/usr/bin/env python3
"""
MassHealth dental PDF parser (PyMuPDF / fitz) — PAGE RANGE VERSION

Parses rows like:
    D2160 Amalgam-three surfaces, primary or permanent $110 $92 Y Y ...

In the text extracted from the PDF each of these pieces is expected on its
own line: the procedure code (``D`` + 4 digits), one or more description
lines, then two price lines.

Outputs a single JSON with records from the chosen page range (inclusive).

Config:
- PDF_PATH: path to the PDF
- PAGE_START, PAGE_END: 1-based page numbers (inclusive)
- FIRST_PRICE_IS_LTE21: True => first price line is <=21; False => first price is >21
- OUT_PATH: output JSON path
"""

import json
import re
from typing import Dict, List, Optional

# NOTE: PyMuPDF (``fitz``) is imported lazily inside get_page_lines() so the
# pure-text parsing helpers stay importable without PyMuPDF installed.

# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH = "MHv2.pdf"        # path to your PDF
PAGE_START = 1               # 1-based inclusive start page (e.g., 1)
PAGE_END = 15                # 1-based inclusive end page (e.g., 5)
OUT_PATH = "output.json"     # single JSON file containing all parsed rows
FIRST_PRICE_IS_LTE21 = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT = False      # set True to print raw text for each page
# =========================

# --- patterns ---
# a procedure code alone on its line, e.g. 'D2160'
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(
    r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$",
    re.IGNORECASE,
)
# lines that definitely start a notes block to ignore once prices are done
note_starters_re = re.compile(
    r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)",
    re.IGNORECASE,
)


def normalize_ws(s: str) -> str:
    """Collapse whitespace in *s* and trim surrounding punctuation.

    Non-breaking spaces become regular spaces, line breaks and runs of
    whitespace collapse to a single space, and leading/trailing spaces,
    commas, periods, semicolons, colons, hyphens and bullet characters are
    stripped.
    """
    s = s.replace("\u00a0", " ")
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\s*\n\s*", " ", s)
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip(" ,.;:-•·\n\t")


def clean_money(token: str) -> str:
    """Return a price token without ``$`` or thousands separators.

    ``NC`` (any letter case) is returned as the literal ``"NC"``.
    Surrounding whitespace is removed *before* the dollar sign is stripped,
    so padded tokens such as ``" $1,234 "`` are cleaned correctly too.
    """
    token = token.strip()
    if token.upper() == "NC":
        return "NC"
    return token.replace(",", "").lstrip("$").strip()


def get_page_lines(pdf_path: str, page_start_1b: int, page_end_1b: int) -> List[str]:
    """Return the text lines of the pages in an inclusive 1-based range.

    Args:
        pdf_path: Path of the PDF to open.
        page_start_1b: First page to read (1-based, inclusive).
        page_end_1b: Last page to read (1-based, inclusive).

    Returns:
        The lines of every page in the range, concatenated in page order.

    Raises:
        ValueError: If a page number is < 1, the start is greater than the
            end, or the range extends past the last page of the document.
    """
    if page_start_1b <= 0 or page_end_1b <= 0:
        raise ValueError("PAGE_START and PAGE_END must be >= 1 (1-based).")
    if page_start_1b > page_end_1b:
        raise ValueError("PAGE_START cannot be greater than PAGE_END.")

    # Imported after the cheap argument checks; see the note at the top of
    # the file for why this is not a module-level import.
    import fitz  # PyMuPDF

    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        # The start page is already known to be >= 1 and <= the end page,
        # so only the upper bound can still be out of range.
        if page_end_1b > page_count:
            raise ValueError(
                f"Page range out of bounds. Valid 1-based range is 1..{page_count}."
            )

        lines: List[str] = []
        # convert to a 0-based range covering both end pages
        for p in range(page_start_1b - 1, page_end_1b):
            text = doc.load_page(p).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {p} (0-based; shown as {p+1} 1-based) ---\n{text}")
            lines.extend(text.splitlines())
        return lines
    finally:
        doc.close()


def extract_records(
    lines: List[str],
    *,
    first_price_is_lte21: Optional[bool] = None,
) -> List[Dict[str, str]]:
    """Parse fee-schedule records out of extracted text lines.

    A record starts at a line holding only a procedure code, continues with
    description lines, and ends with two price lines. Records for which two
    prices cannot be found are skipped.

    Args:
        lines: Text lines, e.g. as returned by ``get_page_lines``.
        first_price_is_lte21: Whether the first price belongs to the <=21
            column. ``None`` (the default) uses ``FIRST_PRICE_IS_LTE21``.

    Returns:
        One dict per record with the keys ``"Procedure Code"``,
        ``"Description"``, ``"PriceLTEQ21"`` and ``"PriceGT21"``.
    """
    if first_price_is_lte21 is None:
        first_price_is_lte21 = FIRST_PRICE_IS_LTE21

    out: List[Dict[str, str]] = []
    i = 0
    n = len(lines)

    while i < n:
        # seek a code line
        mcode = code_line_re.match(lines[i].strip())
        if not mcode:
            i += 1
            continue
        code = mcode.group(1)
        i += 1

        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # Collect description lines (usually 1–3). The description ends at
        # a blank line, the next code (malformed record), the first price,
        # or a note starter (prices may be missing).
        desc_lines: List[str] = []
        while i < n:
            s = lines[i].strip()
            if (
                not s
                or code_line_re.match(s)
                or price_line_re.match(s)
                or note_starters_re.match(s)
            ):
                break
            desc_lines.append(s)
            i += 1
        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # A note/flags block: skip forward until a blank or next code.
            # ('NC' never reaches this point; it is matched as a price above.)
            if note_starters_re.match(s) or s == "Y":
                i += 1
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                continue
            # unrecognized line: if we already captured some prices, stop;
            # otherwise skip it and keep looking
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        if first_price_is_lte21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                break
            i += 1

    return out


def extract_pdf_range_to_json(
    pdf_path: str,
    page_start_1b: int,
    page_end_1b: int,
    out_path: str,
) -> List[Dict[str, str]]:
    """Parse a page range of the PDF and write the records to a JSON file.

    Args:
        pdf_path: Path of the PDF to read.
        page_start_1b: First page to parse (1-based, inclusive).
        page_end_1b: Last page to parse (1-based, inclusive).
        out_path: Path of the JSON file to write (UTF-8, indented).

    Returns:
        The parsed records, identical to what was written to *out_path*.
    """
    lines = get_page_lines(pdf_path, page_start_1b, page_end_1b)
    data = extract_records(lines)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    return data


def main() -> None:
    """Run the extraction with the CONFIG values and print the result."""
    data = extract_pdf_range_to_json(PDF_PATH, PAGE_START, PAGE_END, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()