Files
DentalManagementMHAprilgg/apps/ProcedureCodeFromMhPdf/extract_bypage.py
2026-04-04 22:13:55 -04:00

184 lines
6.0 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import re
import json
from typing import List, Dict
import fitz # PyMuPDF
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH = "MH.pdf" # path to your PDF
PAGES = [2] # 0-based page indexes to parse, e.g., [2] for the page you showed
OUT_PATH = "output.json" # where to write JSON
FIRST_PRICE_IS_LTE21 = True # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT = False # set True if you want to print the raw page text for sanity check
# =========================
# --- patterns ---
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block we should ignore once prices are done
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—||Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
s = s.replace("\u00a0", " ")
s = re.sub(r"[ \t]+", " ", s)
s = re.sub(r"\s*\n\s*", " ", s)
s = re.sub(r"\s{2,}", " ", s)
return s.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
if token.upper() == "NC":
return "NC"
return token.replace(",", "").lstrip("$").strip()
def get_page_lines(pdf_path: str, pages: List[int]) -> List[str]:
doc = fitz.open(pdf_path)
try:
max_idx = len(doc) - 1
for p in pages:
if p < 0 or p > max_idx:
raise ValueError(f"Invalid page index {p}. Valid range is 0..{max_idx}.")
lines: List[str] = []
for p in pages:
text = doc.load_page(p).get_text("text") or ""
if PRINT_PAGE_TEXT:
print(f"\n--- RAW PAGE {p} ---\n{text}")
# keep line boundaries; later we parse line-by-line
lines.extend(text.splitlines())
return lines
finally:
doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
out: List[Dict[str, str]] = []
i = 0
n = len(lines)
while i < n:
line = lines[i].strip()
# seek a code line
mcode = code_line_re.match(line)
if not mcode:
i += 1
continue
code = mcode.group(1)
i += 1
# gather description lines until we encounter price lines
desc_lines: List[str] = []
# skip blank lines before description
while i < n and not lines[i].strip():
i += 1
# collect description lines (usually 13) until first price token
# stop also if we accidentally hit another code (defensive)
j = i
while j < n:
s = lines[j].strip()
if not s:
# blank line inside description — consider description ended if the next is a price
# but we don't advance here; break and let price parsing handle it
break
if code_line_re.match(s):
# next code — no prices found; abandon this broken record
break
if price_line_re.match(s):
# reached price section
break
if note_starters_re.match(s):
# encountered a note before price — treat as end of description; prices may be missing
break
desc_lines.append(s)
j += 1
# advance i to where we left off
i = j
description = normalize_ws(" ".join(desc_lines))
# collect up to two price tokens
prices: List[str] = []
while i < n and len(prices) < 2:
s = lines[i].strip()
if not s:
i += 1
continue
if code_line_re.match(s):
# new record — stop; this means we never got prices (malformed)
break
mprice = price_line_re.match(s)
if mprice:
prices.append(clean_money(mprice.group(1)))
i += 1
continue
# if we encounter a note/flags block, skip forward until the next code/blank
if note_starters_re.match(s) or s in {"Y", "NC"}:
# skip this block quickly
i += 1
# keep skipping subsequent non-empty, non-code lines until a blank or next code
while i < n:
t = lines[i].strip()
if not t or code_line_re.match(t):
break
i += 1
# now let the outer loop proceed
continue
# unrecognized line: if prices already found, we can break; else skip
if prices:
break
i += 1
if len(prices) < 2:
# couldn't find 2 prices reliably; skip this record
continue
if FIRST_PRICE_IS_LTE21:
price_lte21, price_gt21 = prices[0], prices[1]
else:
price_lte21, price_gt21 = prices[1], prices[0]
out.append(
{
"Procedure Code": code,
"Description": description,
"PriceLTEQ21": price_lte21,
"PriceGT21": price_gt21,
}
)
# after prices, skip forward until next code or blank block end
while i < n:
s = lines[i].strip()
if not s:
i += 1
break
if code_line_re.match(s):
# next record will pick this up
break
i += 1
return out
def extract_pdf_to_json(pdf_path: str, pages: List[int], out_path: str) -> List[Dict[str, str]]:
lines = get_page_lines(pdf_path, pages)
data = extract_records(lines)
with open(out_path, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
return data
if __name__ == "__main__":
data = extract_pdf_to_json(PDF_PATH, PAGES, OUT_PATH)
print(f"Wrote {len(data)} rows to {OUT_PATH}")
print(json.dumps(data, ensure_ascii=False, indent=2))