initial commit
This commit is contained in:
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Executable file
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Executable file
@@ -0,0 +1,183 @@
|
||||
import re
|
||||
import json
|
||||
from typing import List, Dict
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH = "MH.pdf"  # path to your PDF
PAGES = [2]  # 0-based page indexes to parse, e.g., [2] for the page you showed
OUT_PATH = "output.json"  # where to write JSON
FIRST_PRICE_IS_LTE21 = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT = False  # set True if you want to print the raw page text for sanity check
# =========================


# --- patterns ---
# a procedure code is the letter 'D' followed by exactly four digits, alone on its line
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token is either '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block we should ignore once prices are done
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
|
||||
def normalize_ws(s: str) -> str:
    """Collapse all whitespace runs in *s* to single spaces and trim edge punctuation.

    Non-breaking spaces are treated as ordinary spaces; embedded newlines
    are folded into the surrounding text.
    """
    cleaned = s.replace("\u00a0", " ")
    # Apply the collapsing passes in the same order as before: horizontal
    # whitespace, then newlines (with surrounding whitespace), then any
    # remaining multi-space runs.
    for pattern in (r"[ \t]+", r"\s*\n\s*", r"\s{2,}"):
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip(" ,.;:-•·\n\t")
|
||||
def clean_money(token: str) -> str:
    """Normalize a raw price token.

    'NC' (any case) is canonicalized to 'NC'; any other token has thousands
    commas removed, a leading '$' stripped, and surrounding whitespace trimmed.
    """
    if token.casefold() == "nc":
        return "NC"
    without_commas = token.replace(",", "")
    return without_commas.lstrip("$").strip()
|
||||
def get_page_lines(pdf_path: str, pages: List[int]) -> List[str]:
    """Return the text lines of the requested 0-based pages of *pdf_path*, in order.

    Raises ValueError if any requested page index is outside the document.
    """
    doc = fitz.open(pdf_path)
    try:
        last_idx = len(doc) - 1
        # Validate every requested index up front, failing on the first bad one.
        for p in pages:
            if not 0 <= p <= last_idx:
                raise ValueError(f"Invalid page index {p}. Valid range is 0..{last_idx}.")
        collected: List[str] = []
        for p in pages:
            page_text = doc.load_page(p).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {p} ---\n{page_text}")
            # keep line boundaries; later we parse line-by-line
            collected.extend(page_text.splitlines())
        return collected
    finally:
        doc.close()
|
||||
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """Parse raw page lines into procedure-code records.

    Scans *lines* as a simple state machine: find a code line (Dxxxx), gather
    the following description lines, then collect exactly two price tokens.
    Records without two reliable prices are skipped. The mapping of the two
    prices to the <=21 / >21 columns is controlled by FIRST_PRICE_IS_LTE21.

    Returns a list of dicts with keys: "Procedure Code", "Description",
    "PriceLTEQ21", "PriceGT21".
    """
    out: List[Dict[str, str]] = []
    i = 0
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue

        code = mcode.group(1)
        i += 1

        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line inside description — consider description ended if the next is a price
                # but we don't advance here; break and let price parsing handle it
                break
            if code_line_re.match(s):
                # next code — no prices found; abandon this broken record
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1

        # advance i to where we left off
        i = j

        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until the next code/blank
            # (NOTE: a bare "NC" line is matched by price_line_re above first,
            # so only "Y" realistically triggers via the set membership here)
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                # skip this block quickly
                i += 1
                # keep skipping subsequent non-empty, non-code lines until a blank or next code
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                # now let the outer loop proceed
                continue
            # unrecognized line: if prices already found, we can break; else skip
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                # next record will pick this up
                break
            i += 1

    return out
|
||||
def extract_pdf_to_json(pdf_path: str, pages: List[int], out_path: str) -> List[Dict[str, str]]:
    """Extract records from the given pages of *pdf_path*, write them as JSON to *out_path*, and return them."""
    records = extract_records(get_page_lines(pdf_path, pages))
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, ensure_ascii=False, indent=2)
    return records
|
||||
if __name__ == "__main__":
    # Run the extraction with the module-level config and echo the result.
    rows = extract_pdf_to_json(PDF_PATH, PAGES, OUT_PATH)
    print(f"Wrote {len(rows)} rows to {OUT_PATH}")
    print(json.dumps(rows, ensure_ascii=False, indent=2))
|
||||
Reference in New Issue
Block a user