initial commit
This commit is contained in:
BIN
apps/ProcedureCodeFromMhPdf/MH.pdf
Executable file
BIN
apps/ProcedureCodeFromMhPdf/MH.pdf
Executable file
Binary file not shown.
BIN
apps/ProcedureCodeFromMhPdf/MHv2.pdf
Executable file
BIN
apps/ProcedureCodeFromMhPdf/MHv2.pdf
Executable file
Binary file not shown.
5
apps/ProcedureCodeFromMhPdf/Readme.md
Executable file
5
apps/ProcedureCodeFromMhPdf/Readme.md
Executable file
@@ -0,0 +1,5 @@
|
||||
This code was written only for extracting procedure-code data from the MassHealth PDF, to make that process easier.

It was a one-time process and is not used as core functionality anywhere in this app.

It is kept here in case the data needs to be extracted again in the future.
|
||||
96
apps/ProcedureCodeFromMhPdf/compareJson.py
Executable file
96
apps/ProcedureCodeFromMhPdf/compareJson.py
Executable file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python3
"""
Diff a main dental-procedure JSON file against one or more other JSON files.

Every record in the other files whose 'Procedure Code' does not appear in
the main file is collected and written to OUT_PATH.

Matching details:
- Key is 'Procedure Code', compared case-insensitively with whitespace removed.
- The full record from the other file is kept (extra fields such as
  'Full Price' survive).
- Results are deduplicated by procedure code.

Edit MAIN_PATH, OTHER_PATHS, and OUT_PATH in the CONFIG block below.
"""

import json
from pathlib import Path
from typing import List, Dict, Any

# =========================
# CONFIG — EDIT THESE ONLY
# =========================
MAIN_PATH = "procedureCodes_v2.json"  # the main JSON (with PriceLTEQ21/PriceGT21)
OTHER_PATHS = [
    # "procedureCodesOld.json",  # one or more other JSON files to compare against the main
    "output.json",
]
OUT_PATH = "not_in_main.json"  # where to write the results
# =========================
def _load_json_any(path: str) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Load JSON. Accept:
|
||||
- a list of objects
|
||||
- a single object (wraps into a list)
|
||||
"""
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
if isinstance(data, dict):
|
||||
return [data]
|
||||
if isinstance(data, list):
|
||||
# filter out non-dict items defensively
|
||||
return [x for x in data if isinstance(x, dict)]
|
||||
raise ValueError(f"Unsupported JSON top-level type in {path}: {type(data)}")
|
||||
|
||||
|
||||
def _norm_code(record: Dict[str, Any]) -> str:
|
||||
# Normalize the 'Procedure Code' for matching
|
||||
code = str(record.get("Procedure Code", "")).strip().upper()
|
||||
# Some PDFs might have stray spaces, tabs, or zero-width chars
|
||||
code = "".join(ch for ch in code if not ch.isspace())
|
||||
return code
|
||||
|
||||
|
||||
def collect_main_codes(main_path: str) -> set:
    """Load the main file and return the set of normalized, non-empty procedure codes.

    Propagates any error raised by _load_json_any for unreadable or
    invalid input.
    """
    codes: set = set()
    for rec in _load_json_any(main_path):
        # normalize once per record (the old comprehension called _norm_code twice)
        code = _norm_code(rec)
        if code:
            codes.add(code)
    return codes
def collect_missing_records(other_paths: List[str], main_codes: set) -> List[Dict[str, Any]]:
    """Gather records from *other_paths* whose normalized code is absent from *main_codes*.

    The first record seen for a given code wins (deduplication across all
    other files); results come back sorted by normalized code.
    """
    found: Dict[str, Dict[str, Any]] = {}  # normalized code -> full original record
    for path in other_paths:
        for record in _load_json_any(path):
            key = _norm_code(record)
            if not key or key in main_codes or key in found:
                continue
            found[key] = record  # keep the complete record, extra fields included
    # stable, code-sorted output
    return [record for _, record in sorted(found.items())]
def main():
    """Validate configured paths, compute the diff, write OUT_PATH, and print a summary."""
    if not Path(MAIN_PATH).exists():
        raise FileNotFoundError(f"Main file not found: {MAIN_PATH}")
    for other in OTHER_PATHS:
        if not Path(other).exists():
            raise FileNotFoundError(f"Other file not found: {other}")

    main_codes = collect_main_codes(MAIN_PATH)
    results = collect_missing_records(OTHER_PATHS, main_codes)

    with open(OUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(results, fh, ensure_ascii=False, indent=2)

    print(f"Main codes: {len(main_codes)}")
    print(f"Missing from main: {len(results)}")
    print(f"Wrote results to {OUT_PATH}")
    # echo the full result set to stdout as well
    print(json.dumps(results, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
241
apps/ProcedureCodeFromMhPdf/compareJson_matchingPrice.py
Executable file
241
apps/ProcedureCodeFromMhPdf/compareJson_matchingPrice.py
Executable file
@@ -0,0 +1,241 @@
|
||||
#!/usr/bin/env python3
"""
Config-driven price comparison between two JSON files (file1 vs file2).

How it works:
- Both files are loaded as JSON arrays of records and indexed by procedure
  code (common key spellings such as "Procedure Code" and "Code" are tried).
- Money tokens are normalized: '$' and commas are removed and "NC" is kept
  as a literal marker.
- Three fields are compared: Price, PriceLTEQ21, and PriceGT21.

Matching rules:
- Same-named fields present on both sides are compared directly.
- When file1 carries only a single "Price" and file2 carries
  PriceLTEQ21/PriceGT21, file1.Price is checked against BOTH special
  fields; a difference with either is reported.
- "NC" equals only "NC"; numeric tokens are compared within TOLERANCE.

Output (written to OUT_PATH):
- mismatches: per-code details for differing prices
- only_in_file1 / only_in_file2: codes present on one side only
- summary: counts

Edit the CONFIG block below, then run the script.
"""

import json
import re
from typing import List, Dict, Any, Optional

# =========================
# CONFIG — EDIT THESE ONLY
# =========================
FILE1_PATH = "procedureCodes_v2.json"  # path to file 1 (your base/reference file)
FILE2_PATH = "output.json"             # path to file 2 (the file to compare)
OUT_PATH = "price_diffs.json"          # output JSON listing mismatches
TOLERANCE = 0.005                      # numeric tolerance for floats
CODE_KEY_CANDIDATES = ("Procedure Code", "Code", "procedure_code", "procedure code")
# If True: when file1 has a single "Price" and file2 has LTEQ/GT values,
# compare file1.Price against both fields and flag a mismatch if either differs.
COMPARE_SINGLE_PRICE_AGAINST_BOTH = True
# =========================
# Accepts "NC" or a dollar amount with optional '$', thousands commas, and cents.
_money_re = re.compile(r"^\s*(NC|\$?\s*[\d,]+(?:\.\d{1,2})?)\s*$", re.IGNORECASE)


def normalize_money_token(token: Optional[str]) -> Optional[str]:
    """Canonicalize a money token.

    Returns None for missing/blank input, "NC" for the no-charge marker,
    a bare decimal string (no '$', commas, or trailing zeros) for amounts,
    and the trimmed token unchanged when the format is unrecognized (so the
    odd value still shows up in a diff).
    """
    if token is None:
        return None
    text = str(token).strip()
    if not text:
        return None
    matched = _money_re.match(text)
    if matched is None:
        # unknown shape — surface it verbatim so the mismatch stays visible
        return text
    value = matched.group(1)
    if value.upper() == "NC":
        return "NC"
    value = value.replace("$", "").replace(",", "").strip()
    if "." in value:
        # drop trailing zeros, then any dangling decimal point
        value = value.rstrip("0").rstrip(".")
    return value
def numeric_compare(a: Optional[str], b: Optional[str], tol: float = TOLERANCE) -> bool:
    """Return True when two normalized price tokens denote the same value.

    Rules:
    - a missing side never matches;
    - "NC" matches only "NC" (case-insensitively);
    - numeric strings match when within *tol* of each other;
    - non-numeric strings fall back to exact equality.
    """
    if a is None or b is None:
        return False
    if a == b:
        return True
    if a.upper() == "NC" or b.upper() == "NC":
        return a.upper() == b.upper()
    try:
        return abs(float(a) - float(b)) <= tol
    except (TypeError, ValueError):
        # narrow catch: only conversion failures mean "not numeric"
        # (the old bare `except Exception` could hide unrelated bugs)
        return a == b
def load_json(path: str) -> List[Dict[str, Any]]:
    """Read *path* and return its JSON payload, which must be a top-level array."""
    with open(path, "r", encoding="utf-8") as fh:
        payload = json.load(fh)
    if isinstance(payload, list):
        return payload
    raise ValueError(f"Expected JSON array in {path}")
def build_index(records: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
    """Map procedure code -> record; on duplicate codes the FIRST record wins.

    The code comes from the first populated key in CODE_KEY_CANDIDATES;
    failing that, any string field shaped like 'D1234' is accepted.
    Records with no detectable code are skipped entirely.
    """
    index: Dict[str, Dict[str, Any]] = {}
    for record in records:
        code = None
        for key in CODE_KEY_CANDIDATES:
            value = record.get(key)
            if value:
                code = str(value).strip()
                break
        if not code:
            # fall back: scan values for something that looks like a Dxxxx code
            for value in record.values():
                if isinstance(value, str) and re.match(r"^\s*D\d{4}\s*$", value):
                    code = value.strip()
                    break
        if code and code not in index:
            index[code] = record
    return index
def extract_price_fields(rec: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Normalize the three known price fields of *rec*.

    All three keys are always present in the result; a field missing from
    the record maps to None.
    """
    return {
        key: normalize_money_token(rec.get(key))
        for key in ("Price", "PriceLTEQ21", "PriceGT21")
    }
def compare_code_records(code: str, rec1: Dict[str, Any], rec2: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Compare the price fields of one procedure code across the two files.

    Returns None when every comparable field agrees; otherwise returns a
    report dict carrying both descriptions, both normalized price sets,
    and a 'mismatches' list with one entry per differing field.
    Relies on module-level numeric_compare, extract_price_fields, and the
    COMPARE_SINGLE_PRICE_AGAINST_BOTH flag.
    """
    p1 = extract_price_fields(rec1)
    p2 = extract_price_fields(rec2)

    mismatches = []

    # 1) Compare same-named fields if both present
    for key in ("Price", "PriceLTEQ21", "PriceGT21"):
        a = p1.get(key)
        b = p2.get(key)
        if a is None and b is None:
            # neither side has this field: nothing to compare
            continue
        if a is None or b is None:
            # present in one but not the other: count as mismatch
            mismatches.append({"field": key, "file1": a, "file2": b, "reason": "missing_in_one"})
            continue
        if not numeric_compare(a, b):
            mismatches.append({"field": key, "file1": a, "file2": b, "reason": "value_mismatch"})

    # 2) Special-case: if file1 has only single Price, and file2 has LTEQ/GT present,
    #    optionally compare file1.Price against each of them.
    if COMPARE_SINGLE_PRICE_AGAINST_BOTH:
        # Only apply if file1.Price exists and file1 does NOT have LTEQ/GT (both None),
        # but file2 has at least one of LTEQ/GT.
        file1_has_price = p1.get("Price") is not None
        file1_has_any_special = (p1.get("PriceLTEQ21") is not None) or (p1.get("PriceGT21") is not None)
        file2_has_any_special = (p2.get("PriceLTEQ21") is not None) or (p2.get("PriceGT21") is not None)
        if file1_has_price and (not file1_has_any_special) and file2_has_any_special:
            # compare file1.Price to each present file2 special price
            left = p1.get("Price")
            for special_key in ("PriceLTEQ21", "PriceGT21"):
                right = p2.get(special_key)
                if right is None:
                    continue
                # Step 1 already logged a "missing_in_one" for this special_key
                # (file1 lacks it); this extra check still compares the single
                # Price against the special value, which is the point of the flag.
                if not numeric_compare(left, right):
                    mismatches.append({
                        "field": f"Price_vs_{special_key}",
                        "file1": left,
                        "file2": right,
                        "reason": "single_price_vs_special_mismatch"
                    })

    if mismatches:
        return {
            "Procedure Code": code,
            "Description_file1": rec1.get("Description"),
            "Description_file2": rec2.get("Description"),
            "file1_prices": p1,
            "file2_prices": p2,
            "mismatches": mismatches
        }
    return None
def main():
    """Load both configured files, diff every code, write OUT_PATH, print a summary."""
    idx1 = build_index(load_json(FILE1_PATH))
    idx2 = build_index(load_json(FILE2_PATH))

    # union of every code seen on either side, in sorted order
    codes_all = sorted(set(idx1) | set(idx2))

    mismatched: List[Dict[str, Any]] = []
    only_in_file1: List[str] = []
    only_in_file2: List[str] = []

    for code in codes_all:
        rec1 = idx1.get(code)
        rec2 = idx2.get(code)
        if rec1 is None:
            only_in_file2.append(code)
        elif rec2 is None:
            only_in_file1.append(code)
        else:
            diff = compare_code_records(code, rec1, rec2)
            if diff:
                mismatched.append(diff)

    out = {
        "summary": {
            "total_codes_found": len(codes_all),
            "only_in_file1_count": len(only_in_file1),
            "only_in_file2_count": len(only_in_file2),
            "mismatched_count": len(mismatched),
        },
        "only_in_file1": only_in_file1,
        "only_in_file2": only_in_file2,
        "mismatches": mismatched
    }

    with open(OUT_PATH, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, indent=2)

    # brief console summary
    print(f"Compared {len(codes_all)} procedure codes.")
    print(f"Only in {FILE1_PATH}: {len(only_in_file1)} codes.")
    print(f"Only in {FILE2_PATH}: {len(only_in_file2)} codes.")
    print(f"Mismatched prices: {len(mismatched)} codes.")
    print(f"Wrote detailed diffs to {OUT_PATH}")


if __name__ == "__main__":
    main()
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Executable file
183
apps/ProcedureCodeFromMhPdf/extract_bypage.py
Executable file
@@ -0,0 +1,183 @@
|
||||
import re
|
||||
import json
|
||||
from typing import List, Dict
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH = "MH.pdf"          # path to the source PDF
PAGES = [2]                  # 0-based page indexes to parse, e.g., [2]
OUT_PATH = "output.json"     # where to write the JSON
FIRST_PRICE_IS_LTE21 = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT = False      # True => print each page's raw text for a sanity check
# =========================


# --- line-classification patterns ---
# a standalone procedure code, e.g. "D2160"
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token: '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block we should ignore once prices are done
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
    """Collapse all whitespace runs in *s* to single spaces and trim stray punctuation."""
    text = s.replace("\u00a0", " ")        # NBSP -> plain space
    text = re.sub(r"[ \t]+", " ", text)    # collapse horizontal runs
    text = re.sub(r"\s*\n\s*", " ", text)  # fold line breaks into spaces
    text = re.sub(r"\s{2,}", " ", text)    # squeeze anything left over
    return text.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
    """Return the bare amount for a matched price token ('NC' stays 'NC')."""
    if token.upper() == "NC":
        return "NC"
    # drop thousands separators, then a leading dollar sign
    bare = token.replace(",", "")
    return bare.lstrip("$").strip()
def get_page_lines(pdf_path: str, pages: List[int]) -> List[str]:
    """Extract raw text lines from the given 0-based *pages* of *pdf_path*.

    Raises ValueError when any requested index is outside the document.
    The PDF is always closed, even on error.
    """
    doc = fitz.open(pdf_path)
    try:
        max_idx = len(doc) - 1
        # validate every requested page before reading any of them
        bad = [p for p in pages if p < 0 or p > max_idx]
        if bad:
            raise ValueError(f"Invalid page index {bad[0]}. Valid range is 0..{max_idx}.")
        collected: List[str] = []
        for p in pages:
            text = doc.load_page(p).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {p} ---\n{text}")
            # keep line boundaries; parsing downstream is line-oriented
            collected.extend(text.splitlines())
        return collected
    finally:
        doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """
    Walk *lines* and assemble one record per procedure code.

    A record is emitted only when a code line (Dxxxx) is followed by a
    description (zero or more lines) and exactly two recognizable price
    tokens; the FIRST_PRICE_IS_LTE21 module flag decides which token maps
    to PriceLTEQ21 vs PriceGT21. Codes with fewer than two prices are
    silently dropped. Depends on the module-level regexes
    (code_line_re, price_line_re, note_starters_re), normalize_ws,
    clean_money, and FIRST_PRICE_IS_LTE21.
    """
    out: List[Dict[str, str]] = []
    i = 0  # cursor into lines; advanced manually by every phase below
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue

        code = mcode.group(1)
        i += 1

        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line inside description — consider description ended if the next is a price
                # but we don't advance here; break and let price parsing handle it
                break
            if code_line_re.match(s):
                # next code — no prices found; abandon this broken record
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1

        # advance i to where we left off
        i = j

        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until the next code/blank
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                # skip this block quickly
                i += 1
                # keep skipping subsequent non-empty, non-code lines until a blank or next code
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                # now let the outer loop proceed
                continue
            # unrecognized line: if prices already found, we can break; else skip
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        # map the two tokens onto the <=21 / >21 columns per the config flag
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                # next record will pick this up
                break
            i += 1

    return out
||||
def extract_pdf_to_json(pdf_path: str, pages: List[int], out_path: str) -> List[Dict[str, str]]:
    """Parse *pages* of *pdf_path* into records, write them to *out_path*, and return them."""
    records = extract_records(get_page_lines(pdf_path, pages))
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_to_json(PDF_PATH, PAGES, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))
208
apps/ProcedureCodeFromMhPdf/extract_byrange.py
Executable file
208
apps/ProcedureCodeFromMhPdf/extract_byrange.py
Executable file
@@ -0,0 +1,208 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MassHealth dental PDF parser (PyMuPDF / fitz) — PAGE RANGE VERSION
|
||||
|
||||
Parses rows like:
|
||||
|
||||
D2160
|
||||
Amalgam-three surfaces,
|
||||
primary or permanent
|
||||
$110
|
||||
$92
|
||||
Y
|
||||
Y
|
||||
...
|
||||
|
||||
Outputs a single JSON with records from the chosen page range (inclusive).
|
||||
|
||||
Config:
|
||||
- PDF_PATH: path to the PDF
|
||||
- PAGE_START, PAGE_END: 1-based page numbers (inclusive)
|
||||
- FIRST_PRICE_IS_LTE21: True => first price line is <=21; False => first price is >21
|
||||
- OUT_PATH: output JSON path
|
||||
"""
|
||||
|
||||
import re
|
||||
import json
|
||||
from typing import List, Dict
|
||||
import fitz # PyMuPDF
|
||||
|
||||
|
||||
# =========================
# CONFIG — EDIT THESE ONLY
# =========================
PDF_PATH = "MHv2.pdf"        # path to the source PDF
PAGE_START = 1               # 1-based inclusive start page (e.g., 1)
PAGE_END = 15                # 1-based inclusive end page (e.g., 5)
OUT_PATH = "output.json"     # single JSON file containing all parsed rows
FIRST_PRICE_IS_LTE21 = True  # True => first price line is <=21; False => first price is >21
PRINT_PAGE_TEXT = False      # True => print each page's raw text
# =========================


# --- line-classification patterns ---
# a standalone procedure code, e.g. "D2160"
code_line_re = re.compile(r"^\s*(D\d{4})\s*$")
# a price token: '$123', '$1,234.50', '123', '123.45', or 'NC'
price_line_re = re.compile(r"^\s*(?:\$\s*)?(\d{1,3}(?:,\d{3})*(?:\.\d{2})?|\d+(?:\.\d{2})?|NC)\s*$", re.IGNORECASE)
# lines that definitely start a notes block to ignore once prices are done
note_starters_re = re.compile(r"^(Teeth\b|One of\b|--|—|–|Age limitation:|CR\b)", re.IGNORECASE)
def normalize_ws(s: str) -> str:
    """Collapse all whitespace runs in *s* to single spaces and trim stray punctuation."""
    text = s.replace("\u00a0", " ")        # NBSP -> plain space
    text = re.sub(r"[ \t]+", " ", text)    # collapse horizontal runs
    text = re.sub(r"\s*\n\s*", " ", text)  # fold line breaks into spaces
    text = re.sub(r"\s{2,}", " ", text)    # squeeze anything left over
    return text.strip(" ,.;:-•·\n\t")
def clean_money(token: str) -> str:
    """Return the bare amount for a matched price token ('NC' stays 'NC')."""
    if token.upper() == "NC":
        return "NC"
    # drop thousands separators, then a leading dollar sign
    bare = token.replace(",", "")
    return bare.lstrip("$").strip()
def get_page_lines(pdf_path: str, page_start_1b: int, page_end_1b: int) -> List[str]:
    """Extract raw text lines from the inclusive 1-based page range of *pdf_path*.

    Raises ValueError for non-positive page numbers, an inverted range, or
    a range that falls outside the document. The PDF is always closed.
    """
    if page_start_1b <= 0 or page_end_1b <= 0:
        raise ValueError("PAGE_START and PAGE_END must be >= 1 (1-based).")
    if page_start_1b > page_end_1b:
        raise ValueError("PAGE_START cannot be greater than PAGE_END.")

    doc = fitz.open(pdf_path)
    try:
        last_idx_0b = len(doc) - 1
        # convert to a 0-based inclusive range
        start_0b, end_0b = page_start_1b - 1, page_end_1b - 1
        if start_0b < 0 or end_0b > last_idx_0b:
            raise ValueError(f"Page range out of bounds. Valid 1-based range is 1..{last_idx_0b + 1}.")
        collected: List[str] = []
        for page_idx in range(start_0b, end_0b + 1):
            text = doc.load_page(page_idx).get_text("text") or ""
            if PRINT_PAGE_TEXT:
                print(f"\n--- RAW PAGE {page_idx} (0-based; shown as {page_idx+1} 1-based) ---\n{text}")
            collected.extend(text.splitlines())
        return collected
    finally:
        doc.close()
def extract_records(lines: List[str]) -> List[Dict[str, str]]:
    """
    Walk *lines* and assemble one record per procedure code.

    A record is emitted only when a code line (Dxxxx) is followed by a
    description (zero or more lines) and exactly two recognizable price
    tokens; the FIRST_PRICE_IS_LTE21 module flag decides which token maps
    to PriceLTEQ21 vs PriceGT21. Codes with fewer than two prices are
    silently dropped. Depends on the module-level regexes
    (code_line_re, price_line_re, note_starters_re), normalize_ws,
    clean_money, and FIRST_PRICE_IS_LTE21.
    """
    out: List[Dict[str, str]] = []
    i = 0  # cursor into lines; advanced manually by every phase below
    n = len(lines)

    while i < n:
        line = lines[i].strip()

        # seek a code line
        mcode = code_line_re.match(line)
        if not mcode:
            i += 1
            continue

        code = mcode.group(1)
        i += 1

        # gather description lines until we encounter price lines
        desc_lines: List[str] = []
        # skip blank lines before description
        while i < n and not lines[i].strip():
            i += 1

        # collect description lines (usually 1–3) until first price token
        # stop also if we accidentally hit another code (defensive)
        j = i
        while j < n:
            s = lines[j].strip()
            if not s:
                # blank line — description ended; price parsing takes over from here
                break
            if code_line_re.match(s):
                # next code — description ended abruptly (malformed)
                break
            if price_line_re.match(s):
                # reached price section
                break
            if note_starters_re.match(s):
                # encountered a note before price — treat as end of description; prices may be missing
                break
            desc_lines.append(s)
            j += 1

        # advance i to where we left off
        i = j

        description = normalize_ws(" ".join(desc_lines))

        # collect up to two price tokens
        prices: List[str] = []
        while i < n and len(prices) < 2:
            s = lines[i].strip()
            if not s:
                i += 1
                continue
            if code_line_re.match(s):
                # new record — stop; this means we never got prices (malformed)
                break
            mprice = price_line_re.match(s)
            if mprice:
                prices.append(clean_money(mprice.group(1)))
                i += 1
                continue
            # if we encounter a note/flags block, skip forward until a blank or next code
            if note_starters_re.match(s) or s in {"Y", "NC"}:
                i += 1
                while i < n:
                    t = lines[i].strip()
                    if not t or code_line_re.match(t):
                        break
                    i += 1
                continue
            # unrecognized line: if we already captured some prices, break; else skip
            if prices:
                break
            i += 1

        if len(prices) < 2:
            # couldn't find 2 prices reliably; skip this record
            continue

        # map the two tokens onto the <=21 / >21 columns per the config flag
        if FIRST_PRICE_IS_LTE21:
            price_lte21, price_gt21 = prices[0], prices[1]
        else:
            price_lte21, price_gt21 = prices[1], prices[0]

        out.append(
            {
                "Procedure Code": code,
                "Description": description,
                "PriceLTEQ21": price_lte21,
                "PriceGT21": price_gt21,
            }
        )

        # after prices, skip forward until next code or blank block end
        while i < n:
            s = lines[i].strip()
            if not s:
                i += 1
                break
            if code_line_re.match(s):
                break
            i += 1

    return out
|
||||
|
||||
def extract_pdf_range_to_json(pdf_path: str, page_start_1b: int, page_end_1b: int, out_path: str) -> List[Dict[str, str]]:
    """Parse the 1-based page range of *pdf_path* into records, write them to *out_path*, and return them."""
    records = extract_records(get_page_lines(pdf_path, page_start_1b, page_end_1b))
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(records, fh, ensure_ascii=False, indent=2)
    return records


if __name__ == "__main__":
    data = extract_pdf_range_to_json(PDF_PATH, PAGE_START, PAGE_END, OUT_PATH)
    print(f"Wrote {len(data)} rows to {OUT_PATH}")
    print(json.dumps(data, ensure_ascii=False, indent=2))
1004
apps/ProcedureCodeFromMhPdf/output.json
Executable file
1004
apps/ProcedureCodeFromMhPdf/output.json
Executable file
File diff suppressed because it is too large
Load Diff
344
apps/ProcedureCodeFromMhPdf/procedureCodes_v0.json
Executable file
344
apps/ProcedureCodeFromMhPdf/procedureCodes_v0.json
Executable file
@@ -0,0 +1,344 @@
|
||||
[
|
||||
{
|
||||
"Procedure Code": "D1999",
|
||||
"Description": "",
|
||||
"Price": "50"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0120",
|
||||
"Description": "perio exam",
|
||||
"Price": "105"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0140",
|
||||
"Description": "limited exam",
|
||||
"Price": "90"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0150",
|
||||
"Description": "comprehensive exam",
|
||||
"Price": "120"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0210",
|
||||
"Description": "Fmx.",
|
||||
"Price": "120"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0220",
|
||||
"Description": "first PA.",
|
||||
"Price": "60"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0230",
|
||||
"Description": "2nd PA.",
|
||||
"Price": "50"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0330",
|
||||
"Description": "pano",
|
||||
"Price": "150"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0272",
|
||||
"Description": "2 BW",
|
||||
"Price": "80"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0274",
|
||||
"Description": "4BW",
|
||||
"Price": "160"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D1110",
|
||||
"Description": "adult prophy",
|
||||
"Price": "150"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D1120",
|
||||
"Description": "child prophy",
|
||||
"Price": "120"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D1351",
|
||||
"Description": "sealant",
|
||||
"Price": "80"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D4341",
|
||||
"Description": "srp",
|
||||
"Price": "250"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D4910",
|
||||
"Description": "perio maintains",
|
||||
"Price": "250"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D1208",
|
||||
"Description": "FL",
|
||||
"Price": "90"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2330",
|
||||
"Description": "front composite. 1 s.",
|
||||
"Price": "180"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2331",
|
||||
"Description": "2s",
|
||||
"Price": "220"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2332",
|
||||
"Description": "3s",
|
||||
"Price": "280"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2335",
|
||||
"Description": "4s or more",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2391",
|
||||
"Description": "back. 1s",
|
||||
"Price": "200"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2392",
|
||||
"Description": "2s",
|
||||
"Price": "250"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2393",
|
||||
"Description": "3s",
|
||||
"Price": "280"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2394",
|
||||
"Description": "4s",
|
||||
"Price": "320"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2140",
|
||||
"Description": "amalgam, one surface",
|
||||
"Price": "150"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2150",
|
||||
"Description": "amalgam, two surface",
|
||||
"Price": "200"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2750",
|
||||
"Description": "high noble",
|
||||
"Price": "1300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2751",
|
||||
"Description": "base metal",
|
||||
"Price": "1200"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2740",
|
||||
"Description": "crown porcelain",
|
||||
"Price": "1300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2954",
|
||||
"Description": "p/c",
|
||||
"Price": "450"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7910",
|
||||
"Description": "suture, small wound up to 5 mm",
|
||||
"Price": "400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5110",
|
||||
"Description": "FU",
|
||||
"Price": "1200",
|
||||
"Full Price": "1700"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5120",
|
||||
"Description": "FL",
|
||||
"Price": "1700",
|
||||
"Full Price": "1700"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5211",
|
||||
"Description": "pu",
|
||||
"Price": "1300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5212",
|
||||
"Description": "pl",
|
||||
"Price": "1300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5213",
|
||||
"Description": "cast pu.",
|
||||
"Price": "1700"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5214",
|
||||
"Description": "cast pl",
|
||||
"Price": "1700"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5510",
|
||||
"Description": "Repair broken complete denture base (QUAD)",
|
||||
"Price": "400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5520",
|
||||
"Description": "Replace missing or broken teeth - complete denture (each tooth) (TOOTH)",
|
||||
"Price": "200"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5750",
|
||||
"Description": "lab reline",
|
||||
"Price": "600"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D5730",
|
||||
"Description": "chairside reline",
|
||||
"Price": "500"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2920",
|
||||
"Description": "re cement crown",
|
||||
"Price": "120"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2950",
|
||||
"Description": "core buildup",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D2955",
|
||||
"Description": "post renoval",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6100",
|
||||
"Description": "",
|
||||
"Price": "320"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6110",
|
||||
"Description": "implant",
|
||||
"Price": "1600"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6056",
|
||||
"Description": "pre fab abut",
|
||||
"Price": "750"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6057",
|
||||
"Description": "custom abut",
|
||||
"Price": "800"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6058",
|
||||
"Description": "porcelain, implant crown, ceramic crown",
|
||||
"Price": "1400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6059",
|
||||
"Description": "",
|
||||
"Price": "1400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6242",
|
||||
"Description": "noble metal. For united",
|
||||
"Price": "1400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D6245",
|
||||
"Description": "porcelain, not for united",
|
||||
"Price": "1400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0367",
|
||||
"Description": "",
|
||||
"Price": "400"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0364",
|
||||
"Description": "Less than one jaw",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0365",
|
||||
"Description": "Mand",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0366",
|
||||
"Description": "Max",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0368",
|
||||
"Description": "include TMJ",
|
||||
"Price": "375"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0383",
|
||||
"Description": "",
|
||||
"Price": "350"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0380",
|
||||
"Description": "Less than one jaw",
|
||||
"Price": "300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0381",
|
||||
"Description": "Mand",
|
||||
"Price": "300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D0382",
|
||||
"Description": "Max",
|
||||
"Price": "300"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7950",
|
||||
"Description": "max",
|
||||
"Price": "800"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7140",
|
||||
"Description": "simple ext",
|
||||
"Price": "150"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7210",
|
||||
"Description": "surgical ext",
|
||||
"Price": "280"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7220",
|
||||
"Description": "soft impacted",
|
||||
"Price": "380"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7230",
|
||||
"Description": "partial bony",
|
||||
"Price": "450"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D7240",
|
||||
"Description": "fully bony",
|
||||
"Price": "550"
|
||||
},
|
||||
{
|
||||
"Procedure Code": "D3320",
|
||||
"Description": "pre M RCT",
|
||||
"Price": "1050"
|
||||
}
|
||||
]
|
||||
1026
apps/ProcedureCodeFromMhPdf/procedureCodes_v1.json
Executable file
1026
apps/ProcedureCodeFromMhPdf/procedureCodes_v1.json
Executable file
File diff suppressed because it is too large
Load Diff
1192
apps/ProcedureCodeFromMhPdf/procedureCodes_v2.json
Executable file
1192
apps/ProcedureCodeFromMhPdf/procedureCodes_v2.json
Executable file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user