Files
DentalManagementMH06/apps/PaymentOCRService/pdf_extractor.py
Gitead dd0df4a435 feat: payment PDF extraction, import, and remittance tracking
- Add Upload Payment Documents section with Extract & Download (Excel)
  and Extract & Import (database) buttons
- PDF extractor (pdfplumber) parses MassHealth RA PDFs: two-pass
  strategy joins summary-page ICN/patient map with detail-page
  procedure data (CDT code, paid code, tooth, date, allowed amount)
- RA cover-page summary (Payee ID, RA #, Payment Amount, etc.)
  included as separate Excel sheet; numeric values written as numbers
- Backend PDF import route groups rows by Member #, finds/creates
  patient, creates Payment + ServiceLines with ICN per procedure
- Add icn, paidCode, allowedAmount fields to ServiceLine schema
- Payments table: status simplified to Paid in Full / Balance;
  adjustment auto-computed on mhPaidAmount/copayment change;
  Paid in Full and Revert buttons with confirmation dialogs
- Edit Payment modal: shows ICN, Paid Code, Allowed Amount per line
- PDF Import badge distinguishes from OCR imports in payments table

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-07 12:53:50 -04:00

225 lines
8.8 KiB
Python

import io
import re
import pdfplumber
DCODE_RE = re.compile(r'^D\d{4}$')
MEMBER_RE = re.compile(r'\b(\d{12})\b') # MassHealth member IDs are always 12 digits
# ── Page-1 header patterns ────────────────────────────────────────────────────
_H = {
"Payee ID": re.compile(r'Payee ID:\s*(\S+)'),
"Business NPI": re.compile(r'Business NPI:\s*(\S+)'),
"Run #": re.compile(r'Run #:\s*(\S+)'),
"RA #": re.compile(r'RA #:\s*(\S+)'),
"RA Date": re.compile(r'RA Date:\s*(\S+)'),
"Claim Detail Amount": re.compile(r'Claim Detail Amount:\s*([\$\d,\.]+)'),
"Claim Adjustment Amount": re.compile(r'Claim Adjustment Amount:\s*([\$\(\)\d,\.]+)'),
"Misc. Adjustment Amount": re.compile(r'Misc\. Adjustment Amount:\s*([\$\(\)\d,\.]+)'),
"Payment Amount": re.compile(r'Payment Amount:\s*([\$\d,\.]+)'),
}
def extract_ra_header(pdf_bytes: bytes, filename: str) -> dict:
    """Extract the cover-page summary (Payee ID, RA #, Payment Amount, etc.).

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        filename: Name recorded under the "Source File" key of the result.

    Returns:
        A dict holding "Source File" plus one entry per ``_H`` label whose
        pattern matched. Labels that never match are simply absent.
    """
    found: dict[str, str] = {"Source File": filename}
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        # Header info lives on pages 1 and 2 — scan both to be safe
        for page in pdf.pages[:2]:
            page_text = page.extract_text() or ""
            for label, regex in _H.items():
                if found.get(label):
                    # An earlier page already supplied this field.
                    continue
                hit = regex.search(page_text)
                if hit is not None:
                    found[label] = hit.group(1).strip()
    return found
def _c(val) -> str:
return str(val).replace("\n", " ").strip() if val else ""
def _amt(val: str) -> str:
return val.replace("$", "").replace(",", "").strip() if val else ""
def _col(headers: list[str], *keywords) -> int | None:
for i, h in enumerate(headers):
hl = h.lower()
if all(k.lower() in hl for k in keywords):
return i
return None
def _find_header_row(table: list[list]) -> tuple[int | None, list[str]]:
    """Find the first row of *table* that looks like a column-header row.

    A row qualifies when it has more than one non-empty cell and its cleaned
    cell texts satisfy either the summary-table or the detail-table test.

    Returns:
        ``(row_index, cleaned_cells)`` for the first qualifying row, or
        ``(None, [])`` when no row qualifies.
    """
    for row_no, cells in enumerate(table):
        # Skip merged context rows (only one non-None cell)
        populated = [cell for cell in cells if cell]
        if len(populated) <= 1:
            continue
        titles = [_c(cell) for cell in cells]
        if _is_summary(titles) or _is_detail(titles):
            return row_no, titles
    return None, []
def _is_summary(h: list[str]) -> bool:
j = " ".join(h).lower()
return "patient name" in j and "icn" in j and "code" not in j
def _is_detail(h: list[str]) -> bool:
j = " ".join(h).lower()
return ("submitted" in j and "code" in j) or "paid code" in j
def _merge_headers(table: list[list], hdr_idx: int) -> list[str]:
    """Combine rows ``0..hdr_idx`` of *table* into one title per column.

    For each column, the non-empty cells of those rows are cleaned with
    ``_c`` and joined top-to-bottom with single spaces. The result is as
    wide as the widest of those rows; a column with no text yields ``""``.
    """
    width = max(len(r) for r in table[: hdr_idx + 1])
    merged = []
    for col in range(width):
        pieces = []
        for row_no in range(hdr_idx + 1):
            row = table[row_no]
            # Short rows and empty cells contribute nothing to this column.
            if col >= len(row) or not row[col]:
                continue
            pieces.append(_c(row[col]))
        merged.append(" ".join(pieces))
    return merged
# ── Pass 1: summary pages → {icn: patient_name} ──────────────────────────────
def _build_patient_map(pdf) -> dict[str, str]:
    """Collect ``{icn: patient_name}`` from every summary table in *pdf*.

    Each page's tables are extracted. A table is used only when its header
    row passes ``_is_summary`` and has both a "Patient Name" and an "ICN"
    column. Data rows are skipped when either cell is empty, when the
    patient cell contains "Total", or when the ICN is not all digits once
    spaces are removed.

    The ICN is stored with its spaces removed. ``_c`` turns a line break
    inside a wrapped cell into a space, whereas the detail pass captures
    the ICN as one contiguous digit run, so keeping the space would stop
    the two passes from joining on the same key.

    Args:
        pdf: An open pdfplumber document (anything exposing ``.pages``
            whose items provide ``find_tables()``).

    Returns:
        Mapping of digits-only ICN to patient name. If an ICN appears more
        than once, the last occurrence wins.
    """
    patient_map: dict[str, str] = {}
    for page in pdf.pages:
        for tobj in page.find_tables():
            table = tobj.extract()
            if not table or len(table) < 2:
                continue
            hdr_idx, headers = _find_header_row(table)
            if hdr_idx is None or not _is_summary(headers):
                continue
            pi = _col(headers, "Patient Name")
            ii = _col(headers, "ICN")
            if pi is None or ii is None:
                continue
            for row in table[hdr_idx + 1:]:
                if not row:
                    continue
                patient = _c(row[pi]) if pi < len(row) else ""
                raw_icn = _c(row[ii]) if ii < len(row) else ""
                # Normalise before validating AND storing, so the key matches
                # the contiguous-digit ICN produced by the detail pass.
                icn = raw_icn.replace(" ", "")
                if not patient or not icn:
                    continue
                if "Total" in patient or not icn.isdigit():
                    continue
                patient_map[icn] = patient
    return patient_map
# ── Pass 2: detail pages → {icn: procedure_dict} ─────────────────────────────
def _build_detail_map(pdf) -> dict[str, dict]:
    """Collect per-ICN procedure data from every detail table in *pdf*.

    A table is used when its header row passes ``_is_detail`` and the first
    cell of its first row contains ``ICN: <digits>``. Column positions are
    resolved from the merged header titles. A data row is kept only when
    its Submitted Code cell matches ``DCODE_RE``.

    NOTE(review): the result holds one dict per ICN, so a later qualifying
    row for the same ICN replaces the earlier one. Confirm a claim never
    carries more than one procedure line.

    Args:
        pdf: An open pdfplumber document (anything exposing ``.pages``
            whose items provide ``find_tables()``).

    Returns:
        Mapping of ICN to a dict with the keys "Member #", "Submitted Code",
        "Paid Code", "Tooth", "Date of Service", "Submitted Amount",
        "Allowed Amount" and "Paid Amount" (all strings).
    """

    def cell(row, idx):
        """Cleaned text of ``row[idx]``; "" if the column is unknown or absent."""
        if idx is None or idx >= len(row):
            return ""
        return _c(row[idx])

    detail_map: dict[str, dict] = {}
    for page in pdf.pages:
        for found in page.find_tables():
            grid = found.extract()
            if not grid or len(grid) < 2:
                continue
            hdr_idx, headers = _find_header_row(grid)
            if hdr_idx is None or not _is_detail(headers):
                continue

            # ICN is in the merged first cell (row 0)
            first_row = grid[0]
            context = str(first_row[0]) if first_row and first_row[0] else ""
            icn_hit = re.search(r'ICN:\s*(\d+)', context)
            if not icn_hit:
                continue
            icn = icn_hit.group(1)
            member_hit = MEMBER_RE.search(context)
            member = member_hit.group(1) if member_hit else ""

            titles = _merge_headers(grid, hdr_idx)
            code_col = _col(titles, "Submitted", "Code")
            paid_code_col = _col(titles, "Paid", "Code")
            tooth_col = _col(titles, "Tooth")
            date_col = _col(titles, "Date")
            allowed_col = _col(titles, "Allowed")

            # Amount columns: any title mentioning "code" is ruled out.
            billed_col = paid_col = None
            for pos, title in enumerate(titles):
                low = title.lower()
                if "code" in low:
                    continue
                if "submitted" in low and billed_col is None:
                    billed_col = pos
                # A title with "amount" always takes over; otherwise only the
                # first "paid" title is used.
                if "paid" in low and ("amount" in low or paid_col is None):
                    paid_col = pos

            for row in grid[hdr_idx + 1:]:
                if not row:
                    continue
                cdt = cell(row, code_col)
                if not DCODE_RE.match(cdt):
                    continue
                detail_map[icn] = {
                    "Member #": member,
                    "Submitted Code": cdt,
                    "Paid Code": cell(row, paid_code_col),
                    "Tooth": cell(row, tooth_col),
                    "Date of Service": cell(row, date_col),
                    "Submitted Amount": _amt(cell(row, billed_col)),
                    "Allowed Amount": _amt(cell(row, allowed_col)),
                    "Paid Amount": _amt(cell(row, paid_col)),
                }
    return detail_map
# ── Main: join on ICN ─────────────────────────────────────────────────────────
def extract_ra_pdf(pdf_bytes: bytes, filename: str) -> dict:
    """
    Two-pass extraction of a MassHealth Remittance Advice PDF.

    Pass 1 maps ICN → patient name from the summary tables; pass 2 maps
    ICN → procedure data from the detail tables. One output row is produced
    per ICN found in pass 1. Detail fields are "" when pass 2 has no entry
    for that ICN.

    Returns:
        {
          "header": { Payee ID, Business NPI, Run #, RA #, RA Date,
                      Claim Detail Amount, Claim Adjustment Amount,
                      Misc. Adjustment Amount, Payment Amount },
          "rows":   [ one dict per ICN … ]
        }
    """
    header = extract_ra_header(pdf_bytes, filename)
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        patient_map = _build_patient_map(pdf)
        detail_map = _build_detail_map(pdf)

    # Detail-derived columns, in the order they follow "ICN" in each row.
    detail_fields = (
        "Submitted Code",
        "Paid Code",
        "Tooth",
        "Date of Service",
        "Submitted Amount",
        "Allowed Amount",
        "Paid Amount",
    )

    rows = []
    for icn, patient_name in patient_map.items():
        detail = detail_map.get(icn, {})
        row = {
            "Patient Name": patient_name,
            "Member #": detail.get("Member #", ""),
            "ICN": icn,
        }
        for field in detail_fields:
            row[field] = detail.get(field, "")
        row["Source File"] = filename
        rows.append(row)
    return {"header": header, "rows": rows}