# DentalManagementMHAprilgg/apps/ProcedureCodeFromMhPdf/compareJson_matchingPrice.py

#!/usr/bin/env python3
"""
Compare prices between two JSON files (file1 vs file2) — CONFIG-DRIVEN version.

Behavior:
- Loads two JSON arrays of records (file1 and file2).
- Indexes by procedure code (tries common keys like "Procedure Code", "Code", etc).
- Normalizes money tokens: removes $ and commas, treats "NC" as literal.
- Compares all three price fields:
    - Price
    - PriceLTEQ21
    - PriceGT21
  Matching rules:
    - If both records have the same named field, compare them.
    - If file1 has only a single "Price" and file2 has PriceLTEQ21 / PriceGT21,
      the script will compare file1.Price to BOTH PriceLTEQ21 and PriceGT21 (and
      report mismatch if file1.Price differs from either).
    - "NC" only equals "NC".
    - Numeric tokens compared numerically within tolerance (default 0.005).
- Produces output JSON (configured below) listing:
    - mismatches: detailed entries for codes that differ
    - only_in_file1: codes found only in file1
    - only_in_file2: codes found only in file2
    - summary

Edit the CONFIG block below, then run the script.
"""

import json
import re
from typing import List, Dict, Any, Optional

# ---------------------------------------------------------------------------
# CONFIG: these module-level constants are the only values meant to be edited.
# ---------------------------------------------------------------------------

# Input file 1: the base/reference price list.
FILE1_PATH = "procedureCodes_v2.json"

# Input file 2: the price list that is checked against file 1.
FILE2_PATH = "output.json"

# Destination of the JSON report (summary, unmatched codes, mismatches).
OUT_PATH = "price_diffs.json"

# Largest absolute difference at which two numeric prices still count as equal.
TOLERANCE = 0.005

# Record keys probed, in this order, to find a record's procedure code.
CODE_KEY_CANDIDATES = (
    "Procedure Code",
    "Code",
    "procedure_code",
    "procedure code",
)

# When True, a file-1 record that has "Price" but neither "PriceLTEQ21" nor
# "PriceGT21" is also compared against each of those two fields present in the
# file-2 record, and every difference is reported as a mismatch.
COMPARE_SINGLE_PRICE_AGAINST_BOTH = True
# ---------------------------------------------------------------------------

# Matches either the literal "NC" (any letter case) or a money amount made of
# an optional "$", a run of digits/commas, and an optional 1-2 digit decimal
# part. Surrounding whitespace is ignored. Note that the digit/comma run may
# consist of commas only; normalize_money_token() guards against that case.
_money_re = re.compile(r"^\s*(NC|\$?\s*[\d,]+(?:\.\d{1,2})?)\s*$", re.IGNORECASE)


def normalize_money_token(token: Optional[str]) -> Optional[str]:
    """Normalize a raw price value to a canonical comparison string.

    The value is converted with ``str()`` and trimmed first, so numeric JSON
    values are accepted as well as strings.

    Args:
        token: Raw field value, or None when the field is absent.

    Returns:
        - None when ``token`` is None or blank.
        - "NC" when the token is "NC" in any letter case.
        - For a recognized money amount, the number with "$" and "," removed
          and trailing decimal zeros dropped
          (``"$1,234.50"`` -> ``"1234.5"``, ``"100.00"`` -> ``"100"``).
        - Otherwise the trimmed token unchanged, so that an unexpected format
          stays visible in the mismatch report.
    """
    if token is None:
        return None
    text = str(token).strip()
    if not text:
        return None

    match = _money_re.match(text)
    if match is None:
        # Unknown format: hand back the trimmed token so a mismatch is visible.
        return text

    value = match.group(1)
    if value.upper() == "NC":
        return "NC"

    value = value.replace("$", "").replace(",", "").strip()
    # Drop trailing decimal zeros only; integers such as "1000" stay intact.
    if "." in value:
        value = value.rstrip("0").rstrip(".")

    # Tokens such as "," or "$," satisfy the pattern but contain no digits and
    # collapse to "". Returning "" would look like a present-but-empty price,
    # so treat them like any other unknown format instead.
    return value or text


def numeric_compare(a: Optional[str], b: Optional[str], tol: float = TOLERANCE) -> bool:
    """Return True when two normalized price tokens should count as equal.

    Args:
        a: First normalized token, or None when the price is missing.
        b: Second normalized token, or None when the price is missing.
        tol: Largest absolute numeric difference still treated as equal.

    Returns:
        - False if either token is None (a missing price never matches).
        - True if the tokens are identical strings.
        - If either token is "NC" (any case), True only when both are "NC".
        - Otherwise True when both parse as floats and differ by at most
          ``tol``; False when either token is not numeric.
    """
    if a is None or b is None:
        return False
    if a == b:
        return True
    if a.upper() == "NC" or b.upper() == "NC":
        return a.upper() == b.upper()
    try:
        return abs(float(a) - float(b)) <= tol
    except ValueError:
        # At least one token is not a number. Identical tokens were already
        # accepted above, so two different non-numeric tokens cannot match.
        return False


def load_json(path: str) -> List[Dict[str, Any]]:
    """Read the UTF-8 JSON document at ``path`` and return its top-level array.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The parsed top-level list.

    Raises:
        ValueError: If the document's top-level value is not a JSON array.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, list):
        return payload
    raise ValueError(f"Expected JSON array in {path}")


def _record_code(rec: Dict[str, Any]) -> Optional[str]:
    """Return the procedure code of ``rec``, or None when none can be found."""
    # Preferred: the first candidate key holding a non-blank value.
    for key in CODE_KEY_CANDIDATES:
        value = rec.get(key)
        if value:
            code = str(value).strip()
            if code:
                return code
            # Whitespace-only value: keep looking at the remaining candidates.
    # Fallback: any string field whose whole value looks like "Dxxxx".
    for value in rec.values():
        if isinstance(value, str) and re.match(r"^\s*D\d{4}\s*$", value):
            return value.strip()
    return None


def build_index(records: List[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
    """Index records by procedure code.

    The code is taken from the first key in ``CODE_KEY_CANDIDATES`` that holds
    a non-blank value; failing that, from any string field whose value is "D"
    followed by four digits.

    Args:
        records: Parsed JSON array elements.

    Returns:
        Mapping of stripped procedure code to its record. When a code occurs
        more than once, the first record wins. Elements that are not JSON
        objects, and records without a discoverable code, are skipped.
    """
    idx: Dict[str, Dict[str, Any]] = {}
    for rec in records:
        # The loader only guarantees a top-level array; tolerate stray
        # non-object elements instead of failing on key lookups.
        if not isinstance(rec, dict):
            continue
        code = _record_code(rec)
        if not code:
            continue
        # setdefault keeps the first occurrence for duplicate codes.
        idx.setdefault(code, rec)
    return idx


def extract_price_fields(rec: Dict[str, Any]) -> Dict[str, Optional[str]]:
    """Collect the three price fields of ``rec`` in normalized form.

    The result always has the keys 'Price', 'PriceLTEQ21' and 'PriceGT21';
    a field that is absent from ``rec`` is passed to the normalizer as None.
    """
    return {
        field: normalize_money_token(rec.get(field))
        for field in ("Price", "PriceLTEQ21", "PriceGT21")
    }


def compare_code_records(code: str, rec1: Dict[str, Any], rec2: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Compare the price fields of one procedure code across both files.

    Rules:
      - Same-named fields present in both records are compared with
        ``numeric_compare``; a difference is a "value_mismatch".
      - A field present in only one record is a "missing_in_one" mismatch,
        except when the single-price rule below applies.
      - Single-price rule (only if ``COMPARE_SINGLE_PRICE_AGAINST_BOTH``):
        when file 1 has "Price" but neither "PriceLTEQ21" nor "PriceGT21",
        and file 2 has at least one of those two, file 1's "Price" is compared
        against each of them that is present; a difference is a
        "single_price_vs_special_mismatch". The differing field layout is
        expected in this case and is not reported as "missing_in_one".

    Args:
        code: Procedure code both records belong to.
        rec1: Record from file 1.
        rec2: Record from file 2.

    Returns:
        None when nothing differs; otherwise a dict with the code, both
        descriptions, both normalized price sets and the list of mismatches.
    """
    p1 = extract_price_fields(rec1)
    p2 = extract_price_fields(rec2)
    special_keys = ("PriceLTEQ21", "PriceGT21")

    single_price_mode = (
        COMPARE_SINGLE_PRICE_AGAINST_BOTH
        and p1["Price"] is not None
        and all(p1[key] is None for key in special_keys)
        and any(p2[key] is not None for key in special_keys)
    )

    mismatches = []

    # 1) Same-named fields.
    for key in ("Price",) + special_keys:
        a = p1[key]
        b = p2[key]
        if a is None and b is None:
            continue
        if a is None or b is None:
            if single_price_mode:
                # File 1's lone "Price" stands in for file 2's split fields,
                # which are cross-checked in step 2. Flagging the layout
                # difference here would mark every such code as mismatched
                # even when all the prices agree.
                continue
            mismatches.append({"field": key, "file1": a, "file2": b, "reason": "missing_in_one"})
            continue
        if not numeric_compare(a, b):
            mismatches.append({"field": key, "file1": a, "file2": b, "reason": "value_mismatch"})

    # 2) File 1's single "Price" against each split price present in file 2.
    if single_price_mode:
        left = p1["Price"]
        for special_key in special_keys:
            right = p2[special_key]
            if right is None:
                continue
            if not numeric_compare(left, right):
                mismatches.append({
                    "field": f"Price_vs_{special_key}",
                    "file1": left,
                    "file2": right,
                    "reason": "single_price_vs_special_mismatch"
                })

    if not mismatches:
        return None
    return {
        "Procedure Code": code,
        "Description_file1": rec1.get("Description"),
        "Description_file2": rec2.get("Description"),
        "file1_prices": p1,
        "file2_prices": p2,
        "mismatches": mismatches
    }


def main():
    """Compare the two configured price files and write the JSON diff report.

    Reads FILE1_PATH and FILE2_PATH, indexes both by procedure code, compares
    every code found in both files, writes the report to OUT_PATH and prints a
    short summary to stdout.
    """
    # Read both inputs before indexing either of them.
    records1 = load_json(FILE1_PATH)
    records2 = load_json(FILE2_PATH)
    index1 = build_index(records1)
    index2 = build_index(records2)

    # Union of codes, processed in sorted order so the report is stable.
    every_code = sorted(index1.keys() | index2.keys())

    only_in_file1: List[str] = [c for c in every_code if c not in index2]
    only_in_file2: List[str] = [c for c in every_code if c not in index1]

    mismatched: List[Dict[str, Any]] = []
    for c in every_code:
        if c not in index1 or c not in index2:
            continue
        difference = compare_code_records(c, index1[c], index2[c])
        if difference:
            mismatched.append(difference)

    summary = {
        "total_codes_found": len(every_code),
        "only_in_file1_count": len(only_in_file1),
        "only_in_file2_count": len(only_in_file2),
        "mismatched_count": len(mismatched),
    }
    report = {
        "summary": summary,
        "only_in_file1": only_in_file1,
        "only_in_file2": only_in_file2,
        "mismatches": mismatched
    }

    with open(OUT_PATH, "w", encoding="utf-8") as sink:
        json.dump(report, sink, ensure_ascii=False, indent=2)

    # Short recap on the console; the details live in the report file.
    print(f"Compared {len(every_code)} procedure codes.")
    print(f"Only in {FILE1_PATH}: {len(only_in_file1)} codes.")
    print(f"Only in {FILE2_PATH}: {len(only_in_file2)} codes.")
    print(f"Mismatched prices: {len(mismatched)} codes.")
    print(f"Wrote detailed diffs to {OUT_PATH}")


# Run the comparison only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()