# DentalManagementMHAprilgg/apps/ProcedureCodeFromMhPdf/compareJson.py

#!/usr/bin/env python3
"""
Compare a main dental JSON file with one or more other JSON files and
return all records whose 'Procedure Code' is NOT present in the main file.

- Matching key: 'Procedure Code' (case-insensitive; all whitespace is removed before comparing).
- Keeps the full record from the other files (including extra fields like 'Full Price').
- Deduplicates by Procedure Code across the collected "missing" results.

CONFIG: set MAIN_PATH, OTHER_PATHS, OUT_PATH below.
"""

import json
from pathlib import Path
from typing import List, Dict, Any

# =========================
# CONFIG — EDIT THESE ONLY
# =========================
# Reference file: every 'Procedure Code' found here counts as "already known".
MAIN_PATH = "procedureCodes_v2.json"  # your main JSON (with PriceLTEQ21/PriceGT21)
# Files whose records are checked against the main file. main() raises
# FileNotFoundError if any listed path does not exist.
OTHER_PATHS = [
    # "procedureCodesOld.json",       # one or more other JSON files to compare against the main
    "output.json",
]
# Destination for the JSON array of records whose code is absent from the main file.
OUT_PATH = "not_in_main.json"  # where to write the results
# =========================


def _load_json_any(path: str) -> List[Dict[str, Any]]:
    """
    Read the UTF-8 JSON document at ``path`` and return its records as a list.

    Accepted top-level shapes:
      - an array: returned with every non-object element dropped
      - a single object: returned as a one-element list

    Raises:
        ValueError: if the top-level value is neither an array nor an object.
    """
    with open(path, "r", encoding="utf-8") as handle:
        payload = json.load(handle)

    if isinstance(payload, list):
        # Be defensive: silently skip anything in the array that is not an object.
        return [entry for entry in payload if isinstance(entry, dict)]
    if isinstance(payload, dict):
        return [payload]
    raise ValueError(f"Unsupported JSON top-level type in {path}: {type(payload)}")


def _norm_code(record: Dict[str, Any]) -> str:
    """
    Return the normalized 'Procedure Code' of ``record`` for matching.

    The value is converted to text, upper-cased, and stripped of every
    whitespace character and of common zero-width characters that PDF
    extraction can leave behind.

    Returns:
        The normalized code, or "" when the key is missing or its value is
        JSON null, so that callers testing for an empty code skip the record.
    """
    raw = record.get("Procedure Code")
    if raw is None:
        # str(None) would yield "None" and be matched as the bogus code "NONE".
        return ""

    # str.isspace() is False for these, so they must be removed explicitly:
    # ZERO WIDTH SPACE, ZWNJ, ZWJ, WORD JOINER, and BOM / ZERO WIDTH NO-BREAK SPACE.
    zero_width = "\u200b\u200c\u200d\u2060\ufeff"

    return "".join(
        ch
        for ch in str(raw).upper()
        if not ch.isspace() and ch not in zero_width
    )


def collect_main_codes(main_path: str) -> set:
    """
    Build the set of normalized procedure codes present in the main file.

    Records whose normalized code is empty are left out of the set.
    """
    known = set()
    for record in _load_json_any(main_path):
        normalized = _norm_code(record)
        if normalized:
            known.add(normalized)
    return known


def collect_missing_records(other_paths: List[str], main_codes: set) -> List[Dict[str, Any]]:
    """
    Gather records from ``other_paths`` whose code is not in ``main_codes``.

    Files are read in the order given. Records with an empty normalized code
    are ignored. When the same code appears more than once, only the first
    record encountered is kept, unmodified.

    Returns:
        The kept records, ordered by their normalized code.
    """
    first_seen: Dict[str, Dict[str, Any]] = {}
    for path in other_paths:
        for record in _load_json_any(path):
            key = _norm_code(record)
            if not key or key in main_codes:
                continue
            # setdefault leaves an earlier record for this code untouched.
            first_seen.setdefault(key, record)

    # Sort by code so the output order is stable across runs.
    ordered_codes = sorted(first_seen)
    return [first_seen[code] for code in ordered_codes]


def main():
    """
    Run the comparison configured by MAIN_PATH, OTHER_PATHS and OUT_PATH.

    Writes the records missing from the main file to OUT_PATH as indented
    JSON, then prints a short summary followed by the same JSON.

    Raises:
        FileNotFoundError: if the main file or any of the other files
            does not exist.
    """
    # Fail early with a clear message before doing any work.
    if not Path(MAIN_PATH).exists():
        raise FileNotFoundError(f"Main file not found: {MAIN_PATH}")
    absent = next((p for p in OTHER_PATHS if not Path(p).exists()), None)
    if absent is not None:
        raise FileNotFoundError(f"Other file not found: {absent}")

    known_codes = collect_main_codes(MAIN_PATH)
    extras = collect_missing_records(OTHER_PATHS, known_codes)

    # Serialize once; the same text goes to the output file and to stdout.
    rendered = json.dumps(extras, ensure_ascii=False, indent=2)
    with open(OUT_PATH, "w", encoding="utf-8") as out_file:
        out_file.write(rendered)

    print(f"Main codes: {len(known_codes)}")
    print(f"Missing from main: {len(extras)}")
    print(f"Wrote results to {OUT_PATH}")
    # Also echo to stdout
    print(rendered)


if __name__ == "__main__":
    main()