structured well

This commit is contained in:
2025-08-29 18:17:24 +05:30
parent d89bee4f07
commit e5271d6979
14 changed files with 1 additions and 4498 deletions

View File

@@ -0,0 +1,71 @@
from flask import Flask, request, jsonify
import fitz # PyMuPDF
import re
# Flask application object; the /extract route below is registered on it.
app = Flask(__name__)
# Date written as D/M/YYYY or DD/MM/YYYY (1-2 digits, 1-2 digits, 4 digits),
# not glued to any neighbouring digit on either side.
DOB_RE = re.compile(
    r'(?<!\d)(\d{1,2})/(\d{1,2})/(\d{4})(?!\d)'
)

# Candidate member ID: a line consisting of nothing but 8-14 digits.
ID_RE = re.compile(
    r'^\d{8,14}$'
)

# Lower-case fragments that signal the name/DOB area has ended.
# They are matched as substrings of a lower-cased line.
STOP_WORDS = {
    "eligibility",
    "coverage",
    "age band",
    "date of",
    "service",
    "tooth",
    "number",
    "surface",
    "procedure",
    "code",
    "description",
    "provider",
    "printed on",
    "member id",
    "name",
    "date of birth",
}
def _parse_member_fields(lines):
    """Pull member ID, name and date of birth out of extracted text lines.

    Args:
        lines: Stripped, non-empty text lines in document order.

    Returns:
        A dict with the keys ``memberId``, ``name`` and ``dob``. Every value
        is a string; a field that could not be found is left as ``""``.
    """
    fields = {"memberId": "", "name": "", "dob": ""}

    # 1) The first line made up solely of 8-14 digits is taken as the ID.
    id_idx = next(
        (i for i, line in enumerate(lines) if ID_RE.match(line)),
        -1,
    )
    if id_idx == -1:
        return fields
    fields["memberId"] = lines[id_idx]

    # 2) Gather the lines after the ID until a stop word shows up (that line
    #    is not kept) or a line containing a date has been kept.
    collected = []
    for line in lines[id_idx + 1:]:
        low = line.lower()
        if any(sw in low for sw in STOP_WORDS):
            break
        collected.append(line)
        if DOB_RE.search(line):
            break

    # 3) Flatten the chunk so the same search works whether the date shares
    #    a line with the name or sits on the following line.
    blob = " ".join(collected).strip()
    m = DOB_RE.search(blob)
    if m:
        fields["dob"] = m.group(0)
        # The name is whatever precedes the date in the flattened chunk.
        fields["name"] = blob[:m.start()].strip()
    else:
        # No date found: treat the whole collected chunk as the name.
        fields["name"] = blob
    return fields


@app.route("/extract", methods=["POST"])
def extract():
    """Extract member ID, name and date of birth from an uploaded PDF.

    Reads the uploaded file from the ``pdf`` form field, extracts the plain
    text of every page and parses it with ``_parse_member_fields``.

    Returns:
        A JSON response with the string fields ``memberId``, ``name`` and
        ``dob``; fields that could not be found are empty strings.
    """
    upload = request.files['pdf']
    # Use the document as a context manager so it is closed once the text has
    # been read, instead of staying open after the request finishes.
    with fitz.open(stream=upload.read(), filetype="pdf") as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    return jsonify(_parse_member_fields(lines))
# Script entry point: start the Flask server on port 5001 when this file is
# executed directly.
if __name__ == "__main__":
    app.run(port=5001)

View File

@@ -0,0 +1,8 @@
{
"name": "patientdataextractorservice",
"private": true,
"scripts": {
"postinstall": "pip install -r requirements.txt",
"dev": "python main.py"
}
}

View File

@@ -0,0 +1,2 @@
flask
pymupdf