removed test'
This commit is contained in:
@@ -1,37 +0,0 @@
|
|||||||
import fitz # PyMuPDF
|
|
||||||
import re
|
|
||||||
|
|
||||||
def extract_from_pdf(file_path):
|
|
||||||
doc = fitz.open(file_path)
|
|
||||||
text = "\n".join(page.get_text() for page in doc)
|
|
||||||
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
|
||||||
member_id = ""
|
|
||||||
name = ""
|
|
||||||
dob = ""
|
|
||||||
|
|
||||||
for i, line in enumerate(lines):
|
|
||||||
if line.isdigit() and (len(line) <= 14 or len(line) >= 8):
|
|
||||||
member_id = line
|
|
||||||
name_lines = []
|
|
||||||
j = i + 1
|
|
||||||
while j < len(lines) and not re.match(r"\d{1,2}/\d{1,2}/\d{4}", lines[j]):
|
|
||||||
name_lines.append(lines[j])
|
|
||||||
j += 1
|
|
||||||
name = " ".join(name_lines).strip()
|
|
||||||
|
|
||||||
if j < len(lines):
|
|
||||||
dob = lines[j].strip()
|
|
||||||
break
|
|
||||||
|
|
||||||
return {
|
|
||||||
"memberId": member_id,
|
|
||||||
"name": name,
|
|
||||||
"dob": dob
|
|
||||||
}
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
result = extract_from_pdf("PDF_To_Test/sample1.pdf")
|
|
||||||
print(result)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user