Browse Source

fix pdf_parser char content confusion (#1462)

### What problem does this PR solve?

#1407 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.9.0
H 1 year ago
parent
commit
2290c2a2f0
No account linked to committer's email address
1 changed files with 4 additions and 3 deletions
  1. 4
    3
      deepdoc/parser/pdf_parser.py

+ 4
- 3
deepdoc/parser/pdf_parser.py View File

@@ -287,14 +287,15 @@ class RAGFlowPdfParser:
)

# solve char content confusion
record_error_length = 0
record_error_length, ct = 0, 1
for c in chars[0:128]:
ii = Recognizer.find_overlapped(c, bxs)
if ii is None:
continue
record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["bottom"] - c["bottom"] - c["top"]) / 2)
record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["top"] - c["bottom"] - c["top"]) / 2)
ct += 1

record_error_length = record_error_length / 128
record_error_length = record_error_length / ct
for char in chars:
char["top"] -= record_error_length
char["bottom"] -= record_error_length

Loading…
Cancel
Save