|
|
|
@@ -957,6 +957,8 @@ class RAGFlowPdfParser: |
|
|
|
fnm, str) else pdfplumber.open(BytesIO(fnm)) |
|
|
|
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in |
|
|
|
enumerate(self.pdf.pages[page_from:page_to])] |
|
|
|
self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in |
|
|
|
enumerate(self.pdf.pages[page_from:page_to])] |
|
|
|
self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in |
|
|
|
self.pdf.pages[page_from:page_to]] |
|
|
|
self.total_page = len(self.pdf.pages) |
|
|
|
@@ -992,7 +994,7 @@ class RAGFlowPdfParser: |
|
|
|
self.is_english = False |
|
|
|
|
|
|
|
st = timer() |
|
|
|
for i, img in enumerate(self.page_images): |
|
|
|
for i, img in enumerate(self.page_images_x2): |
|
|
|
chars = self.page_chars[i] if not self.is_english else [] |
|
|
|
self.mean_height.append( |
|
|
|
np.median(sorted([c["height"] for c in chars])) if chars else 0 |
|
|
|
@@ -1000,7 +1002,7 @@ class RAGFlowPdfParser: |
|
|
|
self.mean_width.append( |
|
|
|
np.median(sorted([c["width"] for c in chars])) if chars else 8 |
|
|
|
) |
|
|
|
self.page_cum_height.append(img.size[1] / zoomin) |
|
|
|
self.page_cum_height.append(img.size[1] / zoomin/2) |
|
|
|
j = 0 |
|
|
|
while j + 1 < len(chars): |
|
|
|
if chars[j]["text"] and chars[j + 1]["text"] \ |
|
|
|
@@ -1010,7 +1012,7 @@ class RAGFlowPdfParser: |
|
|
|
chars[j]["text"] += " " |
|
|
|
j += 1 |
|
|
|
|
|
|
|
self.__ocr(i + 1, img, chars, zoomin) |
|
|
|
self.__ocr(i + 1, img, chars, zoomin*2) |
|
|
|
if callback and i % 6 == 5: |
|
|
|
callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="") |
|
|
|
# print("OCR:", timer()-st) |