### What problem does this PR solve?

#613

### Type of change

- [x] Other (please describe):

tags/v0.5.0
| @@ -19,7 +19,7 @@ import os | |||
| import re | |||
| from io import BytesIO | |||
| import fitz | |||
| import pdfplumber | |||
| from PIL import Image | |||
| from cachetools import LRUCache, cached | |||
| from ruamel.yaml import YAML | |||
| @@ -172,11 +172,9 @@ def filename_type(filename): | |||
| def thumbnail(filename, blob): | |||
| filename = filename.lower() | |||
| if re.match(r".*\.pdf$", filename): | |||
| pdf = fitz.open(stream=blob, filetype="pdf") | |||
| pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03)) | |||
| pdf = pdfplumber.open(BytesIO(blob)) | |||
| buffered = BytesIO() | |||
| Image.frombytes("RGB", [pix.width, pix.height], | |||
| pix.samples).save(buffered, format="png") | |||
| pdf.pages[0].to_image().annotated.save(buffered, format="png") | |||
| return "data:image/png;base64," + \ | |||
| base64.b64encode(buffered.getvalue()).decode("utf-8") | |||
| @@ -2,7 +2,6 @@ | |||
| import os | |||
| import random | |||
| import fitz | |||
| import xgboost as xgb | |||
| from io import BytesIO | |||
| import torch | |||
| @@ -922,9 +921,7 @@ class RAGFlowPdfParser: | |||
| fnm) if not binary else pdfplumber.open(BytesIO(binary)) | |||
| return len(pdf.pages) | |||
| except Exception as e: | |||
| pdf = fitz.open(fnm) if not binary else fitz.open( | |||
| stream=fnm, filetype="pdf") | |||
| return len(pdf) | |||
| logging.error(str(e)) | |||
| def __images__(self, fnm, zoomin=3, page_from=0, | |||
| page_to=299, callback=None): | |||
| @@ -946,23 +943,7 @@ class RAGFlowPdfParser: | |||
| self.pdf.pages[page_from:page_to]] | |||
| self.total_page = len(self.pdf.pages) | |||
| except Exception as e: | |||
| self.pdf = fitz.open(fnm) if isinstance( | |||
| fnm, str) else fitz.open( | |||
| stream=fnm, filetype="pdf") | |||
| self.page_images = [] | |||
| self.page_chars = [] | |||
| mat = fitz.Matrix(zoomin, zoomin) | |||
| self.total_page = len(self.pdf) | |||
| for i, page in enumerate(self.pdf): | |||
| if i < page_from: | |||
| continue | |||
| if i >= page_to: | |||
| break | |||
| pix = page.get_pixmap(matrix=mat) | |||
| img = Image.frombytes("RGB", [pix.width, pix.height], | |||
| pix.samples) | |||
| self.page_images.append(img) | |||
| self.page_chars.append([]) | |||
| logging.error(str(e)) | |||
| self.outlines = [] | |||
| try: | |||
| @@ -1,12 +1,13 @@ | |||
| import pdfplumber | |||
| from .ocr import OCR | |||
| from .recognizer import Recognizer | |||
| from .layout_recognizer import LayoutRecognizer | |||
| from .table_structure_recognizer import TableStructureRecognizer | |||
| def init_in_out(args): | |||
| from PIL import Image | |||
| import fitz | |||
| import os | |||
| import traceback | |||
| from api.utils.file_utils import traversal_files | |||
| @@ -18,13 +19,11 @@ def init_in_out(args): | |||
| def pdf_pages(fnm, zoomin=3): | |||
| nonlocal outputs, images | |||
| pdf = fitz.open(fnm) | |||
| mat = fitz.Matrix(zoomin, zoomin) | |||
| for i, page in enumerate(pdf): | |||
| pix = page.get_pixmap(matrix=mat) | |||
| img = Image.frombytes("RGB", [pix.width, pix.height], | |||
| pix.samples) | |||
| images.append(img) | |||
| pdf = pdfplumber.open(fnm) | |||
| images = [p.to_image(resolution=72 * zoomin).annotated for i, p in | |||
| enumerate(pdf.pages)] | |||
| for i, page in enumerate(images): | |||
| outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") | |||
| def images_and_outputs(fnm): | |||
| @@ -35,7 +35,7 @@ class RAGFlowMinio(object): | |||
| self.conn = None | |||
| def put(self, bucket, fnm, binary): | |||
| for _ in range(10): | |||
| for _ in range(3): | |||
| try: | |||
| if not self.conn.bucket_exists(bucket): | |||
| self.conn.make_bucket(bucket) | |||
| @@ -91,8 +91,6 @@ pycryptodomex==3.20.0 | |||
| pydantic==2.6.2 | |||
| pydantic_core==2.16.3 | |||
| PyJWT==2.8.0 | |||
| PyMuPDF==1.23.25 | |||
| PyMuPDFb==1.23.22 | |||
| PyMySQL==1.1.0 | |||
| PyPDF2==3.0.1 | |||
| pypdfium2==4.27.0 | |||