Browse Source

remove PyMuPDF (#618)

### What problem does this PR solve?
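Addresses #613: removes the PyMuPDF (`fitz`) dependency. PDF thumbnails and page images are now rendered with pdfplumber, the PyMuPDF fallback paths in the PDF parser are replaced by error logging, and the `PyMuPDF`/`PyMuPDFb` pins are dropped from `requirements.txt`. The diff also changes the retry count in `RAGFlowMinio.put` (apparently from 10 to 3).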

### Type of change


- [x] Other (please describe): dependency removal (PyMuPDF replaced by pdfplumber)
tags/v0.5.0
KevinHuSh 1 year ago
parent
commit
cab274f560
No account linked to committer's email address
5 changed files with 13 additions and 37 deletions
  1. 3
    5
      api/utils/file_utils.py
  2. 2
    21
      deepdoc/parser/pdf_parser.py
  3. 7
    8
      deepdoc/vision/__init__.py
  4. 1
    1
      rag/utils/minio_conn.py
  5. 0
    2
      requirements.txt

+ 3
- 5
api/utils/file_utils.py View File

@@ -19,7 +19,7 @@ import os
import re
from io import BytesIO
import fitz
import pdfplumber
from PIL import Image
from cachetools import LRUCache, cached
from ruamel.yaml import YAML
@@ -172,11 +172,9 @@ def filename_type(filename):
def thumbnail(filename, blob):
filename = filename.lower()
if re.match(r".*\.pdf$", filename):
pdf = fitz.open(stream=blob, filetype="pdf")
pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
pdf = pdfplumber.open(BytesIO(blob))
buffered = BytesIO()
Image.frombytes("RGB", [pix.width, pix.height],
pix.samples).save(buffered, format="png")
pdf.pages[0].to_image().annotated.save(buffered, format="png")
return "data:image/png;base64," + \
base64.b64encode(buffered.getvalue()).decode("utf-8")

+ 2
- 21
deepdoc/parser/pdf_parser.py View File

@@ -2,7 +2,6 @@
import os
import random

import fitz
import xgboost as xgb
from io import BytesIO
import torch
@@ -922,9 +921,7 @@ class RAGFlowPdfParser:
fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages)
except Exception as e:
pdf = fitz.open(fnm) if not binary else fitz.open(
stream=fnm, filetype="pdf")
return len(pdf)
logging.error(str(e))

def __images__(self, fnm, zoomin=3, page_from=0,
page_to=299, callback=None):
@@ -946,23 +943,7 @@ class RAGFlowPdfParser:
self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages)
except Exception as e:
self.pdf = fitz.open(fnm) if isinstance(
fnm, str) else fitz.open(
stream=fnm, filetype="pdf")
self.page_images = []
self.page_chars = []
mat = fitz.Matrix(zoomin, zoomin)
self.total_page = len(self.pdf)
for i, page in enumerate(self.pdf):
if i < page_from:
continue
if i >= page_to:
break
pix = page.get_pixmap(matrix=mat)
img = Image.frombytes("RGB", [pix.width, pix.height],
pix.samples)
self.page_images.append(img)
self.page_chars.append([])
logging.error(str(e))

self.outlines = []
try:

+ 7
- 8
deepdoc/vision/__init__.py View File

@@ -1,12 +1,13 @@
import pdfplumber
from .ocr import OCR
from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer
def init_in_out(args):
from PIL import Image
import fitz
import os
import traceback
from api.utils.file_utils import traversal_files
@@ -18,13 +19,11 @@ def init_in_out(args):
def pdf_pages(fnm, zoomin=3):
nonlocal outputs, images
pdf = fitz.open(fnm)
mat = fitz.Matrix(zoomin, zoomin)
for i, page in enumerate(pdf):
pix = page.get_pixmap(matrix=mat)
img = Image.frombytes("RGB", [pix.width, pix.height],
pix.samples)
images.append(img)
pdf = pdfplumber.open(fnm)
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
enumerate(pdf.pages)]
for i, page in enumerate(images):
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
def images_and_outputs(fnm):

+ 1
- 1
rag/utils/minio_conn.py View File

@@ -35,7 +35,7 @@ class RAGFlowMinio(object):
self.conn = None

def put(self, bucket, fnm, binary):
for _ in range(10):
for _ in range(3):
try:
if not self.conn.bucket_exists(bucket):
self.conn.make_bucket(bucket)

+ 0
- 2
requirements.txt View File

@@ -91,8 +91,6 @@ pycryptodomex==3.20.0
pydantic==2.6.2
pydantic_core==2.16.3
PyJWT==2.8.0
PyMuPDF==1.23.25
PyMuPDFb==1.23.22
PyMySQL==1.1.0
PyPDF2==3.0.1
pypdfium2==4.27.0

Loading…
Cancel
Save