浏览代码

remove PyMuPDF (#618)

### What problem does this PR solve?
#613 

### Type of change


- [x] Other (please describe):
tags/v0.5.0
KevinHuSh 1年前
父节点
当前提交
cab274f560
没有帐户链接到提交者的电子邮件
共有 5 个文件被更改,包括 13 次插入和 37 次删除
  1. 3
    5
      api/utils/file_utils.py
  2. 2
    21
      deepdoc/parser/pdf_parser.py
  3. 7
    8
      deepdoc/vision/__init__.py
  4. 1
    1
      rag/utils/minio_conn.py
  5. 0
    2
      requirements.txt

+ 3
- 5
api/utils/file_utils.py 查看文件

import re import re
from io import BytesIO from io import BytesIO
import fitz
import pdfplumber
from PIL import Image from PIL import Image
from cachetools import LRUCache, cached from cachetools import LRUCache, cached
from ruamel.yaml import YAML from ruamel.yaml import YAML
def thumbnail(filename, blob): def thumbnail(filename, blob):
filename = filename.lower() filename = filename.lower()
if re.match(r".*\.pdf$", filename): if re.match(r".*\.pdf$", filename):
pdf = fitz.open(stream=blob, filetype="pdf")
pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
pdf = pdfplumber.open(BytesIO(blob))
buffered = BytesIO() buffered = BytesIO()
Image.frombytes("RGB", [pix.width, pix.height],
pix.samples).save(buffered, format="png")
pdf.pages[0].to_image().annotated.save(buffered, format="png")
return "data:image/png;base64," + \ return "data:image/png;base64," + \
base64.b64encode(buffered.getvalue()).decode("utf-8") base64.b64encode(buffered.getvalue()).decode("utf-8")

+ 2
- 21
deepdoc/parser/pdf_parser.py 查看文件

import os import os
import random import random


import fitz
import xgboost as xgb import xgboost as xgb
from io import BytesIO from io import BytesIO
import torch import torch
fnm) if not binary else pdfplumber.open(BytesIO(binary)) fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages) return len(pdf.pages)
except Exception as e: except Exception as e:
pdf = fitz.open(fnm) if not binary else fitz.open(
stream=fnm, filetype="pdf")
return len(pdf)
logging.error(str(e))


def __images__(self, fnm, zoomin=3, page_from=0, def __images__(self, fnm, zoomin=3, page_from=0,
page_to=299, callback=None): page_to=299, callback=None):
self.pdf.pages[page_from:page_to]] self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages) self.total_page = len(self.pdf.pages)
except Exception as e: except Exception as e:
self.pdf = fitz.open(fnm) if isinstance(
fnm, str) else fitz.open(
stream=fnm, filetype="pdf")
self.page_images = []
self.page_chars = []
mat = fitz.Matrix(zoomin, zoomin)
self.total_page = len(self.pdf)
for i, page in enumerate(self.pdf):
if i < page_from:
continue
if i >= page_to:
break
pix = page.get_pixmap(matrix=mat)
img = Image.frombytes("RGB", [pix.width, pix.height],
pix.samples)
self.page_images.append(img)
self.page_chars.append([])
logging.error(str(e))


self.outlines = [] self.outlines = []
try: try:

+ 7
- 8
deepdoc/vision/__init__.py 查看文件

import pdfplumber
from .ocr import OCR from .ocr import OCR
from .recognizer import Recognizer from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer from .layout_recognizer import LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer from .table_structure_recognizer import TableStructureRecognizer
def init_in_out(args): def init_in_out(args):
from PIL import Image from PIL import Image
import fitz
import os import os
import traceback import traceback
from api.utils.file_utils import traversal_files from api.utils.file_utils import traversal_files
def pdf_pages(fnm, zoomin=3): def pdf_pages(fnm, zoomin=3):
nonlocal outputs, images nonlocal outputs, images
pdf = fitz.open(fnm)
mat = fitz.Matrix(zoomin, zoomin)
for i, page in enumerate(pdf):
pix = page.get_pixmap(matrix=mat)
img = Image.frombytes("RGB", [pix.width, pix.height],
pix.samples)
images.append(img)
pdf = pdfplumber.open(fnm)
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
enumerate(pdf.pages)]
for i, page in enumerate(images):
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
def images_and_outputs(fnm): def images_and_outputs(fnm):

+ 1
- 1
rag/utils/minio_conn.py 查看文件

self.conn = None self.conn = None


def put(self, bucket, fnm, binary): def put(self, bucket, fnm, binary):
for _ in range(10):
for _ in range(3):
try: try:
if not self.conn.bucket_exists(bucket): if not self.conn.bucket_exists(bucket):
self.conn.make_bucket(bucket) self.conn.make_bucket(bucket)

+ 0
- 2
requirements.txt 查看文件

pydantic==2.6.2 pydantic==2.6.2
pydantic_core==2.16.3 pydantic_core==2.16.3
PyJWT==2.8.0 PyJWT==2.8.0
PyMuPDF==1.23.25
PyMuPDFb==1.23.22
PyMySQL==1.1.0 PyMySQL==1.1.0
PyPDF2==3.0.1 PyPDF2==3.0.1
pypdfium2==4.27.0 pypdfium2==4.27.0

正在加载...
取消
保存