### What problem does this PR solve?

#613

### Type of change

- [x] Other (please describe):

tags/v0.5.0
| import re | import re | ||||
| from io import BytesIO | from io import BytesIO | ||||
| import fitz | |||||
| import pdfplumber | |||||
| from PIL import Image | from PIL import Image | ||||
| from cachetools import LRUCache, cached | from cachetools import LRUCache, cached | ||||
| from ruamel.yaml import YAML | from ruamel.yaml import YAML | ||||
| def thumbnail(filename, blob): | def thumbnail(filename, blob): | ||||
| filename = filename.lower() | filename = filename.lower() | ||||
| if re.match(r".*\.pdf$", filename): | if re.match(r".*\.pdf$", filename): | ||||
| pdf = fitz.open(stream=blob, filetype="pdf") | |||||
| pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03)) | |||||
| pdf = pdfplumber.open(BytesIO(blob)) | |||||
| buffered = BytesIO() | buffered = BytesIO() | ||||
| Image.frombytes("RGB", [pix.width, pix.height], | |||||
| pix.samples).save(buffered, format="png") | |||||
| pdf.pages[0].to_image().annotated.save(buffered, format="png") | |||||
| return "data:image/png;base64," + \ | return "data:image/png;base64," + \ | ||||
| base64.b64encode(buffered.getvalue()).decode("utf-8") | base64.b64encode(buffered.getvalue()).decode("utf-8") | ||||
| import os | import os | ||||
| import random | import random | ||||
| import fitz | |||||
| import xgboost as xgb | import xgboost as xgb | ||||
| from io import BytesIO | from io import BytesIO | ||||
| import torch | import torch | ||||
| fnm) if not binary else pdfplumber.open(BytesIO(binary)) | fnm) if not binary else pdfplumber.open(BytesIO(binary)) | ||||
| return len(pdf.pages) | return len(pdf.pages) | ||||
| except Exception as e: | except Exception as e: | ||||
| pdf = fitz.open(fnm) if not binary else fitz.open( | |||||
| stream=fnm, filetype="pdf") | |||||
| return len(pdf) | |||||
| logging.error(str(e)) | |||||
| def __images__(self, fnm, zoomin=3, page_from=0, | def __images__(self, fnm, zoomin=3, page_from=0, | ||||
| page_to=299, callback=None): | page_to=299, callback=None): | ||||
| self.pdf.pages[page_from:page_to]] | self.pdf.pages[page_from:page_to]] | ||||
| self.total_page = len(self.pdf.pages) | self.total_page = len(self.pdf.pages) | ||||
| except Exception as e: | except Exception as e: | ||||
| self.pdf = fitz.open(fnm) if isinstance( | |||||
| fnm, str) else fitz.open( | |||||
| stream=fnm, filetype="pdf") | |||||
| self.page_images = [] | |||||
| self.page_chars = [] | |||||
| mat = fitz.Matrix(zoomin, zoomin) | |||||
| self.total_page = len(self.pdf) | |||||
| for i, page in enumerate(self.pdf): | |||||
| if i < page_from: | |||||
| continue | |||||
| if i >= page_to: | |||||
| break | |||||
| pix = page.get_pixmap(matrix=mat) | |||||
| img = Image.frombytes("RGB", [pix.width, pix.height], | |||||
| pix.samples) | |||||
| self.page_images.append(img) | |||||
| self.page_chars.append([]) | |||||
| logging.error(str(e)) | |||||
| self.outlines = [] | self.outlines = [] | ||||
| try: | try: |
| import pdfplumber | |||||
| from .ocr import OCR | from .ocr import OCR | ||||
| from .recognizer import Recognizer | from .recognizer import Recognizer | ||||
| from .layout_recognizer import LayoutRecognizer | from .layout_recognizer import LayoutRecognizer | ||||
| from .table_structure_recognizer import TableStructureRecognizer | from .table_structure_recognizer import TableStructureRecognizer | ||||
| def init_in_out(args): | def init_in_out(args): | ||||
| from PIL import Image | from PIL import Image | ||||
| import fitz | |||||
| import os | import os | ||||
| import traceback | import traceback | ||||
| from api.utils.file_utils import traversal_files | from api.utils.file_utils import traversal_files | ||||
| def pdf_pages(fnm, zoomin=3): | def pdf_pages(fnm, zoomin=3): | ||||
| nonlocal outputs, images | nonlocal outputs, images | ||||
| pdf = fitz.open(fnm) | |||||
| mat = fitz.Matrix(zoomin, zoomin) | |||||
| for i, page in enumerate(pdf): | |||||
| pix = page.get_pixmap(matrix=mat) | |||||
| img = Image.frombytes("RGB", [pix.width, pix.height], | |||||
| pix.samples) | |||||
| images.append(img) | |||||
| pdf = pdfplumber.open(fnm) | |||||
| images = [p.to_image(resolution=72 * zoomin).annotated for i, p in | |||||
| enumerate(pdf.pages)] | |||||
| for i, page in enumerate(images): | |||||
| outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") | outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") | ||||
| def images_and_outputs(fnm): | def images_and_outputs(fnm): |
| self.conn = None | self.conn = None | ||||
| def put(self, bucket, fnm, binary): | def put(self, bucket, fnm, binary): | ||||
| for _ in range(10): | |||||
| for _ in range(3): | |||||
| try: | try: | ||||
| if not self.conn.bucket_exists(bucket): | if not self.conn.bucket_exists(bucket): | ||||
| self.conn.make_bucket(bucket) | self.conn.make_bucket(bucket) |
| pydantic==2.6.2 | pydantic==2.6.2 | ||||
| pydantic_core==2.16.3 | pydantic_core==2.16.3 | ||||
| PyJWT==2.8.0 | PyJWT==2.8.0 | ||||
| PyMuPDF==1.23.25 | |||||
| PyMuPDFb==1.23.22 | |||||
| PyMySQL==1.1.0 | PyMySQL==1.1.0 | ||||
| PyPDF2==3.0.1 | PyPDF2==3.0.1 | ||||
| pypdfium2==4.27.0 | pypdfium2==4.27.0 |