| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286 |
- #
- # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- #
- import base64
- import json
- import os
- import re
- import shutil
- import subprocess
- import sys
- import tempfile
- import threading
- from io import BytesIO
-
- import pdfplumber
- from cachetools import LRUCache, cached
- from PIL import Image
- from ruamel.yaml import YAML
-
- from api.constants import IMG_BASE64_PREFIX
- from api.db import FileType
-
# Base directories may be pinned through the environment; when unset they stay
# None and are resolved lazily by the getter functions of this module.
PROJECT_BASE = os.environ.get("RAG_PROJECT_BASE") or os.environ.get("RAG_DEPLOY_BASE")
RAG_BASE = os.environ.get("RAG_BASE")

# A single lock guarding pdfplumber usage is registered in ``sys.modules``
# under a string key; an already registered lock is kept as is
# (presumably so that every import path of this module shares one lock).
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
sys.modules.setdefault(LOCK_KEY_pdfplumber, threading.Lock())
-
-
def get_project_base_directory(*args):
    """Return the project base directory, optionally joined with *args*.

    The base comes from the module-level ``PROJECT_BASE``. When that is
    ``None`` it is computed as the directory two levels above the directory
    holding this file (symlinks resolved) and stored back into the global.

    Args:
        *args: Optional path components appended to the base directory.

    Returns:
        The base directory, or the base joined with ``args`` when given.
    """
    global PROJECT_BASE
    if PROJECT_BASE is None:
        here = os.path.dirname(os.path.realpath(__file__))
        PROJECT_BASE = os.path.abspath(os.path.join(here, os.pardir, os.pardir))

    return os.path.join(PROJECT_BASE, *args) if args else PROJECT_BASE
-
-
def get_rag_directory(*args):
    """Return the RAG base directory, optionally joined with *args*.

    The base comes from the module-level ``RAG_BASE``. When that is ``None``
    it is computed as the directory three levels above the directory holding
    this file (symlinks resolved) and stored back into the global.

    Args:
        *args: Optional path components appended to the base directory.

    Returns:
        The base directory, or the base joined with ``args`` when given.
    """
    global RAG_BASE
    if RAG_BASE is None:
        module_dir = os.path.dirname(os.path.realpath(__file__))
        three_up = os.path.join(module_dir, os.pardir, os.pardir, os.pardir)
        RAG_BASE = os.path.abspath(three_up)

    if not args:
        return RAG_BASE
    return os.path.join(RAG_BASE, *args)
-
-
def get_rag_python_directory(*args):
    """Return the ``python`` directory under the RAG base, joined with *args*."""
    components = ("python",) + args
    return get_rag_directory(*components)
-
-
def get_home_cache_dir():
    """Return the path of the ``.ragflow`` directory in the user's home.

    The directory is created when missing. Creation is best effort: any
    ``OSError`` (already exists, permission denied, ...) is ignored and the
    path is returned regardless.

    Returns:
        The path ``~/.ragflow`` with ``~`` expanded.
    """
    # Named ``cache_dir`` rather than ``dir`` to avoid shadowing the builtin.
    cache_dir = os.path.join(os.path.expanduser("~"), ".ragflow")
    try:
        os.mkdir(cache_dir)
    except OSError:
        # Deliberately ignored: the usual case is that the directory exists.
        pass
    return cache_dir
-
-
@cached(cache=LRUCache(maxsize=10))
def load_json_conf(conf_path):
    """Load a JSON configuration file, memoising the result per path.

    Results are kept in an LRU cache of up to 10 entries keyed by
    ``conf_path``, so a later change to the file is not observed while the
    entry is cached, and callers receive the same cached object.

    Args:
        conf_path: Absolute path, or a path relative to the project base
            directory.

    Returns:
        The parsed JSON content.

    Raises:
        EnvironmentError: If the file cannot be opened or parsed; the
            underlying error is attached as ``__cause__``.
    """
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(json_conf_path, encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        # ``Exception`` (not ``BaseException``) so KeyboardInterrupt and
        # SystemExit are not converted into a configuration error.
        raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path)) from e
-
-
def dump_json_conf(config_data, conf_path):
    """Write ``config_data`` as indented JSON to a configuration file.

    Args:
        config_data: JSON-serialisable object to write.
        conf_path: Absolute path, or a path relative to the project base
            directory.

    Raises:
        EnvironmentError: If the file cannot be written or the data cannot be
            serialised; the underlying error is attached as ``__cause__``.
    """
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(json_conf_path, "w") as f:
            json.dump(config_data, f, indent=4)
    except Exception as e:
        # The message now names the operation that actually failed (the
        # original said "loading"), and interpreter-exit signals such as
        # KeyboardInterrupt are no longer swallowed.
        raise EnvironmentError("dumping json file config to '{}' failed!".format(json_conf_path)) from e
-
-
def load_json_conf_real_time(conf_path):
    """Load a JSON configuration file from disk on every call (no caching).

    Args:
        conf_path: Absolute path, or a path relative to the project base
            directory.

    Returns:
        The parsed JSON content.

    Raises:
        EnvironmentError: If the file cannot be opened or parsed; the
            underlying error is attached as ``__cause__``.
    """
    if os.path.isabs(conf_path):
        json_conf_path = conf_path
    else:
        json_conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        # JSON text is UTF-8; do not depend on the locale's default encoding.
        with open(json_conf_path, encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        # ``Exception`` (not ``BaseException``) so KeyboardInterrupt and
        # SystemExit are not converted into a configuration error.
        raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path)) from e
-
-
def load_yaml_conf(conf_path):
    """Load a YAML configuration file with the safe, pure-Python loader.

    Args:
        conf_path: Absolute path, or a path relative to the project base
            directory.

    Returns:
        The parsed YAML content.

    Raises:
        EnvironmentError: If the file cannot be opened or parsed; the
            underlying error is attached as ``__cause__``.
    """
    if not os.path.isabs(conf_path):
        conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(conf_path) as f:
            yaml = YAML(typ="safe", pure=True)
            return yaml.load(f)
    except Exception as e:
        # A single formatted message: passing (message, e) as two arguments
        # makes OSError treat them as (errno, strerror) and garbles str(err).
        raise EnvironmentError("loading yaml file config from {} failed: {}".format(conf_path, e)) from e
-
-
def rewrite_yaml_conf(conf_path, config):
    """Overwrite a YAML configuration file with ``config``.

    Args:
        conf_path: Absolute path, or a path relative to the project base
            directory.
        config: Object to serialise with the safe YAML dumper.

    Raises:
        EnvironmentError: If the file cannot be written or the data cannot be
            serialised; the underlying error is attached as ``__cause__``.
    """
    if not os.path.isabs(conf_path):
        conf_path = os.path.join(get_project_base_directory(), conf_path)
    try:
        with open(conf_path, "w") as f:
            yaml = YAML(typ="safe")
            yaml.dump(config, f)
    except Exception as e:
        # A single formatted message: passing (message, e) as two arguments
        # makes OSError treat them as (errno, strerror) and garbles str(err).
        raise EnvironmentError("rewrite yaml file config {} failed: {}".format(conf_path, e)) from e
-
-
def rewrite_json_file(filepath, json_data):
    """Overwrite ``filepath`` with ``json_data`` as 4-space-indented JSON.

    Args:
        filepath: Path of the file to (re)write; used as given.
        json_data: JSON-serialisable object to write.
    """
    # The ``with`` block closes the file; no explicit close() is needed.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(json_data, f, indent=4, separators=(",", ": "))
-
-
def filename_type(filename):
    """Classify a file by its extension.

    The name is lower-cased before matching, so the check is
    case-insensitive and every extension in the patterns must be lower case.

    Args:
        filename: File name (or path) whose extension is inspected.

    Returns:
        The ``value`` of the matching ``FileType`` member: ``PDF``, ``DOC``,
        ``AURAL`` or ``VISUAL``, and ``OTHER`` when no pattern matches.
    """
    filename = filename.lower()
    if re.match(r".*\.pdf$", filename):
        return FileType.PDF.value

    if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
        return FileType.DOC.value

    if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
        return FileType.AURAL.value

    # "wmf" is spelled in lower case: the upper-case "WMF" of the original
    # pattern could never match the lower-cased file name.
    if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|wmf|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
        return FileType.VISUAL.value

    return FileType.OTHER.value
-
-
def thumbnail_img(filename, blob):
    """Render a small PNG preview of a PDF, image or PowerPoint file.

    The kind of file is decided from the (lower-cased) extension of
    ``filename``. For PDF and PowerPoint the first page/slide is rendered and
    re-rendered at half the resolution/scale (at most 10 attempts) while the
    PNG is 64000 bytes or larger. Images are shrunk to fit within 30x30
    pixels.

    The size cap stems from the original note "MySQL LongText max length is
    65535". NOTE(review): 65535 bytes is the limit of a MySQL ``TEXT``
    column, not ``LONGTEXT``, and the cap is applied to the raw PNG before
    any base64 encoding -- confirm against the storage schema.

    Args:
        filename: Name of the file; only its extension is used.
        blob: Raw file content as bytes.

    Returns:
        PNG bytes, or ``None`` when the extension is not supported or the
        PowerPoint rendering fails.

    Raises:
        Exception: Errors from the PDF and image branches propagate to the
            caller; only the PowerPoint branch is best effort.
    """
    filename = filename.lower()
    if re.match(r".*\.pdf$", filename):
        # All pdfplumber work happens under the shared lock, and the context
        # manager closes the document even when rendering raises.
        with sys.modules[LOCK_KEY_pdfplumber]:
            with pdfplumber.open(BytesIO(blob)) as pdf:
                buffered = BytesIO()
                resolution = 32
                img = None
                for _ in range(10):
                    # https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
                    pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
                    img = buffered.getvalue()
                    if len(img) >= 64000 and resolution >= 2:
                        # Too large: retry at half the resolution in a fresh buffer.
                        resolution = resolution / 2
                        buffered = BytesIO()
                    else:
                        break
        return img

    elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
        # Close the decoded image once the thumbnail has been written out.
        with Image.open(BytesIO(blob)) as image:
            image.thumbnail((30, 30))
            buffered = BytesIO()
            image.save(buffered, format="png")
        return buffered.getvalue()

    elif re.match(r".*\.(ppt|pptx)$", filename):
        import aspose.pydrawing as drawing
        import aspose.slides as slides

        try:
            with slides.Presentation(BytesIO(blob)) as presentation:
                buffered = BytesIO()
                scale = 0.03
                img = None
                for _ in range(10):
                    # https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
                    presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
                    img = buffered.getvalue()
                    if len(img) >= 64000:
                        # Too large: retry at half the scale in a fresh buffer.
                        scale = scale / 2.0
                        buffered = BytesIO()
                    else:
                        break
                return img
        except Exception:
            # Best effort: a presentation that cannot be rendered simply has
            # no thumbnail (falls through to ``return None``).
            pass
    return None
-
-
def thumbnail(filename, blob):
    """Return the thumbnail of a file as a prefixed base64 string.

    Args:
        filename: Name of the file; passed on to ``thumbnail_img``.
        blob: Raw file content as bytes.

    Returns:
        ``IMG_BASE64_PREFIX`` followed by the base64-encoded image, or an
        empty string when ``thumbnail_img`` produced no image.
    """
    raw = thumbnail_img(filename, blob)
    if raw is None:
        return ""
    encoded = base64.b64encode(raw).decode("utf-8")
    return IMG_BASE64_PREFIX + encoded
-
-
def traversal_files(base):
    """Yield the path of every file found under ``base``, recursively.

    Args:
        base: Directory at which the walk starts.

    Yields:
        Each file's path, built by joining the walked directory with the
        file name (directories themselves are not yielded).
    """
    for parent, _subdirs, names in os.walk(base):
        yield from (os.path.join(parent, name) for name in names)
-
-
def repair_pdf_with_ghostscript(input_bytes):
    """Try to repair a PDF by rewriting it with Ghostscript (``gs``).

    The bytes are written to a temporary file, rewritten by ``gs`` with the
    ``pdfwrite`` device into a second temporary file, and read back. The
    call is best effort: the input is returned unchanged whenever the repair
    cannot be performed.

    Args:
        input_bytes: Raw content of the (possibly broken) PDF.

    Returns:
        The rewritten PDF bytes, or ``input_bytes`` unchanged when ``gs`` is
        not on ``PATH``, cannot be run, exits with a non-zero status, or
        produces an empty file.
    """
    if shutil.which("gs") is None:
        return input_bytes

    with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_in, tempfile.NamedTemporaryFile(suffix=".pdf") as temp_out:
        temp_in.write(input_bytes)
        # Flush so that gs, which opens the file by name, sees all the data.
        temp_in.flush()

        cmd = [
            "gs",
            "-o",
            temp_out.name,
            "-sDEVICE=pdfwrite",
            "-dPDFSETTINGS=/prepress",
            temp_in.name,
        ]
        try:
            # The output of gs is never used, so discard it instead of
            # capturing and decoding it: a decoding error on that output
            # used to discard an otherwise successful repair.
            proc = subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        except Exception:
            return input_bytes
        if proc.returncode != 0:
            return input_bytes

        temp_out.seek(0)
        repaired_bytes = temp_out.read()

    # An empty result is not a usable PDF; keep the original bytes instead.
    return repaired_bytes or input_bytes
-
-
def read_potential_broken_pdf(blob):
    """Return PDF bytes that pdfplumber can open, repairing them if needed.

    Args:
        blob: Raw content of the PDF.

    Returns:
        ``blob`` when pdfplumber opens it and finds at least one page;
        otherwise the output of ``repair_pdf_with_ghostscript`` when that
        opens with at least one page; otherwise ``blob`` unchanged.
    """

    def _has_pages(data):
        # Any failure while opening or inspecting the document counts as
        # "not readable".
        try:
            with pdfplumber.open(BytesIO(data)) as doc:
                return bool(doc.pages)
        except Exception:
            return False

    if _has_pages(blob):
        return blob

    candidate = repair_pdf_with_ghostscript(blob)
    return candidate if _has_pages(candidate) else blob
|