Você não pode selecionar mais de 25 tópicos. Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

file_utils.py 8.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import base64
  17. import json
  18. import os
  19. import re
  20. import shutil
  21. import subprocess
  22. import sys
  23. import tempfile
  24. import threading
  25. from io import BytesIO
  26. import pdfplumber
  27. from cachetools import LRUCache, cached
  28. from PIL import Image
  29. from ruamel.yaml import YAML
  30. from api.constants import IMG_BASE64_PREFIX
  31. from api.db import FileType
  32. PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
  33. RAG_BASE = os.getenv("RAG_BASE")
  34. LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
  35. if LOCK_KEY_pdfplumber not in sys.modules:
  36. sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
  37. def get_project_base_directory(*args):
  38. global PROJECT_BASE
  39. if PROJECT_BASE is None:
  40. PROJECT_BASE = os.path.abspath(
  41. os.path.join(
  42. os.path.dirname(os.path.realpath(__file__)),
  43. os.pardir,
  44. os.pardir,
  45. )
  46. )
  47. if args:
  48. return os.path.join(PROJECT_BASE, *args)
  49. return PROJECT_BASE
  50. def get_rag_directory(*args):
  51. global RAG_BASE
  52. if RAG_BASE is None:
  53. RAG_BASE = os.path.abspath(
  54. os.path.join(
  55. os.path.dirname(os.path.realpath(__file__)),
  56. os.pardir,
  57. os.pardir,
  58. os.pardir,
  59. )
  60. )
  61. if args:
  62. return os.path.join(RAG_BASE, *args)
  63. return RAG_BASE
  64. def get_rag_python_directory(*args):
  65. return get_rag_directory("python", *args)
  66. def get_home_cache_dir():
  67. dir = os.path.join(os.path.expanduser("~"), ".ragflow")
  68. try:
  69. os.mkdir(dir)
  70. except OSError:
  71. pass
  72. return dir
  73. @cached(cache=LRUCache(maxsize=10))
  74. def load_json_conf(conf_path):
  75. if os.path.isabs(conf_path):
  76. json_conf_path = conf_path
  77. else:
  78. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  79. try:
  80. with open(json_conf_path) as f:
  81. return json.load(f)
  82. except BaseException:
  83. raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
  84. def dump_json_conf(config_data, conf_path):
  85. if os.path.isabs(conf_path):
  86. json_conf_path = conf_path
  87. else:
  88. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  89. try:
  90. with open(json_conf_path, "w") as f:
  91. json.dump(config_data, f, indent=4)
  92. except BaseException:
  93. raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
  94. def load_json_conf_real_time(conf_path):
  95. if os.path.isabs(conf_path):
  96. json_conf_path = conf_path
  97. else:
  98. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  99. try:
  100. with open(json_conf_path) as f:
  101. return json.load(f)
  102. except BaseException:
  103. raise EnvironmentError("loading json file config from '{}' failed!".format(json_conf_path))
  104. def load_yaml_conf(conf_path):
  105. if not os.path.isabs(conf_path):
  106. conf_path = os.path.join(get_project_base_directory(), conf_path)
  107. try:
  108. with open(conf_path) as f:
  109. yaml = YAML(typ="safe", pure=True)
  110. return yaml.load(f)
  111. except Exception as e:
  112. raise EnvironmentError("loading yaml file config from {} failed:".format(conf_path), e)
  113. def rewrite_yaml_conf(conf_path, config):
  114. if not os.path.isabs(conf_path):
  115. conf_path = os.path.join(get_project_base_directory(), conf_path)
  116. try:
  117. with open(conf_path, "w") as f:
  118. yaml = YAML(typ="safe")
  119. yaml.dump(config, f)
  120. except Exception as e:
  121. raise EnvironmentError("rewrite yaml file config {} failed:".format(conf_path), e)
  122. def rewrite_json_file(filepath, json_data):
  123. with open(filepath, "w", encoding="utf-8") as f:
  124. json.dump(json_data, f, indent=4, separators=(",", ": "))
  125. f.close()
  126. def filename_type(filename):
  127. filename = filename.lower()
  128. if re.match(r".*\.pdf$", filename):
  129. return FileType.PDF.value
  130. if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
  131. return FileType.DOC.value
  132. if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
  133. return FileType.AURAL.value
  134. if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
  135. return FileType.VISUAL.value
  136. return FileType.OTHER.value
  137. def thumbnail_img(filename, blob):
  138. """
  139. MySQL LongText max length is 65535
  140. """
  141. filename = filename.lower()
  142. if re.match(r".*\.pdf$", filename):
  143. with sys.modules[LOCK_KEY_pdfplumber]:
  144. pdf = pdfplumber.open(BytesIO(blob))
  145. buffered = BytesIO()
  146. resolution = 32
  147. img = None
  148. for _ in range(10):
  149. # https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
  150. pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
  151. img = buffered.getvalue()
  152. if len(img) >= 64000 and resolution >= 2:
  153. resolution = resolution / 2
  154. buffered = BytesIO()
  155. else:
  156. break
  157. pdf.close()
  158. return img
  159. elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
  160. image = Image.open(BytesIO(blob))
  161. image.thumbnail((30, 30))
  162. buffered = BytesIO()
  163. image.save(buffered, format="png")
  164. return buffered.getvalue()
  165. elif re.match(r".*\.(ppt|pptx)$", filename):
  166. import aspose.pydrawing as drawing
  167. import aspose.slides as slides
  168. try:
  169. with slides.Presentation(BytesIO(blob)) as presentation:
  170. buffered = BytesIO()
  171. scale = 0.03
  172. img = None
  173. for _ in range(10):
  174. # https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
  175. presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
  176. img = buffered.getvalue()
  177. if len(img) >= 64000:
  178. scale = scale / 2.0
  179. buffered = BytesIO()
  180. else:
  181. break
  182. return img
  183. except Exception:
  184. pass
  185. return None
  186. def thumbnail(filename, blob):
  187. img = thumbnail_img(filename, blob)
  188. if img is not None:
  189. return IMG_BASE64_PREFIX + base64.b64encode(img).decode("utf-8")
  190. else:
  191. return ""
  192. def traversal_files(base):
  193. for root, ds, fs in os.walk(base):
  194. for f in fs:
  195. fullname = os.path.join(root, f)
  196. yield fullname
  197. def repair_pdf_with_ghostscript(input_bytes):
  198. if shutil.which("gs") is None:
  199. return input_bytes
  200. with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_in, tempfile.NamedTemporaryFile(suffix=".pdf") as temp_out:
  201. temp_in.write(input_bytes)
  202. temp_in.flush()
  203. cmd = [
  204. "gs",
  205. "-o",
  206. temp_out.name,
  207. "-sDEVICE=pdfwrite",
  208. "-dPDFSETTINGS=/prepress",
  209. temp_in.name,
  210. ]
  211. try:
  212. proc = subprocess.run(cmd, capture_output=True, text=True)
  213. if proc.returncode != 0:
  214. return input_bytes
  215. except Exception:
  216. return input_bytes
  217. temp_out.seek(0)
  218. repaired_bytes = temp_out.read()
  219. return repaired_bytes
  220. def read_potential_broken_pdf(blob):
  221. def try_open(blob):
  222. try:
  223. with pdfplumber.open(BytesIO(blob)) as pdf:
  224. if pdf.pages:
  225. return True
  226. except Exception:
  227. return False
  228. return False
  229. if try_open(blob):
  230. return blob
  231. repaired = repair_pdf_with_ghostscript(blob)
  232. if try_open(repaired):
  233. return repaired
  234. return blob