Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

file_utils.py 7.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import base64
  17. import json
  18. import os
  19. import re
  20. import sys
  21. import threading
  22. from io import BytesIO
  23. import pdfplumber
  24. from PIL import Image
  25. from cachetools import LRUCache, cached
  26. from ruamel.yaml import YAML
  27. from api.db import FileType
  28. from api.constants import IMG_BASE64_PREFIX
  29. PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
  30. RAG_BASE = os.getenv("RAG_BASE")
  31. LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
  32. if LOCK_KEY_pdfplumber not in sys.modules:
  33. sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
  34. def get_project_base_directory(*args):
  35. global PROJECT_BASE
  36. if PROJECT_BASE is None:
  37. PROJECT_BASE = os.path.abspath(
  38. os.path.join(
  39. os.path.dirname(os.path.realpath(__file__)),
  40. os.pardir,
  41. os.pardir,
  42. )
  43. )
  44. if args:
  45. return os.path.join(PROJECT_BASE, *args)
  46. return PROJECT_BASE
  47. def get_rag_directory(*args):
  48. global RAG_BASE
  49. if RAG_BASE is None:
  50. RAG_BASE = os.path.abspath(
  51. os.path.join(
  52. os.path.dirname(os.path.realpath(__file__)),
  53. os.pardir,
  54. os.pardir,
  55. os.pardir,
  56. )
  57. )
  58. if args:
  59. return os.path.join(RAG_BASE, *args)
  60. return RAG_BASE
  61. def get_rag_python_directory(*args):
  62. return get_rag_directory("python", *args)
  63. def get_home_cache_dir():
  64. dir = os.path.join(os.path.expanduser('~'), ".ragflow")
  65. try:
  66. os.mkdir(dir)
  67. except OSError:
  68. pass
  69. return dir
  70. @cached(cache=LRUCache(maxsize=10))
  71. def load_json_conf(conf_path):
  72. if os.path.isabs(conf_path):
  73. json_conf_path = conf_path
  74. else:
  75. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  76. try:
  77. with open(json_conf_path) as f:
  78. return json.load(f)
  79. except BaseException:
  80. raise EnvironmentError(
  81. "loading json file config from '{}' failed!".format(json_conf_path)
  82. )
  83. def dump_json_conf(config_data, conf_path):
  84. if os.path.isabs(conf_path):
  85. json_conf_path = conf_path
  86. else:
  87. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  88. try:
  89. with open(json_conf_path, "w") as f:
  90. json.dump(config_data, f, indent=4)
  91. except BaseException:
  92. raise EnvironmentError(
  93. "loading json file config from '{}' failed!".format(json_conf_path)
  94. )
  95. def load_json_conf_real_time(conf_path):
  96. if os.path.isabs(conf_path):
  97. json_conf_path = conf_path
  98. else:
  99. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  100. try:
  101. with open(json_conf_path) as f:
  102. return json.load(f)
  103. except BaseException:
  104. raise EnvironmentError(
  105. "loading json file config from '{}' failed!".format(json_conf_path)
  106. )
  107. def load_yaml_conf(conf_path):
  108. if not os.path.isabs(conf_path):
  109. conf_path = os.path.join(get_project_base_directory(), conf_path)
  110. try:
  111. with open(conf_path) as f:
  112. yaml = YAML(typ='safe', pure=True)
  113. return yaml.load(f)
  114. except Exception as e:
  115. raise EnvironmentError(
  116. "loading yaml file config from {} failed:".format(conf_path), e
  117. )
  118. def rewrite_yaml_conf(conf_path, config):
  119. if not os.path.isabs(conf_path):
  120. conf_path = os.path.join(get_project_base_directory(), conf_path)
  121. try:
  122. with open(conf_path, "w") as f:
  123. yaml = YAML(typ="safe")
  124. yaml.dump(config, f)
  125. except Exception as e:
  126. raise EnvironmentError(
  127. "rewrite yaml file config {} failed:".format(conf_path), e
  128. )
  129. def rewrite_json_file(filepath, json_data):
  130. with open(filepath, "w", encoding='utf-8') as f:
  131. json.dump(json_data, f, indent=4, separators=(",", ": "))
  132. f.close()
  133. def filename_type(filename):
  134. filename = filename.lower()
  135. if re.match(r".*\.pdf$", filename):
  136. return FileType.PDF.value
  137. if re.match(
  138. r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
  139. return FileType.DOC.value
  140. if re.match(
  141. r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
  142. return FileType.AURAL.value
  143. if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
  144. return FileType.VISUAL.value
  145. return FileType.OTHER.value
  146. def thumbnail_img(filename, blob):
  147. """
  148. MySQL LongText max length is 65535
  149. """
  150. filename = filename.lower()
  151. if re.match(r".*\.pdf$", filename):
  152. with sys.modules[LOCK_KEY_pdfplumber]:
  153. pdf = pdfplumber.open(BytesIO(blob))
  154. buffered = BytesIO()
  155. resolution = 32
  156. img = None
  157. for _ in range(10):
  158. # https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
  159. pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
  160. img = buffered.getvalue()
  161. if len(img) >= 64000 and resolution >= 2:
  162. resolution = resolution / 2
  163. buffered = BytesIO()
  164. else:
  165. break
  166. pdf.close()
  167. return img
  168. elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
  169. image = Image.open(BytesIO(blob))
  170. image.thumbnail((30, 30))
  171. buffered = BytesIO()
  172. image.save(buffered, format="png")
  173. return buffered.getvalue()
  174. elif re.match(r".*\.(ppt|pptx)$", filename):
  175. import aspose.slides as slides
  176. import aspose.pydrawing as drawing
  177. try:
  178. with slides.Presentation(BytesIO(blob)) as presentation:
  179. buffered = BytesIO()
  180. scale = 0.03
  181. img = None
  182. for _ in range(10):
  183. # https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
  184. presentation.slides[0].get_thumbnail(scale, scale).save(
  185. buffered, drawing.imaging.ImageFormat.png)
  186. img = buffered.getvalue()
  187. if len(img) >= 64000:
  188. scale = scale / 2.0
  189. buffered = BytesIO()
  190. else:
  191. break
  192. return img
  193. except Exception:
  194. pass
  195. return None
  196. def thumbnail(filename, blob):
  197. img = thumbnail_img(filename, blob)
  198. if img is not None:
  199. return IMG_BASE64_PREFIX + \
  200. base64.b64encode(img).decode("utf-8")
  201. else:
  202. return ''
  203. def traversal_files(base):
  204. for root, ds, fs in os.walk(base):
  205. for f in fs:
  206. fullname = os.path.join(root, f)
  207. yield fullname