You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

file_utils.py 7.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import base64
  17. import json
  18. import os
  19. import re
  20. from io import BytesIO
  21. import pdfplumber
  22. from PIL import Image
  23. from cachetools import LRUCache, cached
  24. from ruamel.yaml import YAML
  25. from api.db import FileType
  26. from api.constants import IMG_BASE64_PREFIX
  27. PROJECT_BASE = os.getenv("RAG_PROJECT_BASE") or os.getenv("RAG_DEPLOY_BASE")
  28. RAG_BASE = os.getenv("RAG_BASE")
  29. def get_project_base_directory(*args):
  30. global PROJECT_BASE
  31. if PROJECT_BASE is None:
  32. PROJECT_BASE = os.path.abspath(
  33. os.path.join(
  34. os.path.dirname(os.path.realpath(__file__)),
  35. os.pardir,
  36. os.pardir,
  37. )
  38. )
  39. if args:
  40. return os.path.join(PROJECT_BASE, *args)
  41. return PROJECT_BASE
  42. def get_rag_directory(*args):
  43. global RAG_BASE
  44. if RAG_BASE is None:
  45. RAG_BASE = os.path.abspath(
  46. os.path.join(
  47. os.path.dirname(os.path.realpath(__file__)),
  48. os.pardir,
  49. os.pardir,
  50. os.pardir,
  51. )
  52. )
  53. if args:
  54. return os.path.join(RAG_BASE, *args)
  55. return RAG_BASE
  56. def get_rag_python_directory(*args):
  57. return get_rag_directory("python", *args)
  58. def get_home_cache_dir():
  59. dir = os.path.join(os.path.expanduser('~'), ".ragflow")
  60. try:
  61. os.mkdir(dir)
  62. except OSError:
  63. pass
  64. return dir
  65. @cached(cache=LRUCache(maxsize=10))
  66. def load_json_conf(conf_path):
  67. if os.path.isabs(conf_path):
  68. json_conf_path = conf_path
  69. else:
  70. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  71. try:
  72. with open(json_conf_path) as f:
  73. return json.load(f)
  74. except BaseException:
  75. raise EnvironmentError(
  76. "loading json file config from '{}' failed!".format(json_conf_path)
  77. )
  78. def dump_json_conf(config_data, conf_path):
  79. if os.path.isabs(conf_path):
  80. json_conf_path = conf_path
  81. else:
  82. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  83. try:
  84. with open(json_conf_path, "w") as f:
  85. json.dump(config_data, f, indent=4)
  86. except BaseException:
  87. raise EnvironmentError(
  88. "loading json file config from '{}' failed!".format(json_conf_path)
  89. )
  90. def load_json_conf_real_time(conf_path):
  91. if os.path.isabs(conf_path):
  92. json_conf_path = conf_path
  93. else:
  94. json_conf_path = os.path.join(get_project_base_directory(), conf_path)
  95. try:
  96. with open(json_conf_path) as f:
  97. return json.load(f)
  98. except BaseException:
  99. raise EnvironmentError(
  100. "loading json file config from '{}' failed!".format(json_conf_path)
  101. )
  102. def load_yaml_conf(conf_path):
  103. if not os.path.isabs(conf_path):
  104. conf_path = os.path.join(get_project_base_directory(), conf_path)
  105. try:
  106. with open(conf_path) as f:
  107. yaml = YAML(typ='safe', pure=True)
  108. return yaml.load(f)
  109. except Exception as e:
  110. raise EnvironmentError(
  111. "loading yaml file config from {} failed:".format(conf_path), e
  112. )
  113. def rewrite_yaml_conf(conf_path, config):
  114. if not os.path.isabs(conf_path):
  115. conf_path = os.path.join(get_project_base_directory(), conf_path)
  116. try:
  117. with open(conf_path, "w") as f:
  118. yaml = YAML(typ="safe")
  119. yaml.dump(config, f)
  120. except Exception as e:
  121. raise EnvironmentError(
  122. "rewrite yaml file config {} failed:".format(conf_path), e
  123. )
  124. def rewrite_json_file(filepath, json_data):
  125. with open(filepath, "w", encoding='utf-8') as f:
  126. json.dump(json_data, f, indent=4, separators=(",", ": "))
  127. f.close()
  128. def filename_type(filename):
  129. filename = filename.lower()
  130. if re.match(r".*\.pdf$", filename):
  131. return FileType.PDF.value
  132. if re.match(
  133. r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
  134. return FileType.DOC.value
  135. if re.match(
  136. r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus|mp3)$", filename):
  137. return FileType.AURAL.value
  138. if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
  139. return FileType.VISUAL.value
  140. return FileType.OTHER.value
  141. def thumbnail_img(filename, blob):
  142. """
  143. MySQL LongText max length is 65535
  144. """
  145. filename = filename.lower()
  146. if re.match(r".*\.pdf$", filename):
  147. pdf = pdfplumber.open(BytesIO(blob))
  148. buffered = BytesIO()
  149. resolution = 32
  150. img = None
  151. for _ in range(10):
  152. # https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
  153. pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
  154. img = buffered.getvalue()
  155. if len(img) >= 64000 and resolution >= 2:
  156. resolution = resolution / 2
  157. buffered = BytesIO()
  158. else:
  159. break
  160. return img
  161. elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
  162. image = Image.open(BytesIO(blob))
  163. image.thumbnail((30, 30))
  164. buffered = BytesIO()
  165. image.save(buffered, format="png")
  166. return buffered.getvalue()
  167. elif re.match(r".*\.(ppt|pptx)$", filename):
  168. import aspose.slides as slides
  169. import aspose.pydrawing as drawing
  170. try:
  171. with slides.Presentation(BytesIO(blob)) as presentation:
  172. buffered = BytesIO()
  173. scale = 0.03
  174. img = None
  175. for _ in range(10):
  176. # https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
  177. presentation.slides[0].get_thumbnail(scale, scale).save(
  178. buffered, drawing.imaging.ImageFormat.png)
  179. img = buffered.getvalue()
  180. if len(img) >= 64000:
  181. scale = scale / 2.0
  182. buffered = BytesIO()
  183. else:
  184. break
  185. return img
  186. except Exception:
  187. pass
  188. return None
  189. def thumbnail(filename, blob):
  190. img = thumbnail_img(filename, blob)
  191. if img is not None:
  192. return IMG_BASE64_PREFIX + \
  193. base64.b64encode(img).decode("utf-8")
  194. else:
  195. return ''
  196. def traversal_files(base):
  197. for root, ds, fs in os.walk(base):
  198. for f in fs:
  199. fullname = os.path.join(root, f)
  200. yield fullname