Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import base64
  17. import json
  18. import os
  19. import re
  20. from io import BytesIO
  21. import pdfplumber
  22. from PIL import Image
  23. from cachetools import LRUCache, cached
  24. from ruamel.yaml import YAML
  25. from api.db import FileType
  # Base directories can be pinned through environment variables. When a
  # variable is unset the value stays None and is derived from this file's
  # location the first time the matching getter below is called.
  PROJECT_BASE = os.environ.get("RAG_PROJECT_BASE") or os.environ.get("RAG_DEPLOY_BASE")
  RAG_BASE = os.environ.get("RAG_BASE")
  def get_project_base_directory(*args):
      """Return the project base directory, optionally joined with ``args``.

      The base is taken from the module-level ``PROJECT_BASE``. When that is
      still ``None`` it is computed once as the absolute path two levels above
      the directory containing this file, and stored back into the global.

      Args:
          *args: Path components appended to the base directory.

      Returns:
          str: The base directory, or the base joined with ``args``.
      """
      global PROJECT_BASE
      if PROJECT_BASE is None:
          here = os.path.dirname(os.path.realpath(__file__))
          PROJECT_BASE = os.path.abspath(os.path.join(here, os.pardir, os.pardir))
      return os.path.join(PROJECT_BASE, *args) if args else PROJECT_BASE
  def get_rag_directory(*args):
      """Return the RAG base directory, optionally joined with ``args``.

      The base is taken from the module-level ``RAG_BASE``. When that is still
      ``None`` it is computed once as the absolute path three levels above the
      directory containing this file, and stored back into the global.

      Args:
          *args: Path components appended to the base directory.

      Returns:
          str: The base directory, or the base joined with ``args``.
      """
      global RAG_BASE
      if RAG_BASE is None:
          here = os.path.dirname(os.path.realpath(__file__))
          RAG_BASE = os.path.abspath(
              os.path.join(here, os.pardir, os.pardir, os.pardir)
          )
      return os.path.join(RAG_BASE, *args) if args else RAG_BASE
  def get_rag_python_directory(*args):
      """Return the ``python`` directory under the RAG base, joined with ``args``."""
      components = ("python",) + args
      return get_rag_directory(*components)
  def get_home_cache_dir():
      """Return the per-user cache directory ``~/.ragflow``, creating it if needed.

      Creation is best-effort: when the directory cannot be created (for
      example because of permissions, or because a regular file occupies the
      path) the failure is logged and the path is still returned, so callers
      always receive the same value.

      Returns:
          str: ``.ragflow`` joined onto the expanded home directory.
      """
      import logging

      cache_dir = os.path.join(os.path.expanduser("~"), ".ragflow")
      try:
          # exist_ok turns the common "already exists" case into a no-op, so
          # only genuine failures reach the handler below.
          os.makedirs(cache_dir, exist_ok=True)
      except OSError:
          logging.getLogger(__name__).warning(
              "could not create cache directory %s", cache_dir, exc_info=True
          )
      return cache_dir
  @cached(cache=LRUCache(maxsize=10))
  def load_json_conf(conf_path):
      """Load a JSON config file, caching the parsed result per ``conf_path``.

      Relative paths are resolved against ``get_project_base_directory()``.
      Results are memoised in an LRU cache holding up to 10 entries, so
      repeated calls with the same argument do not re-read the file; use
      ``load_json_conf_real_time`` when a fresh read is required.

      Args:
          conf_path: Absolute path, or path relative to the project base.

      Returns:
          The object produced by ``json.load``.

      Raises:
          EnvironmentError: If the file cannot be opened or parsed. The
              underlying exception is attached as ``__cause__``.
      """
      if os.path.isabs(conf_path):
          json_conf_path = conf_path
      else:
          json_conf_path = os.path.join(get_project_base_directory(), conf_path)
      try:
          with open(json_conf_path) as f:
              return json.load(f)
      # Exception, not BaseException: KeyboardInterrupt/SystemExit must not be
      # rewritten into a config error.
      except Exception as e:
          raise EnvironmentError(
              "loading json file config from '{}' failed!".format(json_conf_path)
          ) from e
  def dump_json_conf(config_data, conf_path):
      """Serialise ``config_data`` as JSON (4-space indent) to ``conf_path``.

      Relative paths are resolved against ``get_project_base_directory()``.
      The target file is opened in write mode, so existing content is replaced.

      Args:
          config_data: JSON-serialisable object to write.
          conf_path: Absolute path, or path relative to the project base.

      Raises:
          EnvironmentError: If the file cannot be opened or the data cannot be
              serialised. The underlying exception is attached as ``__cause__``.
      """
      if os.path.isabs(conf_path):
          json_conf_path = conf_path
      else:
          json_conf_path = os.path.join(get_project_base_directory(), conf_path)
      try:
          with open(json_conf_path, "w") as f:
              json.dump(config_data, f, indent=4)
      # Exception, not BaseException: KeyboardInterrupt/SystemExit must not be
      # rewritten into a config error.
      except Exception as e:
          # The message previously said "loading", which was misleading for a
          # write failure.
          raise EnvironmentError(
              "dumping json file config to '{}' failed!".format(json_conf_path)
          ) from e
  def load_json_conf_real_time(conf_path):
      """Load a JSON config file from disk on every call (no caching).

      Relative paths are resolved against ``get_project_base_directory()``.

      Args:
          conf_path: Absolute path, or path relative to the project base.

      Returns:
          The object produced by ``json.load``.

      Raises:
          EnvironmentError: If the file cannot be opened or parsed. The
              underlying exception is attached as ``__cause__``.
      """
      if os.path.isabs(conf_path):
          json_conf_path = conf_path
      else:
          json_conf_path = os.path.join(get_project_base_directory(), conf_path)
      try:
          with open(json_conf_path) as f:
              return json.load(f)
      # Exception, not BaseException: KeyboardInterrupt/SystemExit must not be
      # rewritten into a config error.
      except Exception as e:
          raise EnvironmentError(
              "loading json file config from '{}' failed!".format(json_conf_path)
          ) from e
  def load_yaml_conf(conf_path):
      """Load a YAML config file with ruamel's safe, pure-Python loader.

      Relative paths are resolved against ``get_project_base_directory()``.

      Args:
          conf_path: Absolute path, or path relative to the project base.

      Returns:
          The object produced by ``YAML(typ='safe', pure=True).load``.

      Raises:
          EnvironmentError: If the file cannot be opened or parsed. The
              underlying exception is attached as ``__cause__`` and its text is
              included in the message.
      """
      if not os.path.isabs(conf_path):
          conf_path = os.path.join(get_project_base_directory(), conf_path)
      try:
          with open(conf_path) as f:
              yaml = YAML(typ='safe', pure=True)
              return yaml.load(f)
      except Exception as e:
          # OSError's two-argument form means (errno, strerror); passing the
          # message and the exception that way stored the message as errno.
          # Build one message and chain the cause instead.
          raise EnvironmentError(
              "loading yaml file config from {} failed: {}".format(conf_path, e)
          ) from e
  def rewrite_yaml_conf(conf_path, config):
      """Overwrite a YAML config file with ``config`` using ruamel's safe dumper.

      Relative paths are resolved against ``get_project_base_directory()``.
      The file is opened in write mode before dumping, so its previous content
      is discarded even if serialisation then fails.

      Args:
          conf_path: Absolute path, or path relative to the project base.
          config: Object to serialise with ``YAML(typ="safe").dump``.

      Raises:
          EnvironmentError: If the file cannot be opened or the data cannot be
              dumped. The underlying exception is attached as ``__cause__`` and
              its text is included in the message.
      """
      if not os.path.isabs(conf_path):
          conf_path = os.path.join(get_project_base_directory(), conf_path)
      try:
          with open(conf_path, "w") as f:
              yaml = YAML(typ="safe")
              yaml.dump(config, f)
      except Exception as e:
          # OSError's two-argument form means (errno, strerror); passing the
          # message and the exception that way stored the message as errno.
          # Build one message and chain the cause instead.
          raise EnvironmentError(
              "rewrite yaml file config {} failed: {}".format(conf_path, e)
          ) from e
  def rewrite_json_file(filepath, json_data):
      """Overwrite ``filepath`` with ``json_data`` rendered as indented JSON.

      Args:
          filepath: Path of the file to (re)write; used as given.
          json_data: JSON-serialisable object.

      Output uses a 4-space indent with ``","`` / ``": "`` separators.
      """
      # The with-block closes the file on exit; the explicit close() that used
      # to follow json.dump was redundant.
      with open(filepath, "w") as f:
          json.dump(json_data, f, indent=4, separators=(",", ": "))
  def filename_type(filename):
      """Classify a file by its extension.

      The name is lower-cased, then matched against four extension groups in
      order: PDF, document-like, audio, and image/video. The first matching
      group decides the result.

      Args:
          filename: File name or path; only the trailing extension matters.

      Returns:
          The ``.value`` of ``FileType.PDF``, ``FileType.DOC``,
          ``FileType.AURAL`` or ``FileType.VISUAL``; ``FileType.OTHER.value``
          when no group matches.
      """
      filename = filename.lower()

      if re.match(r".*\.pdf$", filename):
          return FileType.PDF.value

      if re.match(
              r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
          return FileType.DOC.value

      # "mp3" was listed twice in this group; the duplicate is dropped.
      if re.match(
              r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
          return FileType.AURAL.value

      # The name is lower-cased above, so the alternative must be "wmf": the
      # upper-case "WMF" used previously could never match.
      if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|wmf|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4)$", filename):
          return FileType.VISUAL.value

      return FileType.OTHER.value
  def thumbnail(filename, blob):
      """Build a small PNG preview of a file as a base64 ``data:`` URI.

      The file kind is chosen from the lower-cased ``filename`` extension:

      * ``.pdf`` - the first page is rendered with pdfplumber at resolution 32.
      * common raster images - shrunk in place with ``Image.thumbnail((30, 30))``.
      * ``.ppt`` / ``.pptx`` - the first slide is rendered through aspose.slides
        with scale factors of 0.03.

      Args:
          filename: File name used only to pick the rendering path.
          blob: Raw file content as bytes.

      Returns:
          str | None: ``"data:image/png;base64,..."`` on success; ``None`` for
          unsupported extensions or when rendering a presentation fails.

      Errors raised while rendering a PDF or an image are not caught here. The
      aspose imports happen outside the ``try`` block, so a missing aspose
      package also propagates.
      """

      def to_data_uri(png_buffer):
          # Encode the PNG bytes held in the buffer as a data URI.
          return "data:image/png;base64," + \
              base64.b64encode(png_buffer.getvalue()).decode("utf-8")

      filename = filename.lower()

      if re.match(r".*\.pdf$", filename):
          # The context manager closes the PDF once the page is rendered; it
          # was previously left open.
          with pdfplumber.open(BytesIO(blob)) as pdf:
              buffered = BytesIO()
              pdf.pages[0].to_image(resolution=32).annotated.save(buffered, format="png")
          return to_data_uri(buffered)

      if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
          image = Image.open(BytesIO(blob))
          image.thumbnail((30, 30))
          buffered = BytesIO()
          image.save(buffered, format="png")
          return to_data_uri(buffered)

      if re.match(r".*\.(ppt|pptx)$", filename):
          import aspose.slides as slides
          import aspose.pydrawing as drawing
          try:
              with slides.Presentation(BytesIO(blob)) as presentation:
                  buffered = BytesIO()
                  presentation.slides[0].get_thumbnail(0.03, 0.03).save(
                      buffered, drawing.imaging.ImageFormat.png)
                  return to_data_uri(buffered)
          except Exception:
              # Deliberate best-effort: a presentation that cannot be rendered
              # simply has no thumbnail.
              return None

      return None
  def traversal_files(base):
      """Yield the full path of every file found by ``os.walk(base)``.

      Args:
          base: Directory to walk.

      Yields:
          str: Each file name joined onto the directory it was found in.
      """
      for dirpath, _subdirs, filenames in os.walk(base):
          yield from (os.path.join(dirpath, name) for name in filenames)