Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

file_factory.py 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  1. import mimetypes
  2. import uuid
  3. from collections.abc import Callable, Mapping, Sequence
  4. from typing import Any, cast
  5. import httpx
  6. from sqlalchemy import select
  7. from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS
  8. from core.file import File, FileBelongsTo, FileTransferMethod, FileType, FileUploadConfig, helpers
  9. from core.helper import ssrf_proxy
  10. from extensions.ext_database import db
  11. from models import MessageFile, ToolFile, UploadFile
  12. def build_from_message_files(
  13. *,
  14. message_files: Sequence["MessageFile"],
  15. tenant_id: str,
  16. config: FileUploadConfig,
  17. ) -> Sequence[File]:
  18. results = [
  19. build_from_message_file(message_file=file, tenant_id=tenant_id, config=config)
  20. for file in message_files
  21. if file.belongs_to != FileBelongsTo.ASSISTANT
  22. ]
  23. return results
  24. def build_from_message_file(
  25. *,
  26. message_file: "MessageFile",
  27. tenant_id: str,
  28. config: FileUploadConfig,
  29. ):
  30. mapping = {
  31. "transfer_method": message_file.transfer_method,
  32. "url": message_file.url,
  33. "id": message_file.id,
  34. "type": message_file.type,
  35. "upload_file_id": message_file.upload_file_id,
  36. }
  37. return build_from_mapping(
  38. mapping=mapping,
  39. tenant_id=tenant_id,
  40. config=config,
  41. )
  42. def build_from_mapping(
  43. *,
  44. mapping: Mapping[str, Any],
  45. tenant_id: str,
  46. config: FileUploadConfig | None = None,
  47. strict_type_validation: bool = False,
  48. ) -> File:
  49. transfer_method = FileTransferMethod.value_of(mapping.get("transfer_method"))
  50. build_functions: dict[FileTransferMethod, Callable] = {
  51. FileTransferMethod.LOCAL_FILE: _build_from_local_file,
  52. FileTransferMethod.REMOTE_URL: _build_from_remote_url,
  53. FileTransferMethod.TOOL_FILE: _build_from_tool_file,
  54. }
  55. build_func = build_functions.get(transfer_method)
  56. if not build_func:
  57. raise ValueError(f"Invalid file transfer method: {transfer_method}")
  58. file: File = build_func(
  59. mapping=mapping,
  60. tenant_id=tenant_id,
  61. transfer_method=transfer_method,
  62. strict_type_validation=strict_type_validation,
  63. )
  64. if config and not _is_file_valid_with_config(
  65. input_file_type=mapping.get("type", FileType.CUSTOM),
  66. file_extension=file.extension or "",
  67. file_transfer_method=file.transfer_method,
  68. config=config,
  69. ):
  70. raise ValueError(f"File validation failed for file: {file.filename}")
  71. return file
  72. def build_from_mappings(
  73. *,
  74. mappings: Sequence[Mapping[str, Any]],
  75. config: FileUploadConfig | None = None,
  76. tenant_id: str,
  77. strict_type_validation: bool = False,
  78. ) -> Sequence[File]:
  79. files = [
  80. build_from_mapping(
  81. mapping=mapping,
  82. tenant_id=tenant_id,
  83. config=config,
  84. strict_type_validation=strict_type_validation,
  85. )
  86. for mapping in mappings
  87. ]
  88. if (
  89. config
  90. # If image config is set.
  91. and config.image_config
  92. # And the number of image files exceeds the maximum limit
  93. and sum(1 for _ in (filter(lambda x: x.type == FileType.IMAGE, files))) > config.image_config.number_limits
  94. ):
  95. raise ValueError(f"Number of image files exceeds the maximum limit {config.image_config.number_limits}")
  96. if config and config.number_limits and len(files) > config.number_limits:
  97. raise ValueError(f"Number of files exceeds the maximum limit {config.number_limits}")
  98. return files
  99. def _build_from_local_file(
  100. *,
  101. mapping: Mapping[str, Any],
  102. tenant_id: str,
  103. transfer_method: FileTransferMethod,
  104. strict_type_validation: bool = False,
  105. ) -> File:
  106. upload_file_id = mapping.get("upload_file_id")
  107. if not upload_file_id:
  108. raise ValueError("Invalid upload file id")
  109. # check if upload_file_id is a valid uuid
  110. try:
  111. uuid.UUID(upload_file_id)
  112. except ValueError:
  113. raise ValueError("Invalid upload file id format")
  114. stmt = select(UploadFile).where(
  115. UploadFile.id == upload_file_id,
  116. UploadFile.tenant_id == tenant_id,
  117. )
  118. row = db.session.scalar(stmt)
  119. if row is None:
  120. raise ValueError("Invalid upload file")
  121. detected_file_type = _standardize_file_type(extension="." + row.extension, mime_type=row.mime_type)
  122. specified_type = mapping.get("type", "custom")
  123. if strict_type_validation and detected_file_type.value != specified_type:
  124. raise ValueError("Detected file type does not match the specified type. Please verify the file.")
  125. file_type = (
  126. FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM.value else detected_file_type
  127. )
  128. return File(
  129. id=mapping.get("id"),
  130. filename=row.name,
  131. extension="." + row.extension,
  132. mime_type=row.mime_type,
  133. tenant_id=tenant_id,
  134. type=file_type,
  135. transfer_method=transfer_method,
  136. remote_url=row.source_url,
  137. related_id=mapping.get("upload_file_id"),
  138. size=row.size,
  139. storage_key=row.key,
  140. )
  141. def _build_from_remote_url(
  142. *,
  143. mapping: Mapping[str, Any],
  144. tenant_id: str,
  145. transfer_method: FileTransferMethod,
  146. strict_type_validation: bool = False,
  147. ) -> File:
  148. upload_file_id = mapping.get("upload_file_id")
  149. if upload_file_id:
  150. try:
  151. uuid.UUID(upload_file_id)
  152. except ValueError:
  153. raise ValueError("Invalid upload file id format")
  154. stmt = select(UploadFile).where(
  155. UploadFile.id == upload_file_id,
  156. UploadFile.tenant_id == tenant_id,
  157. )
  158. upload_file = db.session.scalar(stmt)
  159. if upload_file is None:
  160. raise ValueError("Invalid upload file")
  161. detected_file_type = _standardize_file_type(
  162. extension="." + upload_file.extension, mime_type=upload_file.mime_type
  163. )
  164. specified_type = mapping.get("type")
  165. if strict_type_validation and specified_type and detected_file_type.value != specified_type:
  166. raise ValueError("Detected file type does not match the specified type. Please verify the file.")
  167. file_type = (
  168. FileType(specified_type)
  169. if specified_type and specified_type != FileType.CUSTOM.value
  170. else detected_file_type
  171. )
  172. return File(
  173. id=mapping.get("id"),
  174. filename=upload_file.name,
  175. extension="." + upload_file.extension,
  176. mime_type=upload_file.mime_type,
  177. tenant_id=tenant_id,
  178. type=file_type,
  179. transfer_method=transfer_method,
  180. remote_url=helpers.get_signed_file_url(upload_file_id=str(upload_file_id)),
  181. related_id=mapping.get("upload_file_id"),
  182. size=upload_file.size,
  183. storage_key=upload_file.key,
  184. )
  185. url = mapping.get("url") or mapping.get("remote_url")
  186. if not url:
  187. raise ValueError("Invalid file url")
  188. mime_type, filename, file_size = _get_remote_file_info(url)
  189. extension = mimetypes.guess_extension(mime_type) or ("." + filename.split(".")[-1] if "." in filename else ".bin")
  190. file_type = _standardize_file_type(extension=extension, mime_type=mime_type)
  191. if file_type.value != mapping.get("type", "custom"):
  192. raise ValueError("Detected file type does not match the specified type. Please verify the file.")
  193. return File(
  194. id=mapping.get("id"),
  195. filename=filename,
  196. tenant_id=tenant_id,
  197. type=file_type,
  198. transfer_method=transfer_method,
  199. remote_url=url,
  200. mime_type=mime_type,
  201. extension=extension,
  202. size=file_size,
  203. storage_key="",
  204. )
  205. def _get_remote_file_info(url: str):
  206. file_size = -1
  207. filename = url.split("/")[-1].split("?")[0] or "unknown_file"
  208. mime_type = mimetypes.guess_type(filename)[0] or ""
  209. resp = ssrf_proxy.head(url, follow_redirects=True)
  210. resp = cast(httpx.Response, resp)
  211. if resp.status_code == httpx.codes.OK:
  212. if content_disposition := resp.headers.get("Content-Disposition"):
  213. filename = str(content_disposition.split("filename=")[-1].strip('"'))
  214. file_size = int(resp.headers.get("Content-Length", file_size))
  215. mime_type = mime_type or str(resp.headers.get("Content-Type", ""))
  216. return mime_type, filename, file_size
  217. def _build_from_tool_file(
  218. *,
  219. mapping: Mapping[str, Any],
  220. tenant_id: str,
  221. transfer_method: FileTransferMethod,
  222. strict_type_validation: bool = False,
  223. ) -> File:
  224. tool_file = (
  225. db.session.query(ToolFile)
  226. .filter(
  227. ToolFile.id == mapping.get("tool_file_id"),
  228. ToolFile.tenant_id == tenant_id,
  229. )
  230. .first()
  231. )
  232. if tool_file is None:
  233. raise ValueError(f"ToolFile {mapping.get('tool_file_id')} not found")
  234. extension = "." + tool_file.file_key.split(".")[-1] if "." in tool_file.file_key else ".bin"
  235. detected_file_type = _standardize_file_type(extension="." + extension, mime_type=tool_file.mimetype)
  236. specified_type = mapping.get("type")
  237. if strict_type_validation and specified_type and detected_file_type.value != specified_type:
  238. raise ValueError("Detected file type does not match the specified type. Please verify the file.")
  239. file_type = (
  240. FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM.value else detected_file_type
  241. )
  242. return File(
  243. id=mapping.get("id"),
  244. tenant_id=tenant_id,
  245. filename=tool_file.name,
  246. type=file_type,
  247. transfer_method=transfer_method,
  248. remote_url=tool_file.original_url,
  249. related_id=tool_file.id,
  250. extension=extension,
  251. mime_type=tool_file.mimetype,
  252. size=tool_file.size,
  253. storage_key=tool_file.file_key,
  254. )
  255. def _is_file_valid_with_config(
  256. *,
  257. input_file_type: str,
  258. file_extension: str,
  259. file_transfer_method: FileTransferMethod,
  260. config: FileUploadConfig,
  261. ) -> bool:
  262. if (
  263. config.allowed_file_types
  264. and input_file_type not in config.allowed_file_types
  265. and input_file_type != FileType.CUSTOM
  266. ):
  267. return False
  268. if (
  269. input_file_type == FileType.CUSTOM
  270. and config.allowed_file_extensions is not None
  271. and file_extension not in config.allowed_file_extensions
  272. ):
  273. return False
  274. if input_file_type == FileType.IMAGE:
  275. if (
  276. config.image_config
  277. and config.image_config.transfer_methods
  278. and file_transfer_method not in config.image_config.transfer_methods
  279. ):
  280. return False
  281. elif config.allowed_file_upload_methods and file_transfer_method not in config.allowed_file_upload_methods:
  282. return False
  283. return True
  284. def _standardize_file_type(*, extension: str = "", mime_type: str = "") -> FileType:
  285. """
  286. Infer the possible actual type of the file based on the extension and mime_type
  287. """
  288. guessed_type = None
  289. if extension:
  290. guessed_type = _get_file_type_by_extension(extension)
  291. if guessed_type is None and mime_type:
  292. guessed_type = _get_file_type_by_mimetype(mime_type)
  293. return guessed_type or FileType.CUSTOM
  294. def _get_file_type_by_extension(extension: str) -> FileType | None:
  295. extension = extension.lstrip(".")
  296. if extension in IMAGE_EXTENSIONS:
  297. return FileType.IMAGE
  298. elif extension in VIDEO_EXTENSIONS:
  299. return FileType.VIDEO
  300. elif extension in AUDIO_EXTENSIONS:
  301. return FileType.AUDIO
  302. elif extension in DOCUMENT_EXTENSIONS:
  303. return FileType.DOCUMENT
  304. return None
  305. def _get_file_type_by_mimetype(mime_type: str) -> FileType | None:
  306. if "image" in mime_type:
  307. file_type = FileType.IMAGE
  308. elif "video" in mime_type:
  309. file_type = FileType.VIDEO
  310. elif "audio" in mime_type:
  311. file_type = FileType.AUDIO
  312. elif "text" in mime_type or "pdf" in mime_type:
  313. file_type = FileType.DOCUMENT
  314. else:
  315. file_type = FileType.CUSTOM
  316. return file_type
  317. def get_file_type_by_mime_type(mime_type: str) -> FileType:
  318. return _get_file_type_by_mimetype(mime_type) or FileType.CUSTOM