Du kannst nicht mehr als 25 Themen auswählen. Themen müssen mit entweder einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

file_factory.py 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. import mimetypes
  2. import os
  3. import urllib.parse
  4. import uuid
  5. from collections.abc import Callable, Mapping, Sequence
  6. from typing import Any
  7. import httpx
  8. from sqlalchemy import select
  9. from sqlalchemy.orm import Session
  10. from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS
  11. from core.file import File, FileBelongsTo, FileTransferMethod, FileType, FileUploadConfig, helpers
  12. from core.helper import ssrf_proxy
  13. from extensions.ext_database import db
  14. from models import MessageFile, ToolFile, UploadFile
  15. def build_from_message_files(
  16. *,
  17. message_files: Sequence["MessageFile"],
  18. tenant_id: str,
  19. config: FileUploadConfig,
  20. ) -> Sequence[File]:
  21. results = [
  22. build_from_message_file(message_file=file, tenant_id=tenant_id, config=config)
  23. for file in message_files
  24. if file.belongs_to != FileBelongsTo.ASSISTANT
  25. ]
  26. return results
  27. def build_from_message_file(
  28. *,
  29. message_file: "MessageFile",
  30. tenant_id: str,
  31. config: FileUploadConfig,
  32. ):
  33. mapping = {
  34. "transfer_method": message_file.transfer_method,
  35. "url": message_file.url,
  36. "id": message_file.id,
  37. "type": message_file.type,
  38. }
  39. # Set the correct ID field based on transfer method
  40. if message_file.transfer_method == FileTransferMethod.TOOL_FILE.value:
  41. mapping["tool_file_id"] = message_file.upload_file_id
  42. else:
  43. mapping["upload_file_id"] = message_file.upload_file_id
  44. return build_from_mapping(
  45. mapping=mapping,
  46. tenant_id=tenant_id,
  47. config=config,
  48. )
  49. def build_from_mapping(
  50. *,
  51. mapping: Mapping[str, Any],
  52. tenant_id: str,
  53. config: FileUploadConfig | None = None,
  54. strict_type_validation: bool = False,
  55. ) -> File:
  56. transfer_method = FileTransferMethod.value_of(mapping.get("transfer_method"))
  57. build_functions: dict[FileTransferMethod, Callable] = {
  58. FileTransferMethod.LOCAL_FILE: _build_from_local_file,
  59. FileTransferMethod.REMOTE_URL: _build_from_remote_url,
  60. FileTransferMethod.TOOL_FILE: _build_from_tool_file,
  61. }
  62. build_func = build_functions.get(transfer_method)
  63. if not build_func:
  64. raise ValueError(f"Invalid file transfer method: {transfer_method}")
  65. file: File = build_func(
  66. mapping=mapping,
  67. tenant_id=tenant_id,
  68. transfer_method=transfer_method,
  69. strict_type_validation=strict_type_validation,
  70. )
  71. if config and not _is_file_valid_with_config(
  72. input_file_type=mapping.get("type", FileType.CUSTOM),
  73. file_extension=file.extension or "",
  74. file_transfer_method=file.transfer_method,
  75. config=config,
  76. ):
  77. raise ValueError(f"File validation failed for file: {file.filename}")
  78. return file
  79. def build_from_mappings(
  80. *,
  81. mappings: Sequence[Mapping[str, Any]],
  82. config: FileUploadConfig | None = None,
  83. tenant_id: str,
  84. strict_type_validation: bool = False,
  85. ) -> Sequence[File]:
  86. # TODO(QuantumGhost): Performance concern - each mapping triggers a separate database query.
  87. # Implement batch processing to reduce database load when handling multiple files.
  88. files = [
  89. build_from_mapping(
  90. mapping=mapping,
  91. tenant_id=tenant_id,
  92. config=config,
  93. strict_type_validation=strict_type_validation,
  94. )
  95. for mapping in mappings
  96. ]
  97. if (
  98. config
  99. # If image config is set.
  100. and config.image_config
  101. # And the number of image files exceeds the maximum limit
  102. and sum(1 for _ in (filter(lambda x: x.type == FileType.IMAGE, files))) > config.image_config.number_limits
  103. ):
  104. raise ValueError(f"Number of image files exceeds the maximum limit {config.image_config.number_limits}")
  105. if config and config.number_limits and len(files) > config.number_limits:
  106. raise ValueError(f"Number of files exceeds the maximum limit {config.number_limits}")
  107. return files
  108. def _build_from_local_file(
  109. *,
  110. mapping: Mapping[str, Any],
  111. tenant_id: str,
  112. transfer_method: FileTransferMethod,
  113. strict_type_validation: bool = False,
  114. ) -> File:
  115. upload_file_id = mapping.get("upload_file_id")
  116. if not upload_file_id:
  117. raise ValueError("Invalid upload file id")
  118. # check if upload_file_id is a valid uuid
  119. try:
  120. uuid.UUID(upload_file_id)
  121. except ValueError:
  122. raise ValueError("Invalid upload file id format")
  123. stmt = select(UploadFile).where(
  124. UploadFile.id == upload_file_id,
  125. UploadFile.tenant_id == tenant_id,
  126. )
  127. row = db.session.scalar(stmt)
  128. if row is None:
  129. raise ValueError("Invalid upload file")
  130. detected_file_type = _standardize_file_type(extension="." + row.extension, mime_type=row.mime_type)
  131. specified_type = mapping.get("type", "custom")
  132. if strict_type_validation and detected_file_type.value != specified_type:
  133. raise ValueError("Detected file type does not match the specified type. Please verify the file.")
  134. file_type = FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM else detected_file_type
  135. return File(
  136. id=mapping.get("id"),
  137. filename=row.name,
  138. extension="." + row.extension,
  139. mime_type=row.mime_type,
  140. tenant_id=tenant_id,
  141. type=file_type,
  142. transfer_method=transfer_method,
  143. remote_url=row.source_url,
  144. related_id=mapping.get("upload_file_id"),
  145. size=row.size,
  146. storage_key=row.key,
  147. )
  148. def _build_from_remote_url(
  149. *,
  150. mapping: Mapping[str, Any],
  151. tenant_id: str,
  152. transfer_method: FileTransferMethod,
  153. strict_type_validation: bool = False,
  154. ) -> File:
  155. upload_file_id = mapping.get("upload_file_id")
  156. if upload_file_id:
  157. try:
  158. uuid.UUID(upload_file_id)
  159. except ValueError:
  160. raise ValueError("Invalid upload file id format")
  161. stmt = select(UploadFile).where(
  162. UploadFile.id == upload_file_id,
  163. UploadFile.tenant_id == tenant_id,
  164. )
  165. upload_file = db.session.scalar(stmt)
  166. if upload_file is None:
  167. raise ValueError("Invalid upload file")
  168. detected_file_type = _standardize_file_type(
  169. extension="." + upload_file.extension, mime_type=upload_file.mime_type
  170. )
  171. specified_type = mapping.get("type")
  172. if strict_type_validation and specified_type and detected_file_type.value != specified_type:
  173. raise ValueError("Detected file type does not match the specified type. Please verify the file.")
  174. file_type = (
  175. FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM else detected_file_type
  176. )
  177. return File(
  178. id=mapping.get("id"),
  179. filename=upload_file.name,
  180. extension="." + upload_file.extension,
  181. mime_type=upload_file.mime_type,
  182. tenant_id=tenant_id,
  183. type=file_type,
  184. transfer_method=transfer_method,
  185. remote_url=helpers.get_signed_file_url(upload_file_id=str(upload_file_id)),
  186. related_id=mapping.get("upload_file_id"),
  187. size=upload_file.size,
  188. storage_key=upload_file.key,
  189. )
  190. url = mapping.get("url") or mapping.get("remote_url")
  191. if not url:
  192. raise ValueError("Invalid file url")
  193. mime_type, filename, file_size = _get_remote_file_info(url)
  194. extension = mimetypes.guess_extension(mime_type) or ("." + filename.split(".")[-1] if "." in filename else ".bin")
  195. file_type = _standardize_file_type(extension=extension, mime_type=mime_type)
  196. if file_type.value != mapping.get("type", "custom"):
  197. raise ValueError("Detected file type does not match the specified type. Please verify the file.")
  198. return File(
  199. id=mapping.get("id"),
  200. filename=filename,
  201. tenant_id=tenant_id,
  202. type=file_type,
  203. transfer_method=transfer_method,
  204. remote_url=url,
  205. mime_type=mime_type,
  206. extension=extension,
  207. size=file_size,
  208. storage_key="",
  209. )
  210. def _get_remote_file_info(url: str):
  211. file_size = -1
  212. parsed_url = urllib.parse.urlparse(url)
  213. url_path = parsed_url.path
  214. filename = os.path.basename(url_path)
  215. # Initialize mime_type from filename as fallback
  216. mime_type, _ = mimetypes.guess_type(filename)
  217. if mime_type is None:
  218. mime_type = ""
  219. resp = ssrf_proxy.head(url, follow_redirects=True)
  220. if resp.status_code == httpx.codes.OK:
  221. if content_disposition := resp.headers.get("Content-Disposition"):
  222. filename = str(content_disposition.split("filename=")[-1].strip('"'))
  223. # Re-guess mime_type from updated filename
  224. mime_type, _ = mimetypes.guess_type(filename)
  225. if mime_type is None:
  226. mime_type = ""
  227. file_size = int(resp.headers.get("Content-Length", file_size))
  228. # Fallback to Content-Type header if mime_type is still empty
  229. if not mime_type:
  230. mime_type = resp.headers.get("Content-Type", "").split(";")[0].strip()
  231. return mime_type, filename, file_size
  232. def _build_from_tool_file(
  233. *,
  234. mapping: Mapping[str, Any],
  235. tenant_id: str,
  236. transfer_method: FileTransferMethod,
  237. strict_type_validation: bool = False,
  238. ) -> File:
  239. tool_file = db.session.scalar(
  240. select(ToolFile).where(
  241. ToolFile.id == mapping.get("tool_file_id"),
  242. ToolFile.tenant_id == tenant_id,
  243. )
  244. )
  245. if tool_file is None:
  246. raise ValueError(f"ToolFile {mapping.get('tool_file_id')} not found")
  247. extension = "." + tool_file.file_key.split(".")[-1] if "." in tool_file.file_key else ".bin"
  248. detected_file_type = _standardize_file_type(extension=extension, mime_type=tool_file.mimetype)
  249. specified_type = mapping.get("type")
  250. if strict_type_validation and specified_type and detected_file_type.value != specified_type:
  251. raise ValueError("Detected file type does not match the specified type. Please verify the file.")
  252. file_type = FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM else detected_file_type
  253. return File(
  254. id=mapping.get("id"),
  255. tenant_id=tenant_id,
  256. filename=tool_file.name,
  257. type=file_type,
  258. transfer_method=transfer_method,
  259. remote_url=tool_file.original_url,
  260. related_id=tool_file.id,
  261. extension=extension,
  262. mime_type=tool_file.mimetype,
  263. size=tool_file.size,
  264. storage_key=tool_file.file_key,
  265. )
  266. def _is_file_valid_with_config(
  267. *,
  268. input_file_type: str,
  269. file_extension: str,
  270. file_transfer_method: FileTransferMethod,
  271. config: FileUploadConfig,
  272. ) -> bool:
  273. # FIXME(QIN2DIM): Always allow tool files (files generated by the assistant/model)
  274. # These are internally generated and should bypass user upload restrictions
  275. if file_transfer_method == FileTransferMethod.TOOL_FILE:
  276. return True
  277. if (
  278. config.allowed_file_types
  279. and input_file_type not in config.allowed_file_types
  280. and input_file_type != FileType.CUSTOM
  281. ):
  282. return False
  283. if (
  284. input_file_type == FileType.CUSTOM
  285. and config.allowed_file_extensions is not None
  286. and file_extension not in config.allowed_file_extensions
  287. ):
  288. return False
  289. if input_file_type == FileType.IMAGE:
  290. if (
  291. config.image_config
  292. and config.image_config.transfer_methods
  293. and file_transfer_method not in config.image_config.transfer_methods
  294. ):
  295. return False
  296. elif config.allowed_file_upload_methods and file_transfer_method not in config.allowed_file_upload_methods:
  297. return False
  298. return True
  299. def _standardize_file_type(*, extension: str = "", mime_type: str = "") -> FileType:
  300. """
  301. Infer the possible actual type of the file based on the extension and mime_type
  302. """
  303. guessed_type = None
  304. if extension:
  305. guessed_type = _get_file_type_by_extension(extension)
  306. if guessed_type is None and mime_type:
  307. guessed_type = _get_file_type_by_mimetype(mime_type)
  308. return guessed_type or FileType.CUSTOM
  309. def _get_file_type_by_extension(extension: str) -> FileType | None:
  310. extension = extension.lstrip(".")
  311. if extension in IMAGE_EXTENSIONS:
  312. return FileType.IMAGE
  313. elif extension in VIDEO_EXTENSIONS:
  314. return FileType.VIDEO
  315. elif extension in AUDIO_EXTENSIONS:
  316. return FileType.AUDIO
  317. elif extension in DOCUMENT_EXTENSIONS:
  318. return FileType.DOCUMENT
  319. return None
  320. def _get_file_type_by_mimetype(mime_type: str) -> FileType | None:
  321. if "image" in mime_type:
  322. file_type = FileType.IMAGE
  323. elif "video" in mime_type:
  324. file_type = FileType.VIDEO
  325. elif "audio" in mime_type:
  326. file_type = FileType.AUDIO
  327. elif "text" in mime_type or "pdf" in mime_type:
  328. file_type = FileType.DOCUMENT
  329. else:
  330. file_type = FileType.CUSTOM
  331. return file_type
  332. def get_file_type_by_mime_type(mime_type: str) -> FileType:
  333. return _get_file_type_by_mimetype(mime_type) or FileType.CUSTOM
  334. class StorageKeyLoader:
  335. """FileKeyLoader load the storage key from database for a list of files.
  336. This loader is batched, the database query count is constant regardless of the input size.
  337. """
  338. def __init__(self, session: Session, tenant_id: str):
  339. self._session = session
  340. self._tenant_id = tenant_id
  341. def _load_upload_files(self, upload_file_ids: Sequence[uuid.UUID]) -> Mapping[uuid.UUID, UploadFile]:
  342. stmt = select(UploadFile).where(
  343. UploadFile.id.in_(upload_file_ids),
  344. UploadFile.tenant_id == self._tenant_id,
  345. )
  346. return {uuid.UUID(i.id): i for i in self._session.scalars(stmt)}
  347. def _load_tool_files(self, tool_file_ids: Sequence[uuid.UUID]) -> Mapping[uuid.UUID, ToolFile]:
  348. stmt = select(ToolFile).where(
  349. ToolFile.id.in_(tool_file_ids),
  350. ToolFile.tenant_id == self._tenant_id,
  351. )
  352. return {uuid.UUID(i.id): i for i in self._session.scalars(stmt)}
  353. def load_storage_keys(self, files: Sequence[File]):
  354. """Loads storage keys for a sequence of files by retrieving the corresponding
  355. `UploadFile` or `ToolFile` records from the database based on their transfer method.
  356. This method doesn't modify the input sequence structure but updates the `_storage_key`
  357. property of each file object by extracting the relevant key from its database record.
  358. Performance note: This is a batched operation where database query count remains constant
  359. regardless of input size. However, for optimal performance, input sequences should contain
  360. fewer than 1000 files. For larger collections, split into smaller batches and process each
  361. batch separately.
  362. """
  363. upload_file_ids: list[uuid.UUID] = []
  364. tool_file_ids: list[uuid.UUID] = []
  365. for file in files:
  366. related_model_id = file.related_id
  367. if file.related_id is None:
  368. raise ValueError("file id should not be None.")
  369. if file.tenant_id != self._tenant_id:
  370. err_msg = (
  371. f"invalid file, expected tenant_id={self._tenant_id}, "
  372. f"got tenant_id={file.tenant_id}, file_id={file.id}, related_model_id={related_model_id}"
  373. )
  374. raise ValueError(err_msg)
  375. model_id = uuid.UUID(related_model_id)
  376. if file.transfer_method in (FileTransferMethod.LOCAL_FILE, FileTransferMethod.REMOTE_URL):
  377. upload_file_ids.append(model_id)
  378. elif file.transfer_method == FileTransferMethod.TOOL_FILE:
  379. tool_file_ids.append(model_id)
  380. tool_files = self._load_tool_files(tool_file_ids)
  381. upload_files = self._load_upload_files(upload_file_ids)
  382. for file in files:
  383. model_id = uuid.UUID(file.related_id)
  384. if file.transfer_method in (FileTransferMethod.LOCAL_FILE, FileTransferMethod.REMOTE_URL):
  385. upload_file_row = upload_files.get(model_id)
  386. if upload_file_row is None:
  387. raise ValueError(f"Upload file not found for id: {model_id}")
  388. file.storage_key = upload_file_row.key
  389. elif file.transfer_method == FileTransferMethod.TOOL_FILE:
  390. tool_file_row = tool_files.get(model_id)
  391. if tool_file_row is None:
  392. raise ValueError(f"Tool file not found for id: {model_id}")
  393. file.storage_key = tool_file_row.file_key