| @@ -2,6 +2,7 @@ import mimetypes | |||
| from collections.abc import Mapping, Sequence | |||
| from typing import Any | |||
| import httpx | |||
| from sqlalchemy import select | |||
| from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS | |||
| @@ -154,7 +155,7 @@ def _build_from_local_file( | |||
| file = File( | |||
| id=mapping.get("id"), | |||
| filename=row.name, | |||
| extension=row.extension, | |||
| extension="." + row.extension, | |||
| mime_type=row.mime_type, | |||
| tenant_id=tenant_id, | |||
| type=file_type, | |||
| @@ -177,25 +178,29 @@ def _build_from_remote_url( | |||
| url = mapping.get("url") | |||
| if not url: | |||
| raise ValueError("Invalid file url") | |||
| resp = ssrf_proxy.head(url, follow_redirects=True) | |||
| resp.raise_for_status() | |||
| # Try to extract filename from response headers or URL | |||
| content_disposition = resp.headers.get("Content-Disposition") | |||
| if content_disposition: | |||
| filename = content_disposition.split("filename=")[-1].strip('"') | |||
| resp = ssrf_proxy.head(url, follow_redirects=True) | |||
| if resp.status_code == httpx.codes.OK: | |||
| # Try to extract filename from response headers or URL | |||
| content_disposition = resp.headers.get("Content-Disposition") | |||
| if content_disposition: | |||
| filename = content_disposition.split("filename=")[-1].strip('"') | |||
| else: | |||
| filename = url.split("/")[-1].split("?")[0] | |||
| # Create the File object | |||
| file_size = int(resp.headers.get("Content-Length", -1)) | |||
| mime_type = str(resp.headers.get("Content-Type", "")) | |||
| else: | |||
| filename = url.split("/")[-1].split("?")[0] | |||
| filename = "" | |||
| file_size = -1 | |||
| mime_type = "" | |||
| # If filename is empty, set a default one | |||
| if not filename: | |||
| filename = "unknown_file" | |||
| # Determine file extension | |||
| extension = "." + filename.split(".")[-1] if "." in filename else ".bin" | |||
| # Create the File object | |||
| file_size = int(resp.headers.get("Content-Length", -1)) | |||
| mime_type = str(resp.headers.get("Content-Type", "")) | |||
| if not mime_type: | |||
| mime_type, _ = mimetypes.guess_type(url) | |||
| file = File( | |||