| UNSTRUCTURED_API_KEY: Optional[str] = Field( | UNSTRUCTURED_API_KEY: Optional[str] = Field( | ||||
| description="API key for Unstructured.io service", | description="API key for Unstructured.io service", | ||||
| default=None, | |||||
| default="", | |||||
| ) | ) | ||||
| SCARF_NO_ANALYTICS: Optional[str] = Field( | SCARF_NO_ANALYTICS: Optional[str] = Field( |
| input_file = Path(file_path) | input_file = Path(file_path) | ||||
| file_extension = input_file.suffix.lower() | file_extension = input_file.suffix.lower() | ||||
| etl_type = dify_config.ETL_TYPE | etl_type = dify_config.ETL_TYPE | ||||
| unstructured_api_url = dify_config.UNSTRUCTURED_API_URL | |||||
| unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY | |||||
| assert unstructured_api_url is not None, "unstructured_api_url is required" | |||||
| assert unstructured_api_key is not None, "unstructured_api_key is required" | |||||
| extractor: Optional[BaseExtractor] = None | extractor: Optional[BaseExtractor] = None | ||||
| if etl_type == "Unstructured": | if etl_type == "Unstructured": | ||||
| unstructured_api_url = dify_config.UNSTRUCTURED_API_URL | |||||
| unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or "" | |||||
| if file_extension in {".xlsx", ".xls"}: | if file_extension in {".xlsx", ".xls"}: | ||||
| extractor = ExcelExtractor(file_path) | extractor = ExcelExtractor(file_path) | ||||
| elif file_extension == ".pdf": | elif file_extension == ".pdf": |
| import base64 | import base64 | ||||
| import logging | import logging | ||||
| from typing import Optional | |||||
| from bs4 import BeautifulSoup # type: ignore | from bs4 import BeautifulSoup # type: ignore | ||||
| file_path: Path to the file to load. | file_path: Path to the file to load. | ||||
| """ | """ | ||||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||||
| """Initialize with file path.""" | """Initialize with file path.""" | ||||
| self._file_path = file_path | self._file_path = file_path | ||||
| self._api_url = api_url | self._api_url = api_url |
| self, | self, | ||||
| file_path: str, | file_path: str, | ||||
| api_url: Optional[str] = None, | api_url: Optional[str] = None, | ||||
| api_key: Optional[str] = None, | |||||
| api_key: str = "", | |||||
| ): | ): | ||||
| """Initialize with file path.""" | """Initialize with file path.""" | ||||
| self._file_path = file_path | self._file_path = file_path | ||||
| if self._api_url: | if self._api_url: | ||||
| from unstructured.partition.api import partition_via_api | from unstructured.partition.api import partition_via_api | ||||
| if self._api_key is None: | |||||
| raise ValueError("api_key is required") | |||||
| elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) | elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) | ||||
| else: | else: | ||||
| from unstructured.partition.epub import partition_epub | from unstructured.partition.epub import partition_epub |
| import logging | import logging | ||||
| from typing import Optional | |||||
| from core.rag.extractor.extractor_base import BaseExtractor | from core.rag.extractor.extractor_base import BaseExtractor | ||||
| from core.rag.models.document import Document | from core.rag.models.document import Document | ||||
| if the specified encoding fails. | if the specified encoding fails. | ||||
| """ | """ | ||||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||||
| """Initialize with file path.""" | """Initialize with file path.""" | ||||
| self._file_path = file_path | self._file_path = file_path | ||||
| self._api_url = api_url | self._api_url = api_url |
| import logging | import logging | ||||
| from typing import Optional | |||||
| from core.rag.extractor.extractor_base import BaseExtractor | from core.rag.extractor.extractor_base import BaseExtractor | ||||
| from core.rag.models.document import Document | from core.rag.models.document import Document | ||||
| file_path: Path to the file to load. | file_path: Path to the file to load. | ||||
| """ | """ | ||||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||||
| """Initialize with file path.""" | """Initialize with file path.""" | ||||
| self._file_path = file_path | self._file_path = file_path | ||||
| self._api_url = api_url | self._api_url = api_url |
| import logging | import logging | ||||
| from typing import Optional | |||||
| from core.rag.extractor.extractor_base import BaseExtractor | from core.rag.extractor.extractor_base import BaseExtractor | ||||
| from core.rag.models.document import Document | from core.rag.models.document import Document | ||||
| file_path: Path to the file to load. | file_path: Path to the file to load. | ||||
| """ | """ | ||||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||||
| """Initialize with file path.""" | """Initialize with file path.""" | ||||
| self._file_path = file_path | self._file_path = file_path | ||||
| self._api_url = api_url | self._api_url = api_url |
| import logging | import logging | ||||
| from typing import Optional | |||||
| from core.rag.extractor.extractor_base import BaseExtractor | from core.rag.extractor.extractor_base import BaseExtractor | ||||
| from core.rag.models.document import Document | from core.rag.models.document import Document | ||||
| file_path: Path to the file to load. | file_path: Path to the file to load. | ||||
| """ | """ | ||||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||||
| """Initialize with file path.""" | """Initialize with file path.""" | ||||
| self._file_path = file_path | self._file_path = file_path | ||||
| self._api_url = api_url | self._api_url = api_url |
| import logging | import logging | ||||
| from typing import Optional | |||||
| from core.rag.extractor.extractor_base import BaseExtractor | from core.rag.extractor.extractor_base import BaseExtractor | ||||
| from core.rag.models.document import Document | from core.rag.models.document import Document | ||||
| file_path: Path to the file to load. | file_path: Path to the file to load. | ||||
| """ | """ | ||||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||||
| """Initialize with file path.""" | """Initialize with file path.""" | ||||
| self._file_path = file_path | self._file_path = file_path | ||||
| self._api_url = api_url | self._api_url = api_url |