| @@ -601,7 +601,7 @@ class RagEtlConfig(BaseSettings): | |||
| UNSTRUCTURED_API_KEY: Optional[str] = Field( | |||
| description="API key for Unstructured.io service", | |||
| default=None, | |||
| default="", | |||
| ) | |||
| SCARF_NO_ANALYTICS: Optional[str] = Field( | |||
| @@ -102,12 +102,11 @@ class ExtractProcessor: | |||
| input_file = Path(file_path) | |||
| file_extension = input_file.suffix.lower() | |||
| etl_type = dify_config.ETL_TYPE | |||
| unstructured_api_url = dify_config.UNSTRUCTURED_API_URL | |||
| unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY | |||
| assert unstructured_api_url is not None, "unstructured_api_url is required" | |||
| assert unstructured_api_key is not None, "unstructured_api_key is required" | |||
| extractor: Optional[BaseExtractor] = None | |||
| if etl_type == "Unstructured": | |||
| unstructured_api_url = dify_config.UNSTRUCTURED_API_URL | |||
| unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or "" | |||
| if file_extension in {".xlsx", ".xls"}: | |||
| extractor = ExcelExtractor(file_path) | |||
| elif file_extension == ".pdf": | |||
| @@ -1,5 +1,6 @@ | |||
| import base64 | |||
| import logging | |||
| from typing import Optional | |||
| from bs4 import BeautifulSoup # type: ignore | |||
| @@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor): | |||
| file_path: Path to the file to load. | |||
| """ | |||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||
| """Initialize with file path.""" | |||
| self._file_path = file_path | |||
| self._api_url = api_url | |||
| @@ -19,7 +19,7 @@ class UnstructuredEpubExtractor(BaseExtractor): | |||
| self, | |||
| file_path: str, | |||
| api_url: Optional[str] = None, | |||
| api_key: Optional[str] = None, | |||
| api_key: str = "", | |||
| ): | |||
| """Initialize with file path.""" | |||
| self._file_path = file_path | |||
| @@ -30,9 +30,6 @@ class UnstructuredEpubExtractor(BaseExtractor): | |||
| if self._api_url: | |||
| from unstructured.partition.api import partition_via_api | |||
| if self._api_key is None: | |||
| raise ValueError("api_key is required") | |||
| elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) | |||
| else: | |||
| from unstructured.partition.epub import partition_epub | |||
| @@ -1,4 +1,5 @@ | |||
| import logging | |||
| from typing import Optional | |||
| from core.rag.extractor.extractor_base import BaseExtractor | |||
| from core.rag.models.document import Document | |||
| @@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor): | |||
| if the specified encoding fails. | |||
| """ | |||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||
| """Initialize with file path.""" | |||
| self._file_path = file_path | |||
| self._api_url = api_url | |||
| @@ -1,4 +1,5 @@ | |||
| import logging | |||
| from typing import Optional | |||
| from core.rag.extractor.extractor_base import BaseExtractor | |||
| from core.rag.models.document import Document | |||
| @@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor): | |||
| file_path: Path to the file to load. | |||
| """ | |||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||
| """Initialize with file path.""" | |||
| self._file_path = file_path | |||
| self._api_url = api_url | |||
| @@ -1,4 +1,5 @@ | |||
| import logging | |||
| from typing import Optional | |||
| from core.rag.extractor.extractor_base import BaseExtractor | |||
| from core.rag.models.document import Document | |||
| @@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor): | |||
| file_path: Path to the file to load. | |||
| """ | |||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||
| """Initialize with file path.""" | |||
| self._file_path = file_path | |||
| self._api_url = api_url | |||
| @@ -1,4 +1,5 @@ | |||
| import logging | |||
| from typing import Optional | |||
| from core.rag.extractor.extractor_base import BaseExtractor | |||
| from core.rag.models.document import Document | |||
| @@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor): | |||
| file_path: Path to the file to load. | |||
| """ | |||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||
| """Initialize with file path.""" | |||
| self._file_path = file_path | |||
| self._api_url = api_url | |||
| @@ -1,4 +1,5 @@ | |||
| import logging | |||
| from typing import Optional | |||
| from core.rag.extractor.extractor_base import BaseExtractor | |||
| from core.rag.models.document import Document | |||
| @@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor): | |||
| file_path: Path to the file to load. | |||
| """ | |||
| def __init__(self, file_path: str, api_url: str, api_key: str): | |||
| def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""): | |||
| """Initialize with file path.""" | |||
| self._file_path = file_path | |||
| self._api_url = api_url | |||