7 months ago · 6104b91d3f
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
 from core.rag.extractor.notion_extractor import NotionExtractor
 from core.rag.extractor.pdf_extractor import PdfExtractor
 from core.rag.extractor.text_extractor import TextExtractor
 from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
 from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
 from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
 from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
                etl_type = dify_config.ETL_TYPE
                extractor: Optional[BaseExtractor] = None
                if etl_type == "Unstructured":
                    unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
                    unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or ""
                    unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
                    if file_extension in {".xlsx", ".xls"}:
                        extractor = HtmlExtractor(file_path)
                    elif file_extension == ".docx":
                        extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
                    elif file_extension == ".doc":
                        extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
                    elif file_extension == ".csv":
                        extractor = CSVExtractor(file_path, autodetect_encoding=True)
                    elif file_extension == ".msg":
--- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
 class UnstructuredWordExtractor(BaseExtractor):
    """Loader that uses unstructured to load word documents."""
    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
    def __init__(self, file_path: str, api_url: str, api_key: str = ""):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key
    def extract(self) -> list[Document]:
        from unstructured.__version__ import __version__ as __unstructured_version__
            )
        if is_doc:
            from unstructured.partition.doc import partition_doc
            from unstructured.partition.api import partition_via_api
            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
            elements = partition_doc(filename=self._file_path)
        else:
            from unstructured.partition.docx import partition_docx