Browse Source

add doc support in knowledge base for unstructured (#17352)

tags/1.2.0
Jyong 7 months ago
parent
commit
6104b91d3f
No account linked to committer's email address

+ 4
- 1
api/core/rag/extractor/extract_processor.py View File

  from core.rag.extractor.notion_extractor import NotionExtractor
  from core.rag.extractor.pdf_extractor import PdfExtractor
  from core.rag.extractor.text_extractor import TextExtractor
+ from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
  from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
  from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
  from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
  etl_type = dify_config.ETL_TYPE
  extractor: Optional[BaseExtractor] = None
  if etl_type == "Unstructured":
- unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
+ unstructured_api_url = dify_config.UNSTRUCTURED_API_URL or ""
  unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""


  if file_extension in {".xlsx", ".xls"}:
  extractor = HtmlExtractor(file_path)
  elif file_extension == ".docx":
  extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
+ elif file_extension == ".doc":
+ extractor = UnstructuredWordExtractor(file_path, unstructured_api_url, unstructured_api_key)
  elif file_extension == ".csv":
  extractor = CSVExtractor(file_path, autodetect_encoding=True)
  elif file_extension == ".msg":

+ 5
- 7
api/core/rag/extractor/unstructured/unstructured_doc_extractor.py View File

  class UnstructuredWordExtractor(BaseExtractor):
  """Loader that uses unstructured to load word documents."""


- def __init__(
- self,
- file_path: str,
- api_url: str,
- ):
+ def __init__(self, file_path: str, api_url: str, api_key: str = ""):
  """Initialize with file path."""
  self._file_path = file_path
  self._api_url = api_url
+ self._api_key = api_key


  def extract(self) -> list[Document]:
  from unstructured.__version__ import __version__ as __unstructured_version__
  )


  if is_doc:
- from unstructured.partition.doc import partition_doc
+ from unstructured.partition.api import partition_via_api

+ elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)


- elements = partition_doc(filename=self._file_path)
  else:
  from unstructured.partition.docx import partition_docx



Loading…
Cancel
Save