Browse Source

fix unstructured setting (#12116)

tags/0.15.0
Jyong 10 months ago
parent
commit
811e4bd0cf
No account linked to committer's email address

+ 1
- 1
api/configs/feature/__init__.py View File

@@ -601,7 +601,7 @@ class RagEtlConfig(BaseSettings):

UNSTRUCTURED_API_KEY: Optional[str] = Field(
description="API key for Unstructured.io service",
- default=None,
+ default="",
)

SCARF_NO_ANALYTICS: Optional[str] = Field(

+ 3
- 4
api/core/rag/extractor/extract_processor.py View File

@@ -102,12 +102,11 @@ class ExtractProcessor:
input_file = Path(file_path)
file_extension = input_file.suffix.lower()
etl_type = dify_config.ETL_TYPE
- unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
- unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY
- assert unstructured_api_url is not None, "unstructured_api_url is required"
- assert unstructured_api_key is not None, "unstructured_api_key is required"
extractor: Optional[BaseExtractor] = None
if etl_type == "Unstructured":
+ unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
+ unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
+
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf":

+ 2
- 1
api/core/rag/extractor/unstructured/unstructured_eml_extractor.py View File

@@ -1,5 +1,6 @@
import base64
import logging
+ from typing import Optional

from bs4 import BeautifulSoup # type: ignore

@@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
file_path: Path to the file to load.
"""

- def __init__(self, file_path: str, api_url: str, api_key: str):
+ def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url

+ 1
- 4
api/core/rag/extractor/unstructured/unstructured_epub_extractor.py View File

@@ -19,7 +19,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
self,
file_path: str,
api_url: Optional[str] = None,
- api_key: Optional[str] = None,
+ api_key: str = "",
):
"""Initialize with file path."""
self._file_path = file_path
@@ -30,9 +30,6 @@ class UnstructuredEpubExtractor(BaseExtractor):
if self._api_url:
from unstructured.partition.api import partition_via_api

- if self._api_key is None:
- raise ValueError("api_key is required")
-
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.epub import partition_epub

+ 2
- 1
api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py View File

@@ -1,4 +1,5 @@
import logging
+ from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
if the specified encoding fails.
"""

- def __init__(self, file_path: str, api_url: str, api_key: str):
+ def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url

+ 2
- 1
api/core/rag/extractor/unstructured/unstructured_msg_extractor.py View File

@@ -1,4 +1,5 @@
import logging
+ from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
file_path: Path to the file to load.
"""

- def __init__(self, file_path: str, api_url: str, api_key: str):
+ def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url

+ 2
- 1
api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py View File

@@ -1,4 +1,5 @@
import logging
+ from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
file_path: Path to the file to load.
"""

- def __init__(self, file_path: str, api_url: str, api_key: str):
+ def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url

+ 2
- 1
api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py View File

@@ -1,4 +1,5 @@
import logging
+ from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
file_path: Path to the file to load.
"""

- def __init__(self, file_path: str, api_url: str, api_key: str):
+ def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url

+ 2
- 1
api/core/rag/extractor/unstructured/unstructured_xml_extractor.py View File

@@ -1,4 +1,5 @@
import logging
+ from typing import Optional

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
file_path: Path to the file to load.
"""

- def __init__(self, file_path: str, api_url: str, api_key: str):
+ def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url

Loading…
Cancel
Save