Co-authored-by: jyong <jyong@dify.ai>

1 年之前 · 5b953c1ef2
--- a/api/controllers/console/datasets/data_source.py
+++ b/api/controllers/console/datasets/data_source.py
@@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource):
            notion_workspace_id=workspace_id,
            notion_obj_id=page_id,
            notion_page_type=page_type,
            notion_access_token=data_source_binding.access_token
            notion_access_token=data_source_binding.access_token,
            tenant_id=current_user.current_tenant_id
        )

        text_docs = extractor.extract()
@@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource):
                    notion_info={
                        "notion_workspace_id": workspace_id,
                        "notion_obj_id": page['page_id'],
                        "notion_page_type": page['type']
                        "notion_page_type": page['type'],
                        "tenant_id": current_user.current_tenant_id
                    },
                    document_model=args['doc_form']
                )
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
@@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource):
                        notion_info={
                            "notion_workspace_id": workspace_id,
                            "notion_obj_id": page['page_id'],
                            "notion_page_type": page['type']
                            "notion_page_type": page['type'],
                            "tenant_id": current_user.current_tenant_id
                        },
                        document_model=args['doc_form']
                    )
--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
@@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
                    notion_info={
                        "notion_workspace_id": data_source_info['notion_workspace_id'],
                        "notion_obj_id": data_source_info['notion_page_id'],
                        "notion_page_type": data_source_info['type']
                        "notion_page_type": data_source_info['type'],
                        "tenant_id": current_user.current_tenant_id
                    },
                    document_model=document.doc_form
                )
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -366,7 +366,8 @@ class IndexingRunner:
                    "notion_workspace_id": data_source_info['notion_workspace_id'],
                    "notion_obj_id": data_source_info['notion_page_id'],
                    "notion_page_type": data_source_info['type'],
                    "document": dataset_document
                    "document": dataset_document,
                    "tenant_id": dataset_document.tenant_id
                },
                document_model=dataset_document.doc_form
            )
--- a/api/core/rag/datasource/retrieval_service.py
+++ b/api/core/rag/datasource/retrieval_service.py
@@ -39,7 +39,8 @@ class RetrievalService:
                'flask_app': current_app._get_current_object(),
                'dataset_id': dataset_id,
                'query': query,
                'top_k': top_k
                'top_k': top_k,
                'all_documents': all_documents
            })
            threads.append(keyword_thread)
            keyword_thread.start()
--- a/api/core/rag/extractor/entity/extract_setting.py
+++ b/api/core/rag/extractor/entity/extract_setting.py
@@ -12,6 +12,7 @@ class NotionInfo(BaseModel):
    notion_obj_id: str
    notion_page_type: str
    document: Document = None
    tenant_id: str

    class Config:
        arbitrary_types_allowed = True
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -132,7 +132,8 @@ class ExtractProcessor:
                notion_workspace_id=extract_setting.notion_info.notion_workspace_id,
                notion_obj_id=extract_setting.notion_info.notion_obj_id,
                notion_page_type=extract_setting.notion_info.notion_page_type,
                document_model=extract_setting.notion_info.document
                document_model=extract_setting.notion_info.document,
                tenant_id=extract_setting.notion_info.tenant_id,
            )
            return extractor.extract()
        else:
--- a/api/core/rag/extractor/html_extractor.py
+++ b/api/core/rag/extractor/html_extractor.py
@@ -1,13 +1,14 @@
 """Abstract interface for document loader implementations."""
 from typing import Optional
 from bs4 import BeautifulSoup

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.extractor.helpers import detect_file_encodings
 from core.rag.models.document import Document


 class HtmlExtractor(BaseExtractor):
    """Load html files.

    """
    Load html files.


    Args:
@@ -15,57 +16,19 @@ class HtmlExtractor(BaseExtractor):
    """

    def __init__(
            self,
            file_path: str,
            encoding: Optional[str] = None,
            autodetect_encoding: bool = False,
            source_column: Optional[str] = None,
            csv_args: Optional[dict] = None,
        self,
        file_path: str
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._encoding = encoding
        self._autodetect_encoding = autodetect_encoding
        self.source_column = source_column
        self.csv_args = csv_args or {}

    def extract(self) -> list[Document]:
        """Load data into document objects."""
        try:
            with open(self._file_path, newline="", encoding=self._encoding) as csvfile:
                docs = self._read_from_file(csvfile)
        except UnicodeDecodeError as e:
            if self._autodetect_encoding:
                detected_encodings = detect_file_encodings(self._file_path)
                for encoding in detected_encodings:
                    try:
                        with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile:
                            docs = self._read_from_file(csvfile)
                        break
                    except UnicodeDecodeError:
                        continue
            else:
                raise RuntimeError(f"Error loading {self._file_path}") from e

        return docs
        return [Document(page_content=self._load_as_text())]

    def _read_from_file(self, csvfile) -> list[Document]:
        docs = []
        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
        for i, row in enumerate(csv_reader):
            content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())
            try:
                source = (
                    row[self.source_column]
                    if self.source_column is not None
                    else ''
                )
            except KeyError:
                raise ValueError(
                    f"Source column '{self.source_column}' not found in CSV file."
                )
            metadata = {"source": source, "row": i}
            doc = Document(page_content=content, metadata=metadata)
            docs.append(doc)
    def _load_as_text(self) -> str:
        with open(self._file_path, "rb") as fp:
            soup = BeautifulSoup(fp, 'html.parser')
            text = soup.get_text()
            text = text.strip() if text else ''

        return docs
        return text
--- a/api/core/rag/extractor/notion_extractor.py
+++ b/api/core/rag/extractor/notion_extractor.py
@@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor):
            notion_workspace_id: str,
            notion_obj_id: str,
            notion_page_type: str,
            tenant_id: str,
            document_model: Optional[DocumentModel] = None,
            notion_access_token: Optional[str] = None
            notion_access_token: Optional[str] = None,

    ):
        self._notion_access_token = None
        self._document_model = document_model
--- a/api/tasks/document_indexing_sync_task.py
+++ b/api/tasks/document_indexing_sync_task.py
@@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
            notion_workspace_id=workspace_id,
            notion_obj_id=page_id,
            notion_page_type=page_type,
            notion_access_token=data_source_binding.access_token
            notion_access_token=data_source_binding.access_token,
            tenant_id=document.tenant_id
        )

        last_edited_time = loader.get_notion_last_edited_time()