Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>

2 months ago · bc9efa7ea8
--- a/api/controllers/console/app/workflow.py
+++ b/api/controllers/console/app/workflow.py
            )
            app_model.workflow_id = workflow.id
            db.session.commit()
            workflow_created_at = TimestampField().format(workflow.created_at)
--- a/api/controllers/console/datasets/data_source.py
+++ b/api/controllers/console/datasets/data_source.py
 from controllers.console import api
 from controllers.console.wraps import account_initialization_required, setup_required
 from core.indexing_runner import IndexingRunner
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
 from core.rag.extractor.notion_extractor import NotionExtractor
 from extensions.ext_database import db
            workspace_id = notion_info["workspace_id"]
            for page in notion_info["pages"]:
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    datasource_type=DatasourceType.NOTION.value,
                    notion_info={
                        "notion_workspace_id": workspace_id,
                        "notion_obj_id": page["page_id"],
--- a/api/controllers/console/datasets/datasets.py
+++ b/api/controllers/console/datasets/datasets.py
 from core.plugin.entities.plugin import ModelProviderID
 from core.provider_manager import ProviderManager
 from core.rag.datasource.vdb.vector_type import VectorType
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from extensions.ext_database import db
            if file_details:
                for file_detail in file_details:
                    extract_setting = ExtractSetting(
                        datasource_type="upload_file", upload_file=file_detail, document_model=args["doc_form"]
                        datasource_type=DatasourceType.FILE.value,
                        upload_file=file_detail,
                        document_model=args["doc_form"],
                    )
                    extract_settings.append(extract_setting)
        elif args["info_list"]["data_source_type"] == "notion_import":
                workspace_id = notion_info["workspace_id"]
                for page in notion_info["pages"]:
                    extract_setting = ExtractSetting(
                        datasource_type="notion_import",
                        datasource_type=DatasourceType.NOTION.value,
                        notion_info={
                            "notion_workspace_id": workspace_id,
                            "notion_obj_id": page["page_id"],
            website_info_list = args["info_list"]["website_info_list"]
            for url in website_info_list["urls"]:
                extract_setting = ExtractSetting(
                    datasource_type="website_crawl",
                    datasource_type=DatasourceType.WEBSITE.value,
                    website_info={
                        "provider": website_info_list["provider"],
                        "job_id": website_info_list["job_id"],
--- a/api/controllers/console/datasets/datasets_document.py
+++ b/api/controllers/console/datasets/datasets_document.py
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.errors.invoke import InvokeAuthorizationError
 from core.plugin.impl.exc import PluginDaemonClientSideError
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
 from extensions.ext_database import db
 from fields.document_fields import (
                    raise NotFound("File not found.")
                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file, document_model=document.doc_form
                    datasource_type=DatasourceType.FILE.value, upload_file=file, document_model=document.doc_form
                )
                indexing_runner = IndexingRunner()
                    raise NotFound("File not found.")
                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file_detail, document_model=document.doc_form
                    datasource_type=DatasourceType.FILE.value, upload_file=file_detail, document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == "notion_import":
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    datasource_type=DatasourceType.NOTION.value,
                    notion_info={
                        "notion_workspace_id": data_source_info["notion_workspace_id"],
                        "notion_obj_id": data_source_info["notion_page_id"],
                extract_settings.append(extract_setting)
            elif document.data_source_type == "website_crawl":
                extract_setting = ExtractSetting(
                    datasource_type="website_crawl",
                    datasource_type=DatasourceType.WEBSITE.value,
                    website_info={
                        "provider": data_source_info["provider"],
                        "job_id": data_source_info["job_id"],
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
 from core.rag.cleaner.clean_processor import CleanProcessor
 from core.rag.datasource.keyword.keyword_factory import Keyword
 from core.rag.docstore.dataset_docstore import DatasetDocumentStore
 from core.rag.extractor.entity.datasource_type import DatasourceType
 from core.rag.extractor.entity.extract_setting import ExtractSetting
 from core.rag.index_processor.constant.index_type import IndexType
 from core.rag.index_processor.index_processor_base import BaseIndexProcessor
            if file_detail:
                extract_setting = ExtractSetting(
                    datasource_type="upload_file", upload_file=file_detail, document_model=dataset_document.doc_form
                    datasource_type=DatasourceType.FILE.value,
                    upload_file=file_detail,
                    document_model=dataset_document.doc_form,
                )
                text_docs = index_processor.extract(extract_setting, process_rule_mode=process_rule["mode"])
        elif dataset_document.data_source_type == "notion_import":
            ):
                raise ValueError("no notion import info found")
            extract_setting = ExtractSetting(
                datasource_type="notion_import",
                datasource_type=DatasourceType.NOTION.value,
                notion_info={
                    "notion_workspace_id": data_source_info["notion_workspace_id"],
                    "notion_obj_id": data_source_info["notion_page_id"],
            ):
                raise ValueError("no website import info found")
            extract_setting = ExtractSetting(
                datasource_type="website_crawl",
                datasource_type=DatasourceType.WEBSITE.value,
                website_info={
                    "provider": data_source_info["provider"],
                    "job_id": data_source_info["job_id"],
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
        cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False
    ) -> Union[list[Document], str]:
        extract_setting = ExtractSetting(
            datasource_type="upload_file", upload_file=upload_file, document_model="text_model"
            datasource_type=DatasourceType.FILE.value, upload_file=upload_file, document_model="text_model"
        )
        if return_text:
            delimiter = "\n"
            # https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521
            file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}"
            Path(file_path).write_bytes(response.content)
            extract_setting = ExtractSetting(datasource_type="upload_file", document_model="text_model")
            extract_setting = ExtractSetting(datasource_type=DatasourceType.FILE.value, document_model="text_model")
            if return_text:
                delimiter = "\n"
                return delimiter.join(