Signed-off-by: Yongtao Huang <yongtaoh2022@gmail.com> Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>tags/1.8.1
| ) | ) | ||||
| app_model.workflow_id = workflow.id | app_model.workflow_id = workflow.id | ||||
| db.session.commit() | |||||
| workflow_created_at = TimestampField().format(workflow.created_at) | workflow_created_at = TimestampField().format(workflow.created_at) | ||||
| from controllers.console import api | from controllers.console import api | ||||
| from controllers.console.wraps import account_initialization_required, setup_required | from controllers.console.wraps import account_initialization_required, setup_required | ||||
| from core.indexing_runner import IndexingRunner | from core.indexing_runner import IndexingRunner | ||||
| from core.rag.extractor.entity.datasource_type import DatasourceType | |||||
| from core.rag.extractor.entity.extract_setting import ExtractSetting | from core.rag.extractor.entity.extract_setting import ExtractSetting | ||||
| from core.rag.extractor.notion_extractor import NotionExtractor | from core.rag.extractor.notion_extractor import NotionExtractor | ||||
| from extensions.ext_database import db | from extensions.ext_database import db | ||||
| workspace_id = notion_info["workspace_id"] | workspace_id = notion_info["workspace_id"] | ||||
| for page in notion_info["pages"]: | for page in notion_info["pages"]: | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="notion_import", | |||||
| datasource_type=DatasourceType.NOTION.value, | |||||
| notion_info={ | notion_info={ | ||||
| "notion_workspace_id": workspace_id, | "notion_workspace_id": workspace_id, | ||||
| "notion_obj_id": page["page_id"], | "notion_obj_id": page["page_id"], |
| from core.plugin.entities.plugin import ModelProviderID | from core.plugin.entities.plugin import ModelProviderID | ||||
| from core.provider_manager import ProviderManager | from core.provider_manager import ProviderManager | ||||
| from core.rag.datasource.vdb.vector_type import VectorType | from core.rag.datasource.vdb.vector_type import VectorType | ||||
| from core.rag.extractor.entity.datasource_type import DatasourceType | |||||
| from core.rag.extractor.entity.extract_setting import ExtractSetting | from core.rag.extractor.entity.extract_setting import ExtractSetting | ||||
| from core.rag.retrieval.retrieval_methods import RetrievalMethod | from core.rag.retrieval.retrieval_methods import RetrievalMethod | ||||
| from extensions.ext_database import db | from extensions.ext_database import db | ||||
| if file_details: | if file_details: | ||||
| for file_detail in file_details: | for file_detail in file_details: | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="upload_file", upload_file=file_detail, document_model=args["doc_form"] | |||||
| datasource_type=DatasourceType.FILE.value, | |||||
| upload_file=file_detail, | |||||
| document_model=args["doc_form"], | |||||
| ) | ) | ||||
| extract_settings.append(extract_setting) | extract_settings.append(extract_setting) | ||||
| elif args["info_list"]["data_source_type"] == "notion_import": | elif args["info_list"]["data_source_type"] == "notion_import": | ||||
| workspace_id = notion_info["workspace_id"] | workspace_id = notion_info["workspace_id"] | ||||
| for page in notion_info["pages"]: | for page in notion_info["pages"]: | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="notion_import", | |||||
| datasource_type=DatasourceType.NOTION.value, | |||||
| notion_info={ | notion_info={ | ||||
| "notion_workspace_id": workspace_id, | "notion_workspace_id": workspace_id, | ||||
| "notion_obj_id": page["page_id"], | "notion_obj_id": page["page_id"], | ||||
| website_info_list = args["info_list"]["website_info_list"] | website_info_list = args["info_list"]["website_info_list"] | ||||
| for url in website_info_list["urls"]: | for url in website_info_list["urls"]: | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="website_crawl", | |||||
| datasource_type=DatasourceType.WEBSITE.value, | |||||
| website_info={ | website_info={ | ||||
| "provider": website_info_list["provider"], | "provider": website_info_list["provider"], | ||||
| "job_id": website_info_list["job_id"], | "job_id": website_info_list["job_id"], |
| from core.model_runtime.entities.model_entities import ModelType | from core.model_runtime.entities.model_entities import ModelType | ||||
| from core.model_runtime.errors.invoke import InvokeAuthorizationError | from core.model_runtime.errors.invoke import InvokeAuthorizationError | ||||
| from core.plugin.impl.exc import PluginDaemonClientSideError | from core.plugin.impl.exc import PluginDaemonClientSideError | ||||
| from core.rag.extractor.entity.datasource_type import DatasourceType | |||||
| from core.rag.extractor.entity.extract_setting import ExtractSetting | from core.rag.extractor.entity.extract_setting import ExtractSetting | ||||
| from extensions.ext_database import db | from extensions.ext_database import db | ||||
| from fields.document_fields import ( | from fields.document_fields import ( | ||||
| raise NotFound("File not found.") | raise NotFound("File not found.") | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="upload_file", upload_file=file, document_model=document.doc_form | |||||
| datasource_type=DatasourceType.FILE.value, upload_file=file, document_model=document.doc_form | |||||
| ) | ) | ||||
| indexing_runner = IndexingRunner() | indexing_runner = IndexingRunner() | ||||
| raise NotFound("File not found.") | raise NotFound("File not found.") | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="upload_file", upload_file=file_detail, document_model=document.doc_form | |||||
| datasource_type=DatasourceType.FILE.value, upload_file=file_detail, document_model=document.doc_form | |||||
| ) | ) | ||||
| extract_settings.append(extract_setting) | extract_settings.append(extract_setting) | ||||
| elif document.data_source_type == "notion_import": | elif document.data_source_type == "notion_import": | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="notion_import", | |||||
| datasource_type=DatasourceType.NOTION.value, | |||||
| notion_info={ | notion_info={ | ||||
| "notion_workspace_id": data_source_info["notion_workspace_id"], | "notion_workspace_id": data_source_info["notion_workspace_id"], | ||||
| "notion_obj_id": data_source_info["notion_page_id"], | "notion_obj_id": data_source_info["notion_page_id"], | ||||
| extract_settings.append(extract_setting) | extract_settings.append(extract_setting) | ||||
| elif document.data_source_type == "website_crawl": | elif document.data_source_type == "website_crawl": | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="website_crawl", | |||||
| datasource_type=DatasourceType.WEBSITE.value, | |||||
| website_info={ | website_info={ | ||||
| "provider": data_source_info["provider"], | "provider": data_source_info["provider"], | ||||
| "job_id": data_source_info["job_id"], | "job_id": data_source_info["job_id"], |
| from core.rag.cleaner.clean_processor import CleanProcessor | from core.rag.cleaner.clean_processor import CleanProcessor | ||||
| from core.rag.datasource.keyword.keyword_factory import Keyword | from core.rag.datasource.keyword.keyword_factory import Keyword | ||||
| from core.rag.docstore.dataset_docstore import DatasetDocumentStore | from core.rag.docstore.dataset_docstore import DatasetDocumentStore | ||||
| from core.rag.extractor.entity.datasource_type import DatasourceType | |||||
| from core.rag.extractor.entity.extract_setting import ExtractSetting | from core.rag.extractor.entity.extract_setting import ExtractSetting | ||||
| from core.rag.index_processor.constant.index_type import IndexType | from core.rag.index_processor.constant.index_type import IndexType | ||||
| from core.rag.index_processor.index_processor_base import BaseIndexProcessor | from core.rag.index_processor.index_processor_base import BaseIndexProcessor | ||||
| if file_detail: | if file_detail: | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="upload_file", upload_file=file_detail, document_model=dataset_document.doc_form | |||||
| datasource_type=DatasourceType.FILE.value, | |||||
| upload_file=file_detail, | |||||
| document_model=dataset_document.doc_form, | |||||
| ) | ) | ||||
| text_docs = index_processor.extract(extract_setting, process_rule_mode=process_rule["mode"]) | text_docs = index_processor.extract(extract_setting, process_rule_mode=process_rule["mode"]) | ||||
| elif dataset_document.data_source_type == "notion_import": | elif dataset_document.data_source_type == "notion_import": | ||||
| ): | ): | ||||
| raise ValueError("no notion import info found") | raise ValueError("no notion import info found") | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="notion_import", | |||||
| datasource_type=DatasourceType.NOTION.value, | |||||
| notion_info={ | notion_info={ | ||||
| "notion_workspace_id": data_source_info["notion_workspace_id"], | "notion_workspace_id": data_source_info["notion_workspace_id"], | ||||
| "notion_obj_id": data_source_info["notion_page_id"], | "notion_obj_id": data_source_info["notion_page_id"], | ||||
| ): | ): | ||||
| raise ValueError("no website import info found") | raise ValueError("no website import info found") | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="website_crawl", | |||||
| datasource_type=DatasourceType.WEBSITE.value, | |||||
| website_info={ | website_info={ | ||||
| "provider": data_source_info["provider"], | "provider": data_source_info["provider"], | ||||
| "job_id": data_source_info["job_id"], | "job_id": data_source_info["job_id"], |
| cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False | cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False | ||||
| ) -> Union[list[Document], str]: | ) -> Union[list[Document], str]: | ||||
| extract_setting = ExtractSetting( | extract_setting = ExtractSetting( | ||||
| datasource_type="upload_file", upload_file=upload_file, document_model="text_model" | |||||
| datasource_type=DatasourceType.FILE.value, upload_file=upload_file, document_model="text_model" | |||||
| ) | ) | ||||
| if return_text: | if return_text: | ||||
| delimiter = "\n" | delimiter = "\n" | ||||
| # https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521 | # https://stackoverflow.com/questions/26541416/generate-temporary-file-names-without-creating-actual-file-in-python#comment90414256_26541521 | ||||
| file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}" | file_path = f"{temp_dir}/{tempfile.gettempdir()}{suffix}" | ||||
| Path(file_path).write_bytes(response.content) | Path(file_path).write_bytes(response.content) | ||||
| extract_setting = ExtractSetting(datasource_type="upload_file", document_model="text_model") | |||||
| extract_setting = ExtractSetting(datasource_type=DatasourceType.FILE.value, document_model="text_model") | |||||
| if return_text: | if return_text: | ||||
| delimiter = "\n" | delimiter = "\n" | ||||
| return delimiter.join( | return delimiter.join( |