@@ -760,166 +760,168 @@ class DocumentService:
                )
            db.session.add(dataset_process_rule)
            db.session.commit()
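        # Take a per-dataset Redis lock so concurrent requests cannot read the
        # same document position or race the duplicate checks below; the 600s
        # timeout releases the lock if the holder dies mid-way.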
        lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
        with redis_client.lock(lock_name, timeout=600):
            position = DocumentService.get_documents_position(dataset.id)
            document_ids = []
            duplicate_document_ids = []
            if document_data["data_source"]["type"] == "upload_file":
                upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
                for file_id in upload_file_list:
                    file = (
                        db.session.query(UploadFile)
                        .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                        .first()
                    )
                    # raise error if file not found
                    if not file:
                        raise FileNotExistsError()
                    file_name = file.name
                    data_source_info = {
                        "upload_file_id": file_id,
                    }
                    # check duplicate
                    if document_data.get("duplicate", False):
                        document = Document.query.filter_by(
                            dataset_id=dataset.id,
                            tenant_id=current_user.current_tenant_id,
                            data_source_type="upload_file",
                            enabled=True,
                            name=file_name,
                        ).first()
                        if document:
                            document.dataset_process_rule_id = dataset_process_rule.id
                            document.updated_at = datetime.datetime.utcnow()
                            document.created_from = created_from
                            document.doc_form = document_data["doc_form"]
                            document.doc_language = document_data["doc_language"]
                            document.data_source_info = json.dumps(data_source_info)
                            document.batch = batch
                            document.indexing_status = "waiting"
                            db.session.add(document)
                            documents.append(document)
                            duplicate_document_ids.append(document.id)
                            continue
                    document = DocumentService.build_document(
                        dataset,
                        dataset_process_rule.id,
                        document_data["data_source"]["type"],
                        document_data["doc_form"],
                        document_data["doc_language"],
                        data_source_info,
                        created_from,
                        position,
                        account,
                        file_name,
                        batch,
                    )
                    db.session.add(document)
                    db.session.flush()
                    document_ids.append(document.id)
                    documents.append(document)
                    position += 1
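            # Notion import: reuse pages that already have documents, build new
            # documents for newly selected pages, and queue cleanup for pages
            # that were deselected.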
            elif document_data["data_source"]["type"] == "notion_import":
                notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
                exist_page_ids = []
                exist_document = {}
                documents = Document.query.filter_by(
                    dataset_id=dataset.id,
                    tenant_id=current_user.current_tenant_id,
                    data_source_type="notion_import",
                    enabled=True,
                ).all()
                if documents:
                    for document in documents:
                        data_source_info = json.loads(document.data_source_info)
                        exist_page_ids.append(data_source_info["notion_page_id"])
                        exist_document[data_source_info["notion_page_id"]] = document.id
                for notion_info in notion_info_list:
                    workspace_id = notion_info["workspace_id"]
                    data_source_binding = DataSourceOauthBinding.query.filter(
                        db.and_(
                            DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
                            DataSourceOauthBinding.provider == "notion",
                            DataSourceOauthBinding.disabled == False,
                            DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
                        )
                    ).first()
                    if not data_source_binding:
                        raise ValueError("Data source binding not found.")
                    for page in notion_info["pages"]:
                        if page["page_id"] not in exist_page_ids:
                            data_source_info = {
                                "notion_workspace_id": workspace_id,
                                "notion_page_id": page["page_id"],
                                "notion_page_icon": page["page_icon"],
                                "type": page["type"],
                            }
                            document = DocumentService.build_document(
                                dataset,
                                dataset_process_rule.id,
                                document_data["data_source"]["type"],
                                document_data["doc_form"],
                                document_data["doc_language"],
                                data_source_info,
                                created_from,
                                position,
                                account,
                                page["page_name"],
                                batch,
                            )
                            db.session.add(document)
                            db.session.flush()
                            document_ids.append(document.id)
                            documents.append(document)
                            position += 1
                        else:
                            exist_document.pop(page["page_id"])
                # delete not selected documents
                if len(exist_document) > 0:
                    clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
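            # Website crawl: one document per crawled URL; long URLs are
            # truncated so the document name stays within 255 characters.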
            elif document_data["data_source"]["type"] == "website_crawl":
                website_info = document_data["data_source"]["info_list"]["website_info_list"]
                urls = website_info["urls"]
                for url in urls:
                    data_source_info = {
                        "url": url,
                        "provider": website_info["provider"],
                        "job_id": website_info["job_id"],
                        "only_main_content": website_info.get("only_main_content", False),
                        "mode": "crawl",
                    }
                    if len(url) > 255:
                        document_name = url[:200] + "..."
                    else:
                        document_name = url
                    document = DocumentService.build_document(
                        dataset,
                        dataset_process_rule.id,
                        document_data["data_source"]["type"],
                        document_data["doc_form"],
                        document_data["doc_language"],
                        data_source_info,
                        created_from,
                        position,
                        account,
                        document_name,
                        batch,
                    )
                    db.session.add(document)
                    db.session.flush()
                    document_ids.append(document.id)
                    documents.append(document)
                    position += 1
            db.session.commit()
            # trigger async task
            if document_ids:
                document_indexing_task.delay(dataset.id, document_ids)
            if duplicate_document_ids:
                duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
        return documents, batch
    @staticmethod
    def check_documents_upload_quota(count: int, features: FeatureModel):