                )
            db.session.add(dataset_process_rule)
            db.session.commit()
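        # Per-dataset Redis lock (auto-expires after 600 s) so concurrent requests
        # cannot interleave while positions are assigned and documents are created.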
        lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
        with redis_client.lock(lock_name, timeout=600):
            position = DocumentService.get_documents_position(dataset.id)
            document_ids = []
            duplicate_document_ids = []
            if document_data["data_source"]["type"] == "upload_file":
                upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
                for file_id in upload_file_list:
                    file = (
                        db.session.query(UploadFile)
                        .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                        .first()
                    )

                    # raise error if file not found
                    if not file:
                        raise FileNotExistsError()

                    file_name = file.name
                    data_source_info = {
                        "upload_file_id": file_id,
                    }
                    # check duplicate
                    if document_data.get("duplicate", False):
                        document = Document.query.filter_by(
                            dataset_id=dataset.id,
                            tenant_id=current_user.current_tenant_id,
                            data_source_type="upload_file",
                            enabled=True,
                            name=file_name,
                        ).first()
                        if document:
                            document.dataset_process_rule_id = dataset_process_rule.id
                            document.updated_at = datetime.datetime.utcnow()
                            document.created_from = created_from
                            document.doc_form = document_data["doc_form"]
                            document.doc_language = document_data["doc_language"]
                            document.data_source_info = json.dumps(data_source_info)
                            document.batch = batch
                            document.indexing_status = "waiting"
                            db.session.add(document)
                            documents.append(document)
                            duplicate_document_ids.append(document.id)
                            continue
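                    # no duplicate matched (or duplicate handling disabled): create a new document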
                    document = DocumentService.build_document(
                        dataset,
                        dataset_process_rule.id,
                        document_data["data_source"]["type"],
                        document_data["doc_form"],
                        document_data["doc_language"],
                        data_source_info,
                        created_from,
                        position,
                        account,
                        file_name,
                        batch,
                    )
                    db.session.add(document)
                    db.session.flush()
                    document_ids.append(document.id)
                    documents.append(document)
                    position += 1
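            # Notion imports: compare the requested pages against documents that
            # already exist for this dataset, creating only the missing ones.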
            elif document_data["data_source"]["type"] == "notion_import":
                notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
                exist_page_ids = []
                exist_document = {}
                documents = Document.query.filter_by(
                    dataset_id=dataset.id,
                    tenant_id=current_user.current_tenant_id,
                    data_source_type="notion_import",
                    enabled=True,
                ).all()
                if documents:
                    for document in documents:
                        data_source_info = json.loads(document.data_source_info)
                        exist_page_ids.append(data_source_info["notion_page_id"])
                        exist_document[data_source_info["notion_page_id"]] = document.id
                for notion_info in notion_info_list:
                    workspace_id = notion_info["workspace_id"]
                    data_source_binding = DataSourceOauthBinding.query.filter(
                        db.and_(
                            DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
                            DataSourceOauthBinding.provider == "notion",
                            DataSourceOauthBinding.disabled == False,
                            DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
                        )
                    ).first()
                    if not data_source_binding:
                        raise ValueError("Data source binding not found.")
                    for page in notion_info["pages"]:
                        if page["page_id"] not in exist_page_ids:
                            data_source_info = {
                                "notion_workspace_id": workspace_id,
                                "notion_page_id": page["page_id"],
                                "notion_page_icon": page["page_icon"],
                                "type": page["type"],
                            }
                            document = DocumentService.build_document(
                                dataset,
                                dataset_process_rule.id,
                                document_data["data_source"]["type"],
                                document_data["doc_form"],
                                document_data["doc_language"],
                                data_source_info,
                                created_from,
                                position,
                                account,
                                page["page_name"],
                                batch,
                            )
                            db.session.add(document)
                            db.session.flush()
                            document_ids.append(document.id)
                            documents.append(document)
                            position += 1
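                        # page was already imported: drop it from exist_document
                        # so it is not cleaned up below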
                        else:
                            exist_document.pop(page["page_id"])
                # delete not selected documents
                if len(exist_document) > 0:
                    clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
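            # Website crawls: one document per crawled URL, named after the URL
            # (truncated when it exceeds the 255-character name limit).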
            elif document_data["data_source"]["type"] == "website_crawl":
                website_info = document_data["data_source"]["info_list"]["website_info_list"]
                urls = website_info["urls"]
                for url in urls:
                    data_source_info = {
                        "url": url,
                        "provider": website_info["provider"],
                        "job_id": website_info["job_id"],
                        "only_main_content": website_info.get("only_main_content", False),
                        "mode": "crawl",
                    }
                    if len(url) > 255:
                        document_name = url[:200] + "..."
                    else:
                        document_name = url
                    document = DocumentService.build_document(
                        dataset,
                        dataset_process_rule.id,
                        document_data["data_source"]["type"],
                        document_data["doc_form"],
                        document_data["doc_language"],
                        data_source_info,
                        created_from,
                        position,
                        account,
                        document_name,
                        batch,
                    )
                    db.session.add(document)
                    db.session.flush()
                    document_ids.append(document.id)
                    documents.append(document)
                    position += 1
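            # commit all documents created in this batch in a single transaction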
            db.session.commit()

            # trigger async task
            if document_ids:
                document_indexing_task.delay(dataset.id, document_ids)
            if duplicate_document_ids:
                duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)

        return documents, batch

    @staticmethod
    def check_documents_upload_quota(count: int, features: FeatureModel):