| @@ -7,6 +7,7 @@ import time | |||
| import uuid | |||
| from typing import Optional, List, cast | |||
| from flask import current_app, Flask | |||
| from flask_login import current_user | |||
| from langchain.schema import Document | |||
| from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter | |||
| @@ -522,7 +523,8 @@ class IndexingRunner: | |||
| sub_documents = all_documents[i:i + 10] | |||
| for doc in sub_documents: | |||
| document_format_thread = threading.Thread(target=self.format_qa_document, kwargs={ | |||
| 'tenant_id': tenant_id, 'document_node': doc, 'all_qa_documents': all_qa_documents}) | |||
| 'flask_app': current_app._get_current_object(), 'tenant_id': tenant_id, 'document_node': doc, | |||
| 'all_qa_documents': all_qa_documents}) | |||
| threads.append(document_format_thread) | |||
| document_format_thread.start() | |||
| for thread in threads: | |||
| @@ -530,28 +532,29 @@ class IndexingRunner: | |||
| return all_qa_documents | |||
| return all_documents | |||
| def format_qa_document(self, tenant_id: str, document_node, all_qa_documents): | |||
| def format_qa_document(self, flask_app: Flask, tenant_id: str, document_node, all_qa_documents): | |||
| format_documents = [] | |||
| if document_node.page_content is None or not document_node.page_content.strip(): | |||
| return | |||
| try: | |||
| # qa model document | |||
| response = LLMGenerator.generate_qa_document(tenant_id, document_node.page_content) | |||
| document_qa_list = self.format_split_text(response) | |||
| qa_documents = [] | |||
| for result in document_qa_list: | |||
| qa_document = Document(page_content=result['question'], metadata=document_node.metadata.copy()) | |||
| doc_id = str(uuid.uuid4()) | |||
| hash = helper.generate_text_hash(result['question']) | |||
| qa_document.metadata['answer'] = result['answer'] | |||
| qa_document.metadata['doc_id'] = doc_id | |||
| qa_document.metadata['doc_hash'] = hash | |||
| qa_documents.append(qa_document) | |||
| format_documents.extend(qa_documents) | |||
| except Exception as e: | |||
| logging.exception(e) | |||
| with flask_app.app_context(): | |||
| try: | |||
| # qa model document | |||
| response = LLMGenerator.generate_qa_document(tenant_id, document_node.page_content) | |||
| document_qa_list = self.format_split_text(response) | |||
| qa_documents = [] | |||
| for result in document_qa_list: | |||
| qa_document = Document(page_content=result['question'], metadata=document_node.metadata.copy()) | |||
| doc_id = str(uuid.uuid4()) | |||
| hash = helper.generate_text_hash(result['question']) | |||
| qa_document.metadata['answer'] = result['answer'] | |||
| qa_document.metadata['doc_id'] = doc_id | |||
| qa_document.metadata['doc_hash'] = hash | |||
| qa_documents.append(qa_document) | |||
| format_documents.extend(qa_documents) | |||
| except Exception as e: | |||
| logging.exception(e) | |||
| all_qa_documents.extend(format_documents) | |||
| all_qa_documents.extend(format_documents) | |||
| def _split_to_documents_for_estimate(self, text_docs: List[Document], splitter: TextSplitter, | |||