@@ -11,6 +11,7 @@ from flask import current_app, Flask
 from flask_login import current_user
 from langchain.schema import Document
 from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
+from sqlalchemy.orm.exc import ObjectDeletedError
 
 from core.data_loader.file_extractor import FileExtractor
 from core.data_loader.loader.notion import NotionLoader
@@ -79,6 +80,8 @@ class IndexingRunner:
                 dataset_document.error = str(e.description)
                 dataset_document.stopped_at = datetime.datetime.utcnow()
                 db.session.commit()
+            except ObjectDeletedError:
+                logging.warning('Document deleted, document id: {}'.format(dataset_document.id))
             except Exception as e:
                 logging.exception("consume document failed")
                 dataset_document.indexing_status = 'error'
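
Note on the hunk above: SQLAlchemy raises ObjectDeletedError when an attribute refresh finds that the row behind a mapped instance no longer exists, which is exactly what happens when a user deletes a document while the runner is still indexing it. Catching it per document lets the loop log and move on instead of failing the whole batch. A self-contained sketch of the failure mode (the Doc model and the inline DELETE are illustrative stand-ins, not part of this change):

    from sqlalchemy import Column, Integer, String, create_engine, text
    from sqlalchemy.orm import Session, declarative_base
    from sqlalchemy.orm.exc import ObjectDeletedError

    Base = declarative_base()

    class Doc(Base):  # hypothetical stand-in for DatasetDocument
        __tablename__ = 'docs'
        id = Column(Integer, primary_key=True)
        indexing_status = Column(String)

    engine = create_engine('sqlite://')
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        doc = Doc(id=1, indexing_status='indexing')
        session.add(doc)
        session.commit()                           # expires doc's attributes
        session.execute(text('DELETE FROM docs'))  # simulate a concurrent delete
        try:
            print(doc.indexing_status)             # refresh finds no row
        except ObjectDeletedError:
            print('row is gone; log a warning and skip, as the hunk above does')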
@@ -276,7 +279,8 @@ class IndexingRunner:
             )
             if len(preview_texts) > 0:
                 # qa model document
-                response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0], doc_language)
+                response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0],
+                                                             doc_language)
                 document_qa_list = self.format_split_text(response)
                 return {
                     "total_segments": total_segments * 20,
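
The hunk above only re-wraps a long call; the arguments and behavior are unchanged. For orientation, format_split_text (defined elsewhere in this file, not shown here) turns the generator's raw response into the QA pairs behind the "total_segments * 20" estimate. A plausible sketch, assuming the model answers in "Q1: ... A1: ..." form; neither the response shape nor this parsing is confirmed by the diff:

    import re
    from typing import Dict, List

    def format_split_text(text: str) -> List[Dict[str, str]]:
        # Assumed response shape: "Q1: ... A1: ... Q2: ... A2: ..."
        pattern = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
        return [
            {'question': q.strip(), 'answer': a.strip()}
            for q, a in re.findall(pattern, text)
            if q.strip() and a.strip()
        ]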
@@ -372,7 +376,8 @@ class IndexingRunner:
             )
             if len(preview_texts) > 0:
                 # qa model document
-                response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0], doc_language)
+                response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0],
+                                                             doc_language)
                 document_qa_list = self.format_split_text(response)
                 return {
                     "total_segments": total_segments * 20,
@@ -582,7 +587,6 @@ class IndexingRunner:
 
                     all_qa_documents.extend(format_documents)
 
-
     def _split_to_documents_for_estimate(self, text_docs: List[Document], splitter: TextSplitter,
                                          processing_rule: DatasetProcessRule) -> List[Document]:
         """
@@ -734,6 +738,9 @@ class IndexingRunner:
         count = DatasetDocument.query.filter_by(id=document_id, is_paused=True).count()
         if count > 0:
             raise DocumentIsPausedException()
+        document = DatasetDocument.query.filter_by(id=document_id).first()
+        if not document:
+            raise DocumentIsDeletedPausedException()
 
         update_params = {
             DatasetDocument.indexing_status: after_indexing_status
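
The three added lines close a gap in this status check: a document deleted mid-run used to fall through to the update below and fail later with a far less readable error. Raising a dedicated exception lets callers separate "user paused this" from "this row is gone". A hedged sketch of a call site (the function and method names are assumptions; only the two exception classes come from this diff):

    def advance_document(runner, document_id: str) -> None:
        try:
            # Assumed enclosing method for the hunk above; the diff does not
            # show its name or signature.
            runner._update_document_index_status(
                document_id=document_id,
                after_indexing_status='splitting',
            )
        except DocumentIsPausedException:
            pass  # user paused the document: stop quietly, resume later
        except DocumentIsDeletedPausedException:
            pass  # row vanished mid-run: drop this document, keep the batch alive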
@@ -781,3 +788,7 @@ class IndexingRunner:
 
 class DocumentIsPausedException(Exception):
     pass
+
+
+class DocumentIsDeletedPausedException(Exception):
+    pass
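
Design note on the new class: both exceptions mean "stop working on this document", so an alternative shape (not what this diff does) is to subclass, which would let every existing "except DocumentIsPausedException" handler cover the deleted case as well:

    class DocumentIsDeletedPausedException(DocumentIsPausedException):
        pass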