@@ -1,3 +1,4 @@
+import concurrent.futures
 import datetime
 import json
 import logging
@@ -650,17 +651,44 @@ class IndexingRunner:
         # chunk nodes by chunk size
         indexing_start_at = time.perf_counter()
         tokens = 0
-        chunk_size = 100
+        chunk_size = 10

         embedding_model_type_instance = None
         if embedding_model_instance:
             embedding_model_type_instance = embedding_model_instance.model_type_instance
             embedding_model_type_instance = cast(TextEmbeddingModel, embedding_model_type_instance)
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            futures = []
+            for i in range(0, len(documents), chunk_size):
+                chunk_documents = documents[i:i + chunk_size]
+                futures.append(executor.submit(self._process_chunk, current_app._get_current_object(), index_processor,
+                                               chunk_documents, dataset,
+                                               dataset_document, embedding_model_instance,
+                                               embedding_model_type_instance))
+
+            for future in futures:
+                tokens += future.result()

-        for i in range(0, len(documents), chunk_size):
+        indexing_end_at = time.perf_counter()
+
+        # update document status to completed
+        self._update_document_index_status(
+            document_id=dataset_document.id,
+            after_indexing_status="completed",
+            extra_update_params={
+                DatasetDocument.tokens: tokens,
+                DatasetDocument.completed_at: datetime.datetime.utcnow(),
+                DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
+            }
+        )
+
+    def _process_chunk(self, flask_app, index_processor, chunk_documents, dataset, dataset_document,
+                       embedding_model_instance, embedding_model_type_instance):
+        with flask_app.app_context():
             # check document is paused
             self._check_document_paused_status(dataset_document.id)
-            chunk_documents = documents[i:i + chunk_size]
+
+            tokens = 0
             if dataset.indexing_technique == 'high_quality' or embedding_model_type_instance:
                 tokens += sum(
                     embedding_model_type_instance.get_num_tokens(
@@ -670,9 +698,9 @@ class IndexingRunner:
                     )
                     for document in chunk_documents
                 )

             # load index
             index_processor.load(dataset, chunk_documents)
-            db.session.add(dataset)
+
             document_ids = [document.metadata['doc_id'] for document in chunk_documents]
             db.session.query(DocumentSegment).filter(
@@ -687,18 +715,7 @@ class IndexingRunner:
             db.session.commit()

-        indexing_end_at = time.perf_counter()
-
-        # update document status to completed
-        self._update_document_index_status(
-            document_id=dataset_document.id,
-            after_indexing_status="completed",
-            extra_update_params={
-                DatasetDocument.tokens: tokens,
-                DatasetDocument.completed_at: datetime.datetime.utcnow(),
-                DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
-            }
-        )
+            return tokens

     def _check_document_paused_status(self, document_id: str):
         indexing_cache_key = 'document_{}_is_paused'.format(document_id)
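
Note: the core of this change is a fan-out/fan-in pattern: split `documents` into chunks of `chunk_size`, submit one `_process_chunk` job per chunk to a `ThreadPoolExecutor`, then block on each future and sum the per-chunk token counts. Below is a minimal standalone sketch of that pattern, not the patch itself; `process_chunk`, `index_documents`, and the word-count "tokens" are illustrative stand-ins for the real worker and `get_num_tokens`.

import concurrent.futures

# Hypothetical stand-in for IndexingRunner._process_chunk: counts "tokens"
# for one batch of documents. In the patch, the real worker additionally
# receives the Flask app object (current_app._get_current_object()) so it
# can open its own app_context() in the thread; current_app itself is a
# context-local proxy and is not visible inside worker threads.
def process_chunk(chunk_documents: list[str]) -> int:
    # stand-in for embedding_model_type_instance.get_num_tokens(...) per document
    return sum(len(doc.split()) for doc in chunk_documents)

def index_documents(documents: list[str], chunk_size: int = 10, max_workers: int = 10) -> int:
    tokens = 0
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # fan out: one future per chunk of chunk_size documents
        futures = [
            executor.submit(process_chunk, documents[i:i + chunk_size])
            for i in range(0, len(documents), chunk_size)
        ]
        # fan in: future.result() blocks until the worker finishes and
        # re-raises any exception raised inside the worker thread
        for future in futures:
            tokens += future.result()
    return tokens

if __name__ == "__main__":
    docs = [f"document number {n}" for n in range(95)]
    print(index_documents(docs))  # 95 docs -> 10 chunks, 285 "tokens"

This also explains the per-chunk bookkeeping in the patch: each worker enters `flask_app.app_context()` before touching `db.session`, marks only its own chunk's `DocumentSegment` rows completed, commits, and returns its token count, while the single document-level `_update_document_index_status(...)` call moves out of the loop and runs once after all futures resolve.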