|
|
|
|
|
|
|
|
credentials=embedding_model.credentials, |
|
|
credentials=embedding_model.credentials, |
|
|
texts=[content] |
|
|
texts=[content] |
|
|
) |
|
|
) |
|
|
max_position = db.session.query(func.max(DocumentSegment.position)).filter( |
|
|
|
|
|
DocumentSegment.document_id == document.id |
|
|
|
|
|
).scalar() |
|
|
|
|
|
segment_document = DocumentSegment( |
|
|
|
|
|
tenant_id=current_user.current_tenant_id, |
|
|
|
|
|
dataset_id=document.dataset_id, |
|
|
|
|
|
document_id=document.id, |
|
|
|
|
|
index_node_id=doc_id, |
|
|
|
|
|
index_node_hash=segment_hash, |
|
|
|
|
|
position=max_position + 1 if max_position else 1, |
|
|
|
|
|
content=content, |
|
|
|
|
|
word_count=len(content), |
|
|
|
|
|
tokens=tokens, |
|
|
|
|
|
status='completed', |
|
|
|
|
|
indexing_at=datetime.datetime.utcnow(), |
|
|
|
|
|
completed_at=datetime.datetime.utcnow(), |
|
|
|
|
|
created_by=current_user.id |
|
|
|
|
|
) |
|
|
|
|
|
if document.doc_form == 'qa_model': |
|
|
|
|
|
segment_document.answer = args['answer'] |
|
|
|
|
|
|
|
|
|
|
|
db.session.add(segment_document) |
|
|
|
|
|
db.session.commit() |
|
|
|
|
|
|
|
|
|
|
|
# save vector index |
|
|
|
|
|
try: |
|
|
|
|
|
VectorService.create_segments_vector([args['keywords']], [segment_document], dataset) |
|
|
|
|
|
except Exception as e: |
|
|
|
|
|
logging.exception("create segment index failed") |
|
|
|
|
|
segment_document.enabled = False |
|
|
|
|
|
segment_document.disabled_at = datetime.datetime.utcnow() |
|
|
|
|
|
segment_document.status = 'error' |
|
|
|
|
|
segment_document.error = str(e) |
|
|
|
|
|
db.session.commit() |
|
|
|
|
|
segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_document.id).first() |
|
|
|
|
|
return segment |
|
|
|
|
|
|
|
|
|
|
|
@classmethod
def multi_create_segment(cls, segments: list, document: Document, dataset: Dataset):
    """Create a batch of segments for *document* in one transaction.

    Each item of *segments* is a dict with at least a 'content' and a
    'keywords' key, plus an 'answer' key when the document uses the
    'qa_model' form.  Returns the list of persisted DocumentSegment rows.
    On vector-index failure the rows are kept but marked disabled/error.
    """
    # Serialize concurrent batch additions to the same document so that
    # position numbering stays consistent across workers.
    lock_name = 'multi_add_segment_lock_document_id_{}'.format(document.id)
    with redis_client.lock(lock_name, timeout=600):
        embedding_model = None
        if dataset.indexing_technique == 'high_quality':
            # Resolve the tenant's embedding model once for the whole batch.
            model_manager = ModelManager()
            embedding_model = model_manager.get_model_instance(
                tenant_id=current_user.current_tenant_id,
                provider=dataset.embedding_model_provider,
                model_type=ModelType.TEXT_EMBEDDING,
                model=dataset.embedding_model
            )
        # Highest existing position in this document; new segments continue
        # after it.  None (no rows yet) is treated as 0.
        max_position = db.session.query(func.max(DocumentSegment.position)).filter(
            DocumentSegment.document_id == document.id
        ).scalar()
        base_position = max_position or 0
        segment_data_list = []
        keywords_list = []
        for idx, segment_item in enumerate(segments):
            content = segment_item['content']
            doc_id = str(uuid.uuid4())
            segment_hash = helper.generate_text_hash(content)
            tokens = 0
            if dataset.indexing_technique == 'high_quality' and embedding_model:
                # calc embedding use tokens
                model_type_instance = cast(TextEmbeddingModel, embedding_model.model_type_instance)
                tokens = model_type_instance.get_num_tokens(
                    model=embedding_model.model,
                    credentials=embedding_model.credentials,
                    texts=[content]
                )
            segment_document = DocumentSegment(
                tenant_id=current_user.current_tenant_id,
                dataset_id=document.dataset_id,
                document_id=document.id,
                index_node_id=doc_id,
                index_node_hash=segment_hash,
                # Give every segment in the batch its own position instead of
                # reusing the same max_position + 1 for all of them.
                position=base_position + idx + 1,
                content=content,
                word_count=len(content),
                tokens=tokens,
                status='completed',
                indexing_at=datetime.datetime.utcnow(),
                completed_at=datetime.datetime.utcnow(),
                created_by=current_user.id
            )
            if document.doc_form == 'qa_model':
                segment_document.answer = segment_item['answer']
            db.session.add(segment_document)
            segment_data_list.append(segment_document)
            keywords_list.append(segment_item['keywords'])
        db.session.commit()
        try:
            # save vector index
            VectorService.create_segments_vector(keywords_list, segment_data_list, dataset)
        except Exception as e:
            logging.exception("create segment index failed")
            # Keep the rows but mark the whole batch as failed so it can be
            # inspected / retried later.
            for segment_document in segment_data_list:
                segment_document.enabled = False
                segment_document.disabled_at = datetime.datetime.utcnow()
                segment_document.status = 'error'
                segment_document.error = str(e)
        db.session.commit()
        return segment_data_list
|
|
|
|
|
|
|
|
|
|
|
@classmethod
def multi_create_segment(cls, segments: list, document: Document, dataset: Dataset):
    """Create a batch of segments for *document* under a per-document lock.

    Each item of *segments* is a dict with at least a 'content' and a
    'keywords' key, plus an 'answer' key when the document uses the
    'qa_model' form.  Returns the list of persisted DocumentSegment rows.
    On vector-index failure the rows are kept but marked disabled/error.
    """
    # Serialize concurrent batch additions to the same document so that
    # position numbering stays consistent across workers.
    lock_name = 'multi_add_segment_lock_document_id_{}'.format(document.id)
    with redis_client.lock(lock_name, timeout=600):
        embedding_model = None
        if dataset.indexing_technique == 'high_quality':
            # Resolve the tenant's embedding model once for the whole batch.
            model_manager = ModelManager()
            embedding_model = model_manager.get_model_instance(
                tenant_id=current_user.current_tenant_id,
                provider=dataset.embedding_model_provider,
                model_type=ModelType.TEXT_EMBEDDING,
                model=dataset.embedding_model
            )
        # Highest existing position in this document; new segments continue
        # after it.  None (no rows yet) is treated as 0.
        max_position = db.session.query(func.max(DocumentSegment.position)).filter(
            DocumentSegment.document_id == document.id
        ).scalar()
        base_position = max_position or 0
        # One list serves both the vector-index call and the return value
        # (the original kept two identical lists).
        segment_data_list = []
        keywords_list = []
        for idx, segment_item in enumerate(segments):
            content = segment_item['content']
            doc_id = str(uuid.uuid4())
            segment_hash = helper.generate_text_hash(content)
            tokens = 0
            if dataset.indexing_technique == 'high_quality' and embedding_model:
                # calc embedding use tokens
                model_type_instance = cast(TextEmbeddingModel, embedding_model.model_type_instance)
                tokens = model_type_instance.get_num_tokens(
                    model=embedding_model.model,
                    credentials=embedding_model.credentials,
                    texts=[content]
                )
            segment_document = DocumentSegment(
                tenant_id=current_user.current_tenant_id,
                dataset_id=document.dataset_id,
                document_id=document.id,
                index_node_id=doc_id,
                index_node_hash=segment_hash,
                # Bug fix: the original assigned max_position + 1 to EVERY
                # segment in the batch, producing duplicate positions.  Give
                # each one its own consecutive slot instead.
                position=base_position + idx + 1,
                content=content,
                word_count=len(content),
                tokens=tokens,
                status='completed',
                indexing_at=datetime.datetime.utcnow(),
                completed_at=datetime.datetime.utcnow(),
                created_by=current_user.id
            )
            if document.doc_form == 'qa_model':
                segment_document.answer = segment_item['answer']
            db.session.add(segment_document)
            segment_data_list.append(segment_document)
            keywords_list.append(segment_item['keywords'])
        try:
            # save vector index
            VectorService.create_segments_vector(keywords_list, segment_data_list, dataset)
        except Exception as e:
            logging.exception("create segment index failed")
            # Keep the rows but mark the whole batch as failed so it can be
            # inspected / retried later.
            for segment_document in segment_data_list:
                segment_document.enabled = False
                segment_document.disabled_at = datetime.datetime.utcnow()
                segment_document.status = 'error'
                segment_document.error = str(e)
        db.session.commit()
        return segment_data_list
|
|
|
|
|
|
|
|
@classmethod |
|
|
@classmethod |
|
|
def update_segment(cls, args: dict, segment: DocumentSegment, document: Document, dataset: Dataset): |
|
|
def update_segment(cls, args: dict, segment: DocumentSegment, document: Document, dataset: Dataset): |