Signed-off-by: kenwoodjw <blackxin55+@gmail.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
```diff
 # node_ids is segment's node_ids
 if dataset.indexing_technique == "high_quality":
     delete_child_chunks = kwargs.get("delete_child_chunks") or False
+    precomputed_child_node_ids = kwargs.get("precomputed_child_node_ids")
     vector = Vector(dataset)
     if node_ids:
-        child_node_ids = (
-            db.session.query(ChildChunk.index_node_id)
-            .join(DocumentSegment, ChildChunk.segment_id == DocumentSegment.id)
-            .where(
-                DocumentSegment.dataset_id == dataset.id,
-                DocumentSegment.index_node_id.in_(node_ids),
-                ChildChunk.dataset_id == dataset.id,
-            )
-            .all()
-        )
-        child_node_ids = [child_node_id[0] for child_node_id in child_node_ids]
-        vector.delete_by_ids(child_node_ids)
-        if delete_child_chunks:
+        # Use precomputed child_node_ids if available (to avoid race conditions)
+        if precomputed_child_node_ids is not None:
+            child_node_ids = precomputed_child_node_ids
+        else:
+            # Fallback to original query (may fail if segments are already deleted)
+            child_node_ids = (
+                db.session.query(ChildChunk.index_node_id)
+                .join(DocumentSegment, ChildChunk.segment_id == DocumentSegment.id)
+                .where(
+                    DocumentSegment.dataset_id == dataset.id,
+                    DocumentSegment.index_node_id.in_(node_ids),
+                    ChildChunk.dataset_id == dataset.id,
+                )
+                .all()
+            )
+            child_node_ids = [child_node_id[0] for child_node_id in child_node_ids if child_node_id[0]]
+        # Delete from vector index
+        if child_node_ids:
+            vector.delete_by_ids(child_node_ids)
+        # Delete from database
+        if delete_child_chunks and child_node_ids:
             db.session.query(ChildChunk).where(
                 ChildChunk.dataset_id == dataset.id, ChildChunk.index_node_id.in_(child_node_ids)
             ).delete(synchronize_session=False)
```
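The new kwarg exists because the caller (the Celery task below) may run after the parent `DocumentSegment` rows are gone, at which point the join query returns nothing and the child vectors are never cleaned up. A minimal sketch of the prefer-precomputed/fallback pattern, with hypothetical names (`resolve_child_node_ids` and `query_fallback` are illustrative, not part of the codebase):

```python
from collections.abc import Callable


def resolve_child_node_ids(
    precomputed: list[str] | None,
    query_fallback: Callable[[], list[str]],
) -> list[str]:
    """Prefer IDs captured before the parent rows were deleted."""
    if precomputed is not None:
        # IDs were captured while the parents still existed; trust them.
        return [node_id for node_id in precomputed if node_id]
    # Fallback: re-query. Returns [] if the parent segments are already gone,
    # which is exactly the race the precomputed path avoids.
    return [node_id for node_id in query_fallback() if node_id]


# Simulate the race: parents already deleted, so the fallback query finds nothing.
stale_query: Callable[[], list[str]] = lambda: []
print(resolve_child_node_ids(["c1", "c2"], stale_query))  # ['c1', 'c2']
print(resolve_child_node_ids(None, stale_query))          # []
```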
```diff
 if segment.enabled:
     # send delete segment index task
     redis_client.setex(indexing_cache_key, 600, 1)
-    delete_segment_from_index_task.delay([segment.index_node_id], dataset.id, document.id)
+    # Get child chunk IDs before parent segment is deleted
+    child_node_ids = []
+    if segment.index_node_id:
+        child_chunks = (
+            db.session.query(ChildChunk.index_node_id)
+            .where(
+                ChildChunk.segment_id == segment.id,
+                ChildChunk.dataset_id == dataset.id,
+            )
+            .all()
+        )
+        child_node_ids = [chunk[0] for chunk in child_chunks if chunk[0]]
+    delete_segment_from_index_task.delay([segment.index_node_id], dataset.id, document.id, child_node_ids)
 db.session.delete(segment)
 # update document word count
 assert document.word_count is not None
```
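The ordering in the hunk above is the core of the fix: child chunk IDs are read while the segment row still exists, then handed to the async task. A toy demonstration of why the old order lost the IDs, using an in-memory SQLite database and a made-up two-table schema (`Segment`/`Child` stand in for Dify's models, they are not the real ones):

```python
from sqlalchemy import ForeignKey, String, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class Segment(Base):
    __tablename__ = "segment"
    id: Mapped[str] = mapped_column(String, primary_key=True)


class Child(Base):
    __tablename__ = "child"
    id: Mapped[str] = mapped_column(String, primary_key=True)
    segment_id: Mapped[str] = mapped_column(ForeignKey("segment.id"))


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([Segment(id="seg-1"), Child(id="child-1", segment_id="seg-1")])
    session.commit()

    # Capture the child IDs while the parent row still exists.
    before = session.execute(
        select(Child.id).join(Segment, Child.segment_id == Segment.id)
    ).scalars().all()

    # Delete the parent, as delete_segment does for the DocumentSegment row.
    session.query(Segment).delete()
    session.commit()

    # The same join after the parent is gone finds nothing, so an async task
    # running at this point can no longer discover the child IDs on its own.
    after = session.execute(
        select(Child.id).join(Segment, Child.segment_id == Segment.id)
    ).scalars().all()

    print(before, after)  # ['child-1'] []
```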
```diff
 @classmethod
 def delete_segments(cls, segment_ids: list, document: Document, dataset: Dataset):
-    assert isinstance(current_user, Account)
-    segments = (
-        db.session.query(DocumentSegment.index_node_id, DocumentSegment.word_count)
+    assert current_user is not None
+    # Check if segment_ids is not empty to avoid WHERE false condition
+    if not segment_ids or len(segment_ids) == 0:
+        return
+    segments_info = (
+        db.session.query(DocumentSegment)
+        .with_entities(DocumentSegment.index_node_id, DocumentSegment.id, DocumentSegment.word_count)
         .where(
             DocumentSegment.id.in_(segment_ids),
             DocumentSegment.dataset_id == dataset.id,
         )
         .all()
     )
-    if not segments:
+    if not segments_info:
         return
-    index_node_ids = [seg.index_node_id for seg in segments]
-    total_words = sum(seg.word_count for seg in segments)
+    index_node_ids = [info[0] for info in segments_info]
+    segment_db_ids = [info[1] for info in segments_info]
+    total_words = sum(info[2] for info in segments_info if info[2] is not None)
+    # Get child chunk IDs before parent segments are deleted
+    child_node_ids = []
+    if index_node_ids:
+        child_chunks = (
+            db.session.query(ChildChunk.index_node_id)
+            .where(
+                ChildChunk.segment_id.in_(segment_db_ids),
+                ChildChunk.dataset_id == dataset.id,
+            )
+            .all()
+        )
+        child_node_ids = [chunk[0] for chunk in child_chunks if chunk[0]]
+    # Start async cleanup with both parent and child node IDs
+    if index_node_ids or child_node_ids:
+        delete_segment_from_index_task.delay(index_node_ids, dataset.id, document.id, child_node_ids)
     document.word_count = (
         document.word_count - total_words if document.word_count and document.word_count > total_words else 0
     )
     db.session.add(document)
-    delete_segment_from_index_task.delay(index_node_ids, dataset.id, document.id)
+    # Delete database records
     db.session.query(DocumentSegment).where(DocumentSegment.id.in_(segment_ids)).delete()
     db.session.commit()
```
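The early `return` on an empty `segment_ids` matches the comment in the diff: SQLAlchemy compiles `col.in_([])` to a deliberately false predicate, so without the guard the SELECT and DELETE would still round-trip to the database while matching nothing. A quick illustration with a throwaway table definition (the table here is hypothetical, not the real `DocumentSegment` mapping):

```python
from sqlalchemy import Column, MetaData, String, Table, select

segments = Table("document_segments", MetaData(), Column("id", String, primary_key=True))

stmt = select(segments.c.id).where(segments.c.id.in_([]))
# On SQLAlchemy 1.4+/2.x this renders a false condition, something like:
#   SELECT document_segments.id FROM document_segments
#   WHERE document_segments.id IN (NULL) AND (1 != 1)
print(stmt.compile(compile_kwargs={"literal_binds": True}))
```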
```diff
 @shared_task(queue="dataset")
-def delete_segment_from_index_task(index_node_ids: list, dataset_id: str, document_id: str):
+def delete_segment_from_index_task(
+    index_node_ids: list, dataset_id: str, document_id: str, child_node_ids: list | None = None
+):
     """
     Async Remove segment from index
     :param index_node_ids:
@@ ... @@
     try:
         dataset = db.session.query(Dataset).where(Dataset.id == dataset_id).first()
         if not dataset:
+            logging.warning("Dataset %s not found, skipping index cleanup", dataset_id)
             return
         dataset_document = db.session.query(Document).where(Document.id == document_id).first()
         if not dataset_document:
             return
         if not dataset_document.enabled or dataset_document.archived or dataset_document.indexing_status != "completed":
+            logging.info("Document not in valid state for index operations, skipping")
             return
-        index_type = dataset_document.doc_form
-        index_processor = IndexProcessorFactory(index_type).init_index_processor()
-        index_processor.clean(dataset, index_node_ids, with_keywords=True, delete_child_chunks=True)
+        doc_form = dataset_document.doc_form
+        # Proceed with index cleanup using the index_node_ids directly
+        index_processor = IndexProcessorFactory(doc_form).init_index_processor()
+        index_processor.clean(
+            dataset,
+            index_node_ids,
+            with_keywords=True,
+            delete_child_chunks=True,
+            precomputed_child_node_ids=child_node_ids,
+        )
         end_at = time.perf_counter()
         logger.info(click.style(f"Segment deleted from index latency: {end_at - start_at}", fg="green"))
```
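Making `child_node_ids` an optional trailing parameter keeps the task wire-compatible: messages enqueued by the old three-argument `.delay(...)` before a deploy still bind to the new signature. A plain-function sketch of the same compatibility property (no Celery required; the name merely mirrors the task above):

```python
def delete_segment_from_index(
    index_node_ids: list, dataset_id: str, document_id: str, child_node_ids: list | None = None
):
    # Old payloads arrive without child IDs; clean() then falls back to the
    # ChildChunk query, which may find nothing if the parents are already gone.
    return index_node_ids, child_node_ids or []


# A pre-upgrade message (three args) and a post-upgrade one (four args) both bind:
print(delete_segment_from_index(["n1"], "ds-1", "doc-1"))          # (['n1'], [])
print(delete_segment_from_index(["n1"], "ds-1", "doc-1", ["c1"]))  # (['n1'], ['c1'])
```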