vector_service.py 9.1KB

from typing import Optional

from core.model_manager import ModelInstance, ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.rag.datasource.keyword.keyword_factory import Keyword
from core.rag.datasource.vdb.vector_factory import Vector
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import Document
from extensions.ext_database import db
from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegment
from models.dataset import Document as DatasetDocument
from services.entities.knowledge_entities.knowledge_entities import ParentMode

class VectorService:
    @classmethod
    def create_segments_vector(
        cls, keywords_list: Optional[list[list[str]]], segments: list[DocumentSegment], dataset: Dataset, doc_form: str
    ):
        documents = []
        for segment in segments:
            if doc_form == IndexType.PARENT_CHILD_INDEX:
                document = DatasetDocument.query.filter_by(id=segment.document_id).first()
                # get the process rule
                processing_rule = (
                    db.session.query(DatasetProcessRule)
                    .filter(DatasetProcessRule.id == document.dataset_process_rule_id)
                    .first()
                )
                # get embedding model instance
                if dataset.indexing_technique == "high_quality":
                    # check embedding model setting
                    model_manager = ModelManager()
                    if dataset.embedding_model_provider:
                        embedding_model_instance = model_manager.get_model_instance(
                            tenant_id=dataset.tenant_id,
                            provider=dataset.embedding_model_provider,
                            model_type=ModelType.TEXT_EMBEDDING,
                            model=dataset.embedding_model,
                        )
                    else:
                        embedding_model_instance = model_manager.get_default_model_instance(
                            tenant_id=dataset.tenant_id,
                            model_type=ModelType.TEXT_EMBEDDING,
                        )
                else:
                    raise ValueError("The knowledge base index technique is not high quality!")
                cls.generate_child_chunks(segment, document, dataset, embedding_model_instance, processing_rule, False)
            else:
                document = Document(
                    page_content=segment.content,
                    metadata={
                        "doc_id": segment.index_node_id,
                        "doc_hash": segment.index_node_hash,
                        "document_id": segment.document_id,
                        "dataset_id": segment.dataset_id,
                    },
                )
                documents.append(document)
        if len(documents) > 0:
            index_processor = IndexProcessorFactory(doc_form).init_index_processor()
            index_processor.load(dataset, documents, with_keywords=True, keywords_list=keywords_list)
    @classmethod
    def update_segment_vector(cls, keywords: Optional[list[str]], segment: DocumentSegment, dataset: Dataset):
        # update segment index task
        # format new index
        document = Document(
            page_content=segment.content,
            metadata={
                "doc_id": segment.index_node_id,
                "doc_hash": segment.index_node_hash,
                "document_id": segment.document_id,
                "dataset_id": segment.dataset_id,
            },
        )
        if dataset.indexing_technique == "high_quality":
            # update vector index
            vector = Vector(dataset=dataset)
            vector.delete_by_ids([segment.index_node_id])
            vector.add_texts([document], duplicate_check=True)
        # update keyword index
        keyword = Keyword(dataset)
        keyword.delete_by_ids([segment.index_node_id])
        # save keyword index
        if keywords and len(keywords) > 0:
            keyword.add_texts([document], keywords_list=[keywords])
        else:
            keyword.add_texts([document])
    @classmethod
    def generate_child_chunks(
        cls,
        segment: DocumentSegment,
        dataset_document: DatasetDocument,
        dataset: Dataset,
        embedding_model_instance: ModelInstance,
        processing_rule: DatasetProcessRule,
        regenerate: bool = False,
    ):
        index_processor = IndexProcessorFactory(dataset.doc_form).init_index_processor()
        if regenerate:
            # delete child chunks
            index_processor.clean(dataset, [segment.index_node_id], with_keywords=True, delete_child_chunks=True)
        # generate child chunks
        document = Document(
            page_content=segment.content,
            metadata={
                "doc_id": segment.index_node_id,
                "doc_hash": segment.index_node_hash,
                "document_id": segment.document_id,
                "dataset_id": segment.dataset_id,
            },
        )
        # use full doc mode to generate segment's child chunk
        processing_rule_dict = processing_rule.to_dict()
        processing_rule_dict["rules"]["parent_mode"] = ParentMode.FULL_DOC.value
        documents = index_processor.transform(
            [document],
            embedding_model_instance=embedding_model_instance,
            process_rule=processing_rule_dict,
            tenant_id=dataset.tenant_id,
            doc_language=dataset_document.doc_language,
        )
        # save child chunks
        if len(documents) > 0 and len(documents[0].children) > 0:
            index_processor.load(dataset, documents)
            for position, child_chunk in enumerate(documents[0].children, start=1):
                child_segment = ChildChunk(
                    tenant_id=dataset.tenant_id,
                    dataset_id=dataset.id,
                    document_id=dataset_document.id,
                    segment_id=segment.id,
                    position=position,
                    index_node_id=child_chunk.metadata["doc_id"],
                    index_node_hash=child_chunk.metadata["doc_hash"],
                    content=child_chunk.page_content,
                    word_count=len(child_chunk.page_content),
                    type="automatic",
                    created_by=dataset_document.created_by,
                )
                db.session.add(child_segment)
        db.session.commit()
    @classmethod
    def create_child_chunk_vector(cls, child_segment: ChildChunk, dataset: Dataset):
        child_document = Document(
            page_content=child_segment.content,
            metadata={
                "doc_id": child_segment.index_node_id,
                "doc_hash": child_segment.index_node_hash,
                "document_id": child_segment.document_id,
                "dataset_id": child_segment.dataset_id,
            },
        )
        if dataset.indexing_technique == "high_quality":
            # save vector index
            vector = Vector(dataset=dataset)
            vector.add_texts([child_document], duplicate_check=True)
    @classmethod
    def update_child_chunk_vector(
        cls,
        new_child_chunks: list[ChildChunk],
        update_child_chunks: list[ChildChunk],
        delete_child_chunks: list[ChildChunk],
        dataset: Dataset,
    ):
        documents = []
        delete_node_ids = []
        for new_child_chunk in new_child_chunks:
            new_child_document = Document(
                page_content=new_child_chunk.content,
                metadata={
                    "doc_id": new_child_chunk.index_node_id,
                    "doc_hash": new_child_chunk.index_node_hash,
                    "document_id": new_child_chunk.document_id,
                    "dataset_id": new_child_chunk.dataset_id,
                },
            )
            documents.append(new_child_document)
        for update_child_chunk in update_child_chunks:
            child_document = Document(
                page_content=update_child_chunk.content,
                metadata={
                    "doc_id": update_child_chunk.index_node_id,
                    "doc_hash": update_child_chunk.index_node_hash,
                    "document_id": update_child_chunk.document_id,
                    "dataset_id": update_child_chunk.dataset_id,
                },
            )
            documents.append(child_document)
            delete_node_ids.append(update_child_chunk.index_node_id)
        for delete_child_chunk in delete_child_chunks:
            delete_node_ids.append(delete_child_chunk.index_node_id)
        if dataset.indexing_technique == "high_quality":
            # update vector index
            vector = Vector(dataset=dataset)
            if delete_node_ids:
                vector.delete_by_ids(delete_node_ids)
            if documents:
                vector.add_texts(documents, duplicate_check=True)
    @classmethod
    def delete_child_chunk_vector(cls, child_chunk: ChildChunk, dataset: Dataset):
        vector = Vector(dataset=dataset)
        vector.delete_by_ids([child_chunk.index_node_id])
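Usage sketch (not part of this file): a minimal example of driving the service from calling code, assuming a configured Flask application context and Dataset, Document, and DocumentSegment rows that already exist in the database. The IDs below are placeholders, and the keyword handling for a None keywords_list follows the behavior of the method above.

# Minimal usage sketch; <dataset-id> and <document-id> are hypothetical placeholders.
from extensions.ext_database import db
from models.dataset import Dataset, DocumentSegment
from models.dataset import Document as DatasetDocument
from services.vector_service import VectorService

dataset = db.session.query(Dataset).filter_by(id="<dataset-id>").first()
dataset_document = db.session.query(DatasetDocument).filter_by(id="<document-id>").first()
segments = (
    db.session.query(DocumentSegment)
    .filter_by(dataset_id=dataset.id, document_id=dataset_document.id)
    .all()
)

# Build the vector (and keyword) index for every segment of one document.
# keywords_list may be None, or one list of keywords per segment.
VectorService.create_segments_vector(
    keywords_list=None,
    segments=segments,
    dataset=dataset,
    doc_form=dataset_document.doc_form,
)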