| keyword_table = {} | keyword_table = {} | ||||
| for text in texts: | for text in texts: | ||||
| keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk) | keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk) | ||||
| self._update_segment_keywords(text.metadata['doc_id'], list(keywords)) | |||||
| self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords)) | |||||
| keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords)) | keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords)) | ||||
| dataset_keyword_table = DatasetKeywordTable( | dataset_keyword_table = DatasetKeywordTable( | ||||
| keyword_table = self._get_dataset_keyword_table() | keyword_table = self._get_dataset_keyword_table() | ||||
| for text in texts: | for text in texts: | ||||
| keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk) | keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk) | ||||
| self._update_segment_keywords(text.metadata['doc_id'], list(keywords)) | |||||
| self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords)) | |||||
| keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords)) | keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords)) | ||||
| self._save_dataset_keyword_table(keyword_table) | self._save_dataset_keyword_table(keyword_table) | ||||
| return sorted_chunk_indices[: k] | return sorted_chunk_indices[: k] | ||||
| def _update_segment_keywords(self, node_id: str, keywords: List[str]): | |||||
| document_segment = db.session.query(DocumentSegment).filter(DocumentSegment.index_node_id == node_id).first() | |||||
| def _update_segment_keywords(self, dataset_id: str, node_id: str, keywords: List[str]): | |||||
| document_segment = db.session.query(DocumentSegment).filter( | |||||
| DocumentSegment.dataset_id == dataset_id, | |||||
| DocumentSegment.index_node_id == node_id | |||||
| ).first() | |||||
| if document_segment: | if document_segment: | ||||
| document_segment.keywords = keywords | document_segment.keywords = keywords | ||||
| db.session.commit() | db.session.commit() | ||||
| def create_segment_keywords(self, node_id: str, keywords: List[str]): | def create_segment_keywords(self, node_id: str, keywords: List[str]): | ||||
| keyword_table = self._get_dataset_keyword_table() | keyword_table = self._get_dataset_keyword_table() | ||||
| self._update_segment_keywords(node_id, keywords) | |||||
| self._update_segment_keywords(self.dataset.id, node_id, keywords) | |||||
| keyword_table = self._add_text_to_keyword_table(keyword_table, node_id, keywords) | keyword_table = self._add_text_to_keyword_table(keyword_table, node_id, keywords) | ||||
| self._save_dataset_keyword_table(keyword_table) | self._save_dataset_keyword_table(keyword_table) | ||||