@@ -24,56 +24,64 @@ class Jieba(BaseKeyword):
         self._config = KeywordTableConfig()
 
     def create(self, texts: list[Document], **kwargs) -> BaseKeyword:
-        keyword_table_handler = JiebaKeywordTableHandler()
-        keyword_table = self._get_dataset_keyword_table()
-        for text in texts:
-            keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk)
-            self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords))
-            keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
+        lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
+        with redis_client.lock(lock_name, timeout=600):
+            keyword_table_handler = JiebaKeywordTableHandler()
+            keyword_table = self._get_dataset_keyword_table()
+            for text in texts:
+                keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk)
+                self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords))
+                keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
 
-        self._save_dataset_keyword_table(keyword_table)
+            self._save_dataset_keyword_table(keyword_table)
 
-        return self
+            return self
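Note on the locking pattern introduced above: assuming `redis_client` is a redis-py `Redis` instance (as the `lock(...)` call suggests), `redis_client.lock(name, timeout=...)` returns a distributed lock usable as a context manager, so concurrent workers serialize their read-modify-write of the shared keyword table. A minimal standalone sketch of the same pattern; the client setup and the function name are assumptions for illustration, not part of this diff:

    import redis

    # Assumed setup: in the real codebase the client comes from an extensions
    # module; a plain redis-py client behaves the same for locking purposes.
    redis_client = redis.Redis(host="localhost", port=6379, db=0)

    def rebuild_keyword_table(dataset_id: str) -> None:
        # Same pattern as the diff: one keyword-table writer per dataset at a time.
        lock_name = 'keyword_indexing_lock_{}'.format(dataset_id)
        with redis_client.lock(lock_name, timeout=600):
            # The read-modify-write of the shared keyword table goes here.
            # timeout=600 means the lock auto-expires after 10 minutes if the
            # holder crashes, so a stuck worker cannot block indexing forever.
            pass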
 
     def add_texts(self, texts: list[Document], **kwargs):
-        keyword_table_handler = JiebaKeywordTableHandler()
-
-        keyword_table = self._get_dataset_keyword_table()
-        keywords_list = kwargs.get('keywords_list', None)
-        for i in range(len(texts)):
-            text = texts[i]
-            if keywords_list:
-                keywords = keywords_list[i]
-            else:
-                keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk)
-            self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords))
-            keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
-
-        self._save_dataset_keyword_table(keyword_table)
+        lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
+        with redis_client.lock(lock_name, timeout=600):
+            keyword_table_handler = JiebaKeywordTableHandler()
+
+            keyword_table = self._get_dataset_keyword_table()
+            keywords_list = kwargs.get('keywords_list', None)
+            for i in range(len(texts)):
+                text = texts[i]
+                if keywords_list:
+                    keywords = keywords_list[i]
+                else:
+                    keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk)
+                self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords))
+                keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
+
+            self._save_dataset_keyword_table(keyword_table)
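When `keywords_list` is not supplied, both `create` and `add_texts` fall back to `keyword_table_handler.extract_keywords`, whose body is outside this hunk. It likely boils down to jieba's TF-IDF keyword extraction; a minimal sketch of that idea, assuming `jieba.analyse.extract_tags` underneath, and not the project's actual handler:

    from typing import Optional

    import jieba.analyse

    def extract_keywords(text: str, max_keywords: Optional[int] = 10) -> set[str]:
        # extract_tags ranks tokens by TF-IDF and returns the top-K candidates.
        return set(jieba.analyse.extract_tags(sentence=text, topK=max_keywords))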
 
     def text_exists(self, id: str) -> bool:
         keyword_table = self._get_dataset_keyword_table()
         return id in set.union(*keyword_table.values())
 
     def delete_by_ids(self, ids: list[str]) -> None:
-        keyword_table = self._get_dataset_keyword_table()
-        keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
+        lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
+        with redis_client.lock(lock_name, timeout=600):
+            keyword_table = self._get_dataset_keyword_table()
+            keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
 
-        self._save_dataset_keyword_table(keyword_table)
+            self._save_dataset_keyword_table(keyword_table)
 
     def delete_by_document_id(self, document_id: str):
-        # get segment ids by document_id
-        segments = db.session.query(DocumentSegment).filter(
-            DocumentSegment.dataset_id == self.dataset.id,
-            DocumentSegment.document_id == document_id
-        ).all()
+        lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
+        with redis_client.lock(lock_name, timeout=600):
+            # get segment ids by document_id
+            segments = db.session.query(DocumentSegment).filter(
+                DocumentSegment.dataset_id == self.dataset.id,
+                DocumentSegment.document_id == document_id
+            ).all()
 
-        ids = [segment.index_node_id for segment in segments]
+            ids = [segment.index_node_id for segment in segments]
 
-        keyword_table = self._get_dataset_keyword_table()
-        keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
+            keyword_table = self._get_dataset_keyword_table()
+            keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
 
-        self._save_dataset_keyword_table(keyword_table)
+            self._save_dataset_keyword_table(keyword_table)
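Both delete paths funnel through `_delete_ids_from_keyword_table`, which is not shown in this hunk. Conceptually it removes the given segment node ids from every keyword's id set; a hedged sketch of that shape, assuming the in-memory table maps keyword to a set of node ids, and not the project's literal implementation:

    def _delete_ids_from_keyword_table(keyword_table: dict, ids: list[str]) -> dict:
        # Drop the node ids from every keyword's id set, then prune keywords
        # whose set became empty so the table does not accumulate dead entries.
        node_ids = set(ids)
        return {
            keyword: segment_ids - node_ids
            for keyword, segment_ids in keyword_table.items()
            if segment_ids - node_ids
        }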
 
     def search(
             self, query: str,
@@ -106,13 +114,15 @@ class Jieba(BaseKeyword):
         return documents
 
     def delete(self) -> None:
-        dataset_keyword_table = self.dataset.dataset_keyword_table
-        if dataset_keyword_table:
-            db.session.delete(dataset_keyword_table)
-            db.session.commit()
-            if dataset_keyword_table.data_source_type != 'database':
-                file_key = 'keyword_files/' + self.dataset.tenant_id + '/' + self.dataset.id + '.txt'
-                storage.delete(file_key)
+        lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
+        with redis_client.lock(lock_name, timeout=600):
+            dataset_keyword_table = self.dataset.dataset_keyword_table
+            if dataset_keyword_table:
+                db.session.delete(dataset_keyword_table)
+                db.session.commit()
+                if dataset_keyword_table.data_source_type != 'database':
+                    file_key = 'keyword_files/' + self.dataset.tenant_id + '/' + self.dataset.id + '.txt'
+                    storage.delete(file_key)
 
     def _save_dataset_keyword_table(self, keyword_table):
         keyword_table_dict = {
@@ -135,33 +145,31 @@ class Jieba(BaseKeyword):
         storage.save(file_key, json.dumps(keyword_table_dict, cls=SetEncoder).encode('utf-8'))
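`json.dumps(..., cls=SetEncoder)` is needed because the keyword table maps each keyword to a set of segment node ids, and sets are not JSON-serializable by default. `SetEncoder` is presumably a small `json.JSONEncoder` subclass along these lines; a sketch, not necessarily the exact implementation in the codebase:

    import json

    class SetEncoder(json.JSONEncoder):
        def default(self, obj):
            # Serialize sets as plain lists; defer everything else to the base class.
            if isinstance(obj, set):
                return list(obj)
            return super().default(obj)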
 
     def _get_dataset_keyword_table(self) -> Optional[dict]:
-        dataset_keyword_table = self.dataset.dataset_keyword_table
-        if dataset_keyword_table:
-            keyword_table_dict = dataset_keyword_table.keyword_table_dict
-            if keyword_table_dict:
-                return keyword_table_dict['__data__']['table']
-        else:
-            keyword_data_source_type = current_app.config['KEYWORD_DATA_SOURCE_TYPE']
-            dataset_keyword_table = DatasetKeywordTable(
-                dataset_id=self.dataset.id,
-                keyword_table='',
-                data_source_type=keyword_data_source_type,
-            )
-            if keyword_data_source_type == 'database':
-                dataset_keyword_table.keyword_table = json.dumps({
-                    '__type__': 'keyword_table',
-                    '__data__': {
-                        "index_id": self.dataset.id,
-                        "summary": None,
-                        "table": {}
-                    }
-                }, cls=SetEncoder)
-            db.session.add(dataset_keyword_table)
-            db.session.commit()
+        lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
+        with redis_client.lock(lock_name, timeout=20):
+            dataset_keyword_table = self.dataset.dataset_keyword_table
+            if dataset_keyword_table:
+                keyword_table_dict = dataset_keyword_table.keyword_table_dict
+                if keyword_table_dict:
+                    return keyword_table_dict['__data__']['table']
+            else:
+                keyword_data_source_type = current_app.config['KEYWORD_DATA_SOURCE_TYPE']
+                dataset_keyword_table = DatasetKeywordTable(
+                    dataset_id=self.dataset.id,
+                    keyword_table='',
+                    data_source_type=keyword_data_source_type,
+                )
+                if keyword_data_source_type == 'database':
+                    dataset_keyword_table.keyword_table = json.dumps({
+                        '__type__': 'keyword_table',
+                        '__data__': {
+                            "index_id": self.dataset.id,
+                            "summary": None,
+                            "table": {}
+                        }
+                    }, cls=SetEncoder)
+                db.session.add(dataset_keyword_table)
+                db.session.commit()
 
-        return {}
+            return {}
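For reference, the persisted payload that `_get_dataset_keyword_table` reads back, and that the 'database' branch above seeds with an empty table, has the shape shown in the diff; the sample keyword entries below are illustrative only, and the set values become JSON lists on disk via SetEncoder:

    # Illustrative structure; real node ids are generated by the application.
    sample_keyword_table = {
        '__type__': 'keyword_table',
        '__data__': {
            'index_id': 'dataset-id',
            'summary': None,
            # keyword -> ids of the document segments that contain it
            'table': {
                'redis': ['node-1'],
                'jieba': ['node-2'],
            },
        },
    }

    # The method returns only the inner mapping:
    table = sample_keyword_table['__data__']['table']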
 
     def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict:
         for keyword in keywords: