fix: same chunk insert deadlock (#12502)

Co-authored-by: huangzhuo <huangzhuo1@xiaomi.com>
tags/0.15.1
huangzhuo1949 committed 9 months ago
commit e84bf35e2a
1 changed file with 14 additions and 4 deletions
api/core/indexing_runner.py  +14 −4

 # chunk nodes by chunk size
 indexing_start_at = time.perf_counter()
 tokens = 0
-chunk_size = 10
 if dataset_document.doc_form != IndexType.PARENT_CHILD_INDEX:
     # create keyword index
     create_keyword_thread = threading.Thread(
     )
     create_keyword_thread.start()
+
+max_workers = 10
 if dataset.indexing_technique == "high_quality":
-    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
         futures = []
-        for i in range(0, len(documents), chunk_size):
-            chunk_documents = documents[i : i + chunk_size]
+        # Distribute documents into multiple groups based on the hash values of page_content
+        # This is done to prevent multiple threads from processing the same document,
+        # Thereby avoiding potential database insertion deadlocks
+        document_groups: list[list[Document]] = [[] for _ in range(max_workers)]
+        for document in documents:
+            hash = helper.generate_text_hash(document.page_content)
+            group_index = int(hash, 16) % max_workers
+            document_groups[group_index].append(document)
+        for chunk_documents in document_groups:
+            if len(chunk_documents) == 0:
+                continue
             futures.append(
                 executor.submit(
                     self._process_chunk,
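For context, here is a minimal standalone sketch of the bucketing idea the patch applies. Identical page_content values produce identical hashes, so duplicate chunks always land in the same bucket and are processed serially by a single worker instead of racing each other into the database. The names group_by_content_hash and process_group are illustrative, not part of the Dify codebase; plain strings stand in for Document objects, and hashlib.sha256 stands in for helper.generate_text_hash.

import concurrent.futures
import hashlib


def group_by_content_hash(texts: list[str], num_groups: int) -> list[list[str]]:
    # Identical content always hashes to the same bucket, so duplicates
    # are never split across workers.
    groups: list[list[str]] = [[] for _ in range(num_groups)]
    for text in texts:
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        groups[int(digest, 16) % num_groups].append(text)
    return groups


def process_group(group: list[str]) -> int:
    # Stand-in for self._process_chunk: one thread owns each bucket, so two
    # threads never insert rows derived from the same content concurrently.
    return len(group)


if __name__ == "__main__":
    docs = ["alpha", "beta", "alpha", "gamma"]  # duplicate "alpha" on purpose
    groups = group_by_content_hash(docs, num_groups=4)
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_group, g) for g in groups if g]
        for future in concurrent.futures.as_completed(futures):
            print(future.result())

The previous fixed-size slicing (documents[i : i + chunk_size]) could place two copies of the same chunk into different slices, letting two threads attempt the same insert at once; bucketing by content hash rules that out by construction.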
