浏览代码

Feat: delete useless image blobs when task executor meets edge cases (#7727)

### What problem does this PR solve?

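Delete useless image blobs when the task executor meets edge cases, e.g. when the task is no longer known after its chunks were inserted and those chunks are removed from the document store again.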

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
tags/v0.19.x
Stephen Hu 5 个月前
父节点
当前提交
e3e7c7ddaa
没有帐户链接到提交者的电子邮件
共有 1 个文件被更改，包括 14 次插入和 0 次删除
  1. 14
    0
      rag/svr/task_executor.py

+ 14
- 0
rag/svr/task_executor.py 查看文件

start_ts = timer() start_ts = timer()
doc_store_result = "" doc_store_result = ""
es_bulk_size = 4 es_bulk_size = 4

async def delete_image(kb_id, chunk_id):
    """Delete the stored image blob that belongs to one chunk.

    Args:
        kb_id: First argument forwarded to ``STORAGE_IMPL.delete``; the caller
            in this function passes the task's dataset id.
        chunk_id: Id of the chunk whose image blob is removed.

    Raises:
        Exception: Any error raised by the storage backend is logged with the
            task location/name and chunk id, then re-raised to the caller.
    """
    try:
        # Bound the number of concurrent storage operations.
        async with minio_limiter:
            # STORAGE_IMPL.delete is called without ``await`` (a synchronous
            # call), so run it in a worker thread: calling it directly would
            # block the trio event loop and serialize all deletions started in
            # the nursery. This mirrors how the docStoreConn calls are offloaded.
            await trio.to_thread.run_sync(lambda: STORAGE_IMPL.delete(kb_id, chunk_id))
    except Exception:
        logging.exception(
            "Deleting image of chunk {}/{}/{} got exception".format(task["location"], task["name"], chunk_id))
        raise

for b in range(0, len(chunks), es_bulk_size): for b in range(0, len(chunks), es_bulk_size):
doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id)) doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id))
if b % 128 == 0: if b % 128 == 0:
except DoesNotExist: except DoesNotExist:
logging.warning(f"do_handle_task update_chunk_ids failed since task {task['id']} is unknown.") logging.warning(f"do_handle_task update_chunk_ids failed since task {task['id']} is unknown.")
doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(task_tenant_id), task_dataset_id)) doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(task_tenant_id), task_dataset_id))
async with trio.open_nursery() as nursery:
for chunk_id in chunk_ids:
nursery.start_soon(delete_image, task_dataset_id, chunk_id)
return return
logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page, logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page,
task_to_page, len(chunks), task_to_page, len(chunks),
timer() - start_ts)) timer() - start_ts))

正在加载...
取消
保存