ソースを参照

perf: optimize figure parser (#7392)

### What problem does this PR solve?

When parsing documents containing images, the current code uses a
single-threaded approach to call the VL model, resulting in extremely
slow parsing speed (e.g., parsing a Word document with dozens of images
takes over 20 minutes).

By switching to a multithreaded approach to call the VL model, the
parsing speed can be improved to an acceptable level.

### Type of change

- [x] Performance Improvement

---------

Co-authored-by: liuzhenghua-jk <liuzhenghua-jk@360shuke.com>
tags/v0.19.0
liuzhenghua 6ヶ月前
コミット
2f768b96e8
コミッターのメールアドレスに関連付けられたアカウントが存在しません
2個のファイルの変更44行の追加31行の削除
  1. 12
    6
      deepdoc/parser/figure_parser.py
  2. 32
    25
      rag/svr/task_executor.py

+ 12
- 6
deepdoc/parser/figure_parser.py ファイルの表示

# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
from concurrent.futures import ThreadPoolExecutor, as_completed


from PIL import Image from PIL import Image


) for figure_data in figures_data_without_positions if isinstance(figure_data[1], Image.Image)] ) for figure_data in figures_data_without_positions if isinstance(figure_data[1], Image.Image)]




shared_executor = ThreadPoolExecutor(max_workers=10)
class VisionFigureParser: class VisionFigureParser:
def __init__(self, vision_model, figures_data, *args, **kwargs): def __init__(self, vision_model, figures_data, *args, **kwargs):
self.vision_model = vision_model self.vision_model = vision_model
def __call__(self, **kwargs): def __call__(self, **kwargs):
callback = kwargs.get("callback", lambda prog, msg: None) callback = kwargs.get("callback", lambda prog, msg: None)


for idx, img_binary in enumerate(self.figures or []):
figure_num = idx # 0-based

txt = picture_vision_llm_chunk(
binary=img_binary,
def process(figure_idx, figure_binary):
description_text = picture_vision_llm_chunk(
binary=figure_binary,
vision_model=self.vision_model, vision_model=self.vision_model,
prompt=vision_llm_figure_describe_prompt(), prompt=vision_llm_figure_describe_prompt(),
callback=callback, callback=callback,
) )
return figure_idx, description_text

futures = []
for idx, img_binary in enumerate(self.figures or []):
futures.append(shared_executor.submit(process, idx, img_binary))


for future in as_completed(futures):
figure_num, txt = future.result()
if txt: if txt:
self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num]) self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num])



+ 32
- 25
rag/svr/task_executor.py ファイルの表示



MAX_CONCURRENT_TASKS = int(os.environ.get('MAX_CONCURRENT_TASKS', "5")) MAX_CONCURRENT_TASKS = int(os.environ.get('MAX_CONCURRENT_TASKS', "5"))
MAX_CONCURRENT_CHUNK_BUILDERS = int(os.environ.get('MAX_CONCURRENT_CHUNK_BUILDERS', "1")) MAX_CONCURRENT_CHUNK_BUILDERS = int(os.environ.get('MAX_CONCURRENT_CHUNK_BUILDERS', "1"))
MAX_CONCURRENT_MINIO = int(os.environ.get('MAX_CONCURRENT_MINIO', '10'))
task_limiter = trio.CapacityLimiter(MAX_CONCURRENT_TASKS) task_limiter = trio.CapacityLimiter(MAX_CONCURRENT_TASKS)
chunk_limiter = trio.CapacityLimiter(MAX_CONCURRENT_CHUNK_BUILDERS) chunk_limiter = trio.CapacityLimiter(MAX_CONCURRENT_CHUNK_BUILDERS)
minio_limiter = trio.CapacityLimiter(MAX_CONCURRENT_MINIO)
WORKER_HEARTBEAT_TIMEOUT = int(os.environ.get('WORKER_HEARTBEAT_TIMEOUT', '120')) WORKER_HEARTBEAT_TIMEOUT = int(os.environ.get('WORKER_HEARTBEAT_TIMEOUT', '120'))
stop_event = threading.Event() stop_event = threading.Event()


} }
if task["pagerank"]: if task["pagerank"]:
doc[PAGERANK_FLD] = int(task["pagerank"]) doc[PAGERANK_FLD] = int(task["pagerank"])
el = 0
for ck in cks:
d = copy.deepcopy(doc)
d.update(ck)
d["id"] = xxhash.xxh64((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.now().timestamp()
if not d.get("image"):
_ = d.pop("image", None)
d["img_id"] = ""
docs.append(d)
continue
st = timer()


async def upload_to_minio(document, chunk):
try: try:
output_buffer = BytesIO()
if isinstance(d["image"], bytes):
output_buffer = BytesIO(d["image"])
else:
d["image"].save(output_buffer, format='JPEG')

st = timer()
await trio.to_thread.run_sync(lambda: STORAGE_IMPL.put(task["kb_id"], d["id"], output_buffer.getvalue()))
el += timer() - st
async with minio_limiter:
d = copy.deepcopy(document)
d.update(chunk)
d["id"] = xxhash.xxh64((chunk["content_with_weight"] + str(d["doc_id"])).encode("utf-8")).hexdigest()
d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.now().timestamp()
if not d.get("image"):
_ = d.pop("image", None)
d["img_id"] = ""
docs.append(d)
return

output_buffer = BytesIO()
if isinstance(d["image"], bytes):
output_buffer = BytesIO(d["image"])
else:
d["image"].save(output_buffer, format='JPEG')
await trio.to_thread.run_sync(lambda: STORAGE_IMPL.put(task["kb_id"], d["id"], output_buffer.getvalue()))

d["img_id"] = "{}-{}".format(task["kb_id"], d["id"])
del d["image"]
docs.append(d)
except Exception: except Exception:
logging.exception( logging.exception(
"Saving image of chunk {}/{}/{} got exception".format(task["location"], task["name"], d["id"])) "Saving image of chunk {}/{}/{} got exception".format(task["location"], task["name"], d["id"]))
raise raise


d["img_id"] = "{}-{}".format(task["kb_id"], d["id"])
del d["image"]
docs.append(d)
logging.info("MINIO PUT({}):{}".format(task["name"], el))
async with trio.open_nursery() as nursery:
for ck in cks:
nursery.start_soon(upload_to_minio, doc, ck)

el = timer() - st
logging.info("MINIO PUT({}) cost {:.3f} s".format(task["name"], el))


if task["parser_config"].get("auto_keywords", 0): if task["parser_config"].get("auto_keywords", 0):
st = timer() st = timer()

読み込み中…
キャンセル
保存