Ver código fonte

Fix #258: task_executor occupies too much CPU (#288)

### What problem does this PR solve?

Issue link: #285

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
tags/v0.1.0
KevinHuSh 1 ano atrás
pai
commit
923cbe488e
Nenhuma conta vinculada ao e-mail do autor do commit
3 arquivos alterados com 22 adições e 19 exclusões
  1. 1
    1
      rag/nlp/query.py
  2. 20
    17
      rag/svr/task_executor.py
  3. 1
    1
      rag/utils/minio_conn.py

+ 1
- 1
rag/nlp/query.py Ver arquivo

return Q("bool", return Q("bool",
must=Q("query_string", fields=self.flds, must=Q("query_string", fields=self.flds,
type="best_fields", query=" ".join(q), type="best_fields", query=" ".join(q),
boost=1, minimum_should_match=min_match)
boost=1)#, minimum_should_match=min_match)
), tks ), tks


def needQieqie(tk): def needQieqie(tk):

+ 20
- 17
rag/svr/task_executor.py Ver arquivo

import copy import copy
import re import re
import sys import sys
import time
import traceback import traceback
from functools import partial from functools import partial
import signal
from contextlib import contextmanager
from rag.settings import database_logger from rag.settings import database_logger
from rag.settings import cron_logger, DOC_MAXIMUM_SIZE from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
from multiprocessing import Pool
import numpy as np import numpy as np
from elasticsearch_dsl import Q from elasticsearch_dsl import Q
from multiprocessing.context import TimeoutError
from api.db.services.task_service import TaskService from api.db.services.task_service import TaskService
from rag.utils import ELASTICSEARCH from rag.utils import ELASTICSEARCH
from rag.utils import MINIO from rag.utils import MINIO
def collect(comm, mod, tm): def collect(comm, mod, tm):
tasks = TaskService.get_tasks(tm, mod, comm) tasks = TaskService.get_tasks(tm, mod, comm)
if len(tasks) == 0: if len(tasks) == 0:
time.sleep(1)
return pd.DataFrame() return pd.DataFrame()
tasks = pd.DataFrame(tasks) tasks = pd.DataFrame(tasks)
mtm = tasks["update_time"].max() mtm = tasks["update_time"].max()
cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm)) cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
return tasks return tasks


@contextmanager
def timeout(time):
# Register a function to raise a TimeoutError on the signal.
signal.signal(signal.SIGALRM, raise_timeout)
# Schedule the signal to be sent after ``time``.
signal.alarm(time)
yield



def raise_timeout(signum, frame):
raise TimeoutError
def get_minio_binary(bucket, name):
global MINIO
return MINIO.get(bucket, name)




def build(row): def build(row):
row["from_page"], row["from_page"],
row["to_page"]) row["to_page"])
chunker = FACTORY[row["parser_id"].lower()] chunker = FACTORY[row["parser_id"].lower()]
pool = Pool(processes=1)
try: try:
st = timer() st = timer()
with timeout(30):
binary = MINIO.get(row["kb_id"], row["location"])
thr = pool.apply_async(get_minio_binary, args=(row["kb_id"], row["location"]))
binary = thr.get(timeout=90)
pool.terminate()
cron_logger.info(
"From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"], cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
to_page=row["to_page"], lang=row["language"], callback=callback, to_page=row["to_page"], lang=row["language"], callback=callback,
kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"]) kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
cron_logger.info( cron_logger.info(
"Chunkking({}) {}/{}".format(timer()-st, row["location"], row["name"])) "Chunkking({}) {}/{}".format(timer()-st, row["location"], row["name"]))
except TimeoutError as e:
callback(-1, f"Internal server error: Fetch file timeout. Could you try it again.")
cron_logger.error(
"Chunkking {}/{}: Fetch file timeout.".format(row["location"], row["name"]))
return
except Exception as e: except Exception as e:
if re.search("(No such file|not found)", str(e)): if re.search("(No such file|not found)", str(e)):
callback(-1, "Can not find file <%s>" % row["name"]) callback(-1, "Can not find file <%s>" % row["name"])
else: else:
callback(-1, f"Internal server error: %s" % callback(-1, f"Internal server error: %s" %
str(e).replace("'", "")) str(e).replace("'", ""))
pool.terminate()
traceback.print_exc() traceback.print_exc()


cron_logger.warn(
cron_logger.error(
"Chunkking {}/{}: {}".format(row["location"], row["name"], str(e))) "Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))


return return

+ 1
- 1
rag/utils/minio_conn.py Ver arquivo





def get(self, bucket, fnm): def get(self, bucket, fnm):
for _ in range(10):
for _ in range(1):
try: try:
r = self.conn.get_object(bucket, fnm) r = self.conn.get_object(bucket, fnm)
return r.read() return r.read()

Carregando…
Cancelar
Salvar