You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

task_executor.py 9.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import datetime
  17. import json
  18. import logging
  19. import os
  20. import hashlib
  21. import copy
  22. import re
  23. import sys
  24. import traceback
  25. from functools import partial
  26. import signal
  27. from contextlib import contextmanager
  28. from rag.settings import database_logger
  29. from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
  30. import numpy as np
  31. from elasticsearch_dsl import Q
  32. from api.db.services.task_service import TaskService
  33. from rag.utils import ELASTICSEARCH
  34. from rag.utils import MINIO
  35. from rag.utils import rmSpace, findMaxTm
  36. from rag.nlp import search
  37. from io import BytesIO
  38. import pandas as pd
  39. from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
  40. from api.db import LLMType, ParserType
  41. from api.db.services.document_service import DocumentService
  42. from api.db.services.llm_service import LLMBundle
  43. from api.utils.file_utils import get_project_base_directory
# Batch size constant. NOTE(review): not referenced anywhere in this file --
# confirm whether external importers rely on it before removing.
BATCH_SIZE = 64

# Dispatch table: a document's parser_id (lower-cased in build()) -> the
# chunking module that knows how to split that document type.
# "general" is kept as a legacy alias of the naive parser.
FACTORY = {
    "general": naive,
    ParserType.NAIVE.value: naive,
    ParserType.PAPER.value: paper,
    ParserType.BOOK.value: book,
    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
}
  59. def set_progress(task_id, from_page=0, to_page=-1,
  60. prog=None, msg="Processing..."):
  61. if prog is not None and prog < 0:
  62. msg = "[ERROR]" + msg
  63. cancel = TaskService.do_cancel(task_id)
  64. if cancel:
  65. msg += " [Canceled]"
  66. prog = -1
  67. if to_page > 0:
  68. if msg:
  69. msg = f"Page({from_page+1}~{to_page+1}): " + msg
  70. d = {"progress_msg": msg}
  71. if prog is not None:
  72. d["progress"] = prog
  73. try:
  74. TaskService.update_progress(task_id, d)
  75. except Exception as e:
  76. cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
  77. if cancel:
  78. sys.exit()
  79. def collect(comm, mod, tm):
  80. tasks = TaskService.get_tasks(tm, mod, comm)
  81. if len(tasks) == 0:
  82. return pd.DataFrame()
  83. tasks = pd.DataFrame(tasks)
  84. mtm = tasks["update_time"].max()
  85. cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
  86. return tasks
  87. @contextmanager
  88. def timeout(time):
  89. # Register a function to raise a TimeoutError on the signal.
  90. signal.signal(signal.SIGALRM, raise_timeout)
  91. # Schedule the signal to be sent after ``time``.
  92. signal.alarm(time)
  93. yield
  94. def raise_timeout(signum, frame):
  95. raise TimeoutError
def build(row):
    """Chunk one task's source document into indexable docs.

    *row* is a task record carrying at least: id, doc_id, kb_id, name,
    location, size, parser_id, parser_config, from_page, to_page,
    language, tenant_id.

    Returns [] when the file is oversized, None when chunking failed
    (the error having been reported via the progress callback),
    otherwise a list of ES-ready dicts with deterministic md5 "_id"s;
    any chunk image is stored in MINIO and referenced via "img_id".
    """
    from timeit import default_timer as timer
    if row["size"] > DOC_MAXIMUM_SIZE:
        set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
                     (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
        return []
    # Progress reporter pre-bound to this task; chunkers invoke it as
    # callback(prog, msg).
    callback = partial(
        set_progress,
        row["id"],
        row["from_page"],
        row["to_page"])
    chunker = FACTORY[row["parser_id"].lower()]
    try:
        st = timer()
        # Only the MINIO download is bounded by the 30s alarm; chunking
        # itself may take longer.
        with timeout(30):
            binary = MINIO.get(row["kb_id"], row["location"])
        cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
                            to_page=row["to_page"], lang=row["language"], callback=callback,
                            kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
        cron_logger.info(
            "Chunkking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
    except Exception as e:
        # "File missing" is distinguished from other failures by string
        # matching on the exception text.
        if re.search("(No such file|not found)", str(e)):
            callback(-1, "Can not find file <%s>" % row["name"])
        else:
            callback(-1, f"Internal server error: %s" %
                     str(e).replace("'", ""))
        traceback.print_exc()
        cron_logger.warn(
            "Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
        # Implicit None (not []) signals a hard failure to main().
        return
    docs = []
    # Fields shared by every chunk produced from this document.
    doc = {
        "doc_id": row["doc_id"],
        "kb_id": [str(row["kb_id"])]
    }
    for ck in cks:
        d = copy.deepcopy(doc)
        d.update(ck)
        # Deterministic id: md5 over chunk text + doc id, so re-running
        # the same task yields the same _id for unchanged content.
        md5 = hashlib.md5()
        md5.update((ck["content_with_weight"] +
                    str(d["doc_id"])).encode("utf-8"))
        d["_id"] = md5.hexdigest()
        d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
        d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
        if not d.get("image"):
            docs.append(d)
            continue
        # The chunk carries an image: persist it to MINIO under the chunk
        # id and keep only the reference on the ES document.
        output_buffer = BytesIO()
        if isinstance(d["image"], bytes):
            output_buffer = BytesIO(d["image"])
        else:
            # NOTE(review): presumably a PIL Image (exposes .save) -- confirm.
            d["image"].save(output_buffer, format='JPEG')
        MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
        d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
        del d["image"]
        docs.append(d)
    return docs
  154. def init_kb(row):
  155. idxnm = search.index_name(row["tenant_id"])
  156. if ELASTICSEARCH.indexExist(idxnm):
  157. return
  158. return ELASTICSEARCH.createIdx(idxnm, json.load(
  159. open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
  160. def embedding(docs, mdl, parser_config={}, callback=None):
  161. batch_size = 32
  162. tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
  163. re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
  164. tk_count = 0
  165. if len(tts) == len(cnts):
  166. tts_ = np.array([])
  167. for i in range(0, len(tts), batch_size):
  168. vts, c = mdl.encode(tts[i: i + batch_size])
  169. if len(tts_) == 0:
  170. tts_ = vts
  171. else:
  172. tts_ = np.concatenate((tts_, vts), axis=0)
  173. tk_count += c
  174. callback(prog=0.6 + 0.1 * (i + 1) / len(tts), msg="")
  175. tts = tts_
  176. cnts_ = np.array([])
  177. for i in range(0, len(cnts), batch_size):
  178. vts, c = mdl.encode(cnts[i: i + batch_size])
  179. if len(cnts_) == 0:
  180. cnts_ = vts
  181. else:
  182. cnts_ = np.concatenate((cnts_, vts), axis=0)
  183. tk_count += c
  184. callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="")
  185. cnts = cnts_
  186. title_w = float(parser_config.get("filename_embd_weight", 0.1))
  187. vects = (title_w * tts + (1 - title_w) *
  188. cnts) if len(tts) == len(cnts) else cnts
  189. assert len(vects) == len(docs)
  190. for i, d in enumerate(docs):
  191. v = vects[i].tolist()
  192. d["q_%d_vec" % len(v)] = v
  193. return tk_count
  194. def main(comm, mod):
  195. tm_fnm = os.path.join(
  196. get_project_base_directory(),
  197. "rag/res",
  198. f"{comm}-{mod}.tm")
  199. tm = findMaxTm(tm_fnm)
  200. rows = collect(comm, mod, tm)
  201. if len(rows) == 0:
  202. return
  203. tmf = open(tm_fnm, "a+")
  204. for _, r in rows.iterrows():
  205. callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
  206. try:
  207. embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING)
  208. except Exception as e:
  209. callback(prog=-1, msg=str(e))
  210. continue
  211. cks = build(r)
  212. if cks is None:
  213. continue
  214. if not cks:
  215. tmf.write(str(r["update_time"]) + "\n")
  216. callback(1., "No chunk! Done!")
  217. continue
  218. # TODO: exception handler
  219. ## set_progress(r["did"], -1, "ERROR: ")
  220. callback(
  221. msg="Finished slicing files(%d). Start to embedding the content." %
  222. len(cks))
  223. try:
  224. tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
  225. except Exception as e:
  226. callback(-1, "Embedding error:{}".format(str(e)))
  227. cron_logger.error(str(e))
  228. tk_count = 0
  229. callback(msg="Finished embedding! Start to build index!")
  230. init_kb(r)
  231. chunk_count = len(set([c["_id"] for c in cks]))
  232. es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
  233. if es_r:
  234. callback(-1, "Index failure!")
  235. ELASTICSEARCH.deleteByQuery(
  236. Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
  237. cron_logger.error(str(es_r))
  238. else:
  239. if TaskService.do_cancel(r["id"]):
  240. ELASTICSEARCH.deleteByQuery(
  241. Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
  242. continue
  243. callback(1., "Done!")
  244. DocumentService.increment_chunk_num(
  245. r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
  246. cron_logger.info(
  247. "Chunk doc({}), token({}), chunks({})".format(
  248. r["id"], tk_count, len(cks)))
  249. tmf.write(str(r["update_time"]) + "\n")
  250. tmf.close()
if __name__ == "__main__":
    # Route peewee's SQL logging into the shared database log file
    # instead of propagating to the root logger.
    peewee_logger = logging.getLogger('peewee')
    peewee_logger.propagate = False
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    # NOTE(review): ``comm`` is never used below -- the worker identity
    # comes from argv (argv[2], argv[1]) instead of the MPI communicator;
    # confirm whether MPI initialization is still required.
    while True:
        main(int(sys.argv[2]), int(sys.argv[1]))