您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

task_executor.py 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import datetime
  17. import json
  18. import logging
  19. import os
  20. import hashlib
  21. import copy
  22. import re
  23. import sys
  24. import time
  25. import traceback
  26. from functools import partial
  27. from api.db.services.file2document_service import File2DocumentService
  28. from api.settings import retrievaler
  29. from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
  30. from rag.utils.minio_conn import MINIO
  31. from api.db.db_models import close_connection
  32. from rag.settings import database_logger, SVR_QUEUE_NAME
  33. from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
  34. from multiprocessing import Pool
  35. import numpy as np
  36. from elasticsearch_dsl import Q, Search
  37. from multiprocessing.context import TimeoutError
  38. from api.db.services.task_service import TaskService
  39. from rag.utils.es_conn import ELASTICSEARCH
  40. from timeit import default_timer as timer
  41. from rag.utils import rmSpace, findMaxTm, num_tokens_from_string
  42. from rag.nlp import search, rag_tokenizer
  43. from io import BytesIO
  44. import pandas as pd
  45. from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio
  46. from api.db import LLMType, ParserType
  47. from api.db.services.document_service import DocumentService
  48. from api.db.services.llm_service import LLMBundle
  49. from api.utils.file_utils import get_project_base_directory
  50. from rag.utils.redis_conn import REDIS_CONN
# Default batch size for bulk operations (not referenced in the visible code
# of this module — kept for external users; verify before removing).
BATCH_SIZE = 64

# Maps a knowledge base's parser_id to the chunker module implementing it.
# "general" is an alias for the naive chunker.
FACTORY = {
    "general": naive,
    ParserType.NAIVE.value: naive,
    ParserType.PAPER.value: paper,
    ParserType.BOOK.value: book,
    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
    ParserType.AUDIO.value: audio
}
  67. def set_progress(task_id, from_page=0, to_page=-1,
  68. prog=None, msg="Processing..."):
  69. if prog is not None and prog < 0:
  70. msg = "[ERROR]" + msg
  71. cancel = TaskService.do_cancel(task_id)
  72. if cancel:
  73. msg += " [Canceled]"
  74. prog = -1
  75. if to_page > 0:
  76. if msg:
  77. msg = f"Page({from_page + 1}~{to_page + 1}): " + msg
  78. d = {"progress_msg": msg}
  79. if prog is not None:
  80. d["progress"] = prog
  81. try:
  82. TaskService.update_progress(task_id, d)
  83. except Exception as e:
  84. cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
  85. close_connection()
  86. if cancel:
  87. sys.exit()
  88. def collect():
  89. try:
  90. payload = REDIS_CONN.queue_consumer(SVR_QUEUE_NAME, "rag_flow_svr_task_broker", "rag_flow_svr_task_consumer")
  91. if not payload:
  92. time.sleep(1)
  93. return pd.DataFrame()
  94. except Exception as e:
  95. cron_logger.error("Get task event from queue exception:" + str(e))
  96. return pd.DataFrame()
  97. msg = payload.get_message()
  98. payload.ack()
  99. if not msg: return pd.DataFrame()
  100. if TaskService.do_cancel(msg["id"]):
  101. cron_logger.info("Task {} has been canceled.".format(msg["id"]))
  102. return pd.DataFrame()
  103. tasks = TaskService.get_tasks(msg["id"])
  104. assert tasks, "{} empty task!".format(msg["id"])
  105. tasks = pd.DataFrame(tasks)
  106. if msg.get("type", "") == "raptor":
  107. tasks["task_type"] = "raptor"
  108. return tasks
def get_minio_binary(bucket, name):
    # Thin wrapper over the MinIO client: fetch the raw bytes of one object.
    return MINIO.get(bucket, name)
  111. def build(row):
  112. if row["size"] > DOC_MAXIMUM_SIZE:
  113. set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
  114. (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
  115. return []
  116. callback = partial(
  117. set_progress,
  118. row["id"],
  119. row["from_page"],
  120. row["to_page"])
  121. chunker = FACTORY[row["parser_id"].lower()]
  122. try:
  123. st = timer()
  124. bucket, name = File2DocumentService.get_minio_address(doc_id=row["doc_id"])
  125. binary = get_minio_binary(bucket, name)
  126. cron_logger.info(
  127. "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
  128. cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
  129. to_page=row["to_page"], lang=row["language"], callback=callback,
  130. kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
  131. cron_logger.info(
  132. "Chunkking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
  133. except TimeoutError as e:
  134. callback(-1, f"Internal server error: Fetch file timeout. Could you try it again.")
  135. cron_logger.error(
  136. "Chunkking {}/{}: Fetch file timeout.".format(row["location"], row["name"]))
  137. return
  138. except Exception as e:
  139. if re.search("(No such file|not found)", str(e)):
  140. callback(-1, "Can not find file <%s>" % row["name"])
  141. else:
  142. callback(-1, f"Internal server error: %s" %
  143. str(e).replace("'", ""))
  144. traceback.print_exc()
  145. cron_logger.error(
  146. "Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
  147. return
  148. docs = []
  149. doc = {
  150. "doc_id": row["doc_id"],
  151. "kb_id": [str(row["kb_id"])]
  152. }
  153. el = 0
  154. for ck in cks:
  155. d = copy.deepcopy(doc)
  156. d.update(ck)
  157. md5 = hashlib.md5()
  158. md5.update((ck["content_with_weight"] +
  159. str(d["doc_id"])).encode("utf-8"))
  160. d["_id"] = md5.hexdigest()
  161. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  162. d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
  163. if not d.get("image"):
  164. docs.append(d)
  165. continue
  166. output_buffer = BytesIO()
  167. if isinstance(d["image"], bytes):
  168. output_buffer = BytesIO(d["image"])
  169. else:
  170. d["image"].save(output_buffer, format='JPEG')
  171. st = timer()
  172. MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
  173. el += timer() - st
  174. d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
  175. del d["image"]
  176. docs.append(d)
  177. cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))
  178. return docs
  179. def init_kb(row):
  180. idxnm = search.index_name(row["tenant_id"])
  181. if ELASTICSEARCH.indexExist(idxnm):
  182. return
  183. return ELASTICSEARCH.createIdx(idxnm, json.load(
  184. open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
  185. def embedding(docs, mdl, parser_config={}, callback=None):
  186. batch_size = 32
  187. tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
  188. re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
  189. tk_count = 0
  190. if len(tts) == len(cnts):
  191. tts_ = np.array([])
  192. for i in range(0, len(tts), batch_size):
  193. vts, c = mdl.encode(tts[i: i + batch_size])
  194. if len(tts_) == 0:
  195. tts_ = vts
  196. else:
  197. tts_ = np.concatenate((tts_, vts), axis=0)
  198. tk_count += c
  199. callback(prog=0.6 + 0.1 * (i + 1) / len(tts), msg="")
  200. tts = tts_
  201. cnts_ = np.array([])
  202. for i in range(0, len(cnts), batch_size):
  203. vts, c = mdl.encode(cnts[i: i + batch_size])
  204. if len(cnts_) == 0:
  205. cnts_ = vts
  206. else:
  207. cnts_ = np.concatenate((cnts_, vts), axis=0)
  208. tk_count += c
  209. callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="")
  210. cnts = cnts_
  211. title_w = float(parser_config.get("filename_embd_weight", 0.1))
  212. vects = (title_w * tts + (1 - title_w) *
  213. cnts) if len(tts) == len(cnts) else cnts
  214. assert len(vects) == len(docs)
  215. for i, d in enumerate(docs):
  216. v = vects[i].tolist()
  217. d["q_%d_vec" % len(v)] = v
  218. return tk_count
  219. def run_raptor(row, chat_mdl, embd_mdl, callback=None):
  220. vts, _ = embd_mdl.encode(["ok"])
  221. vctr_nm = "q_%d_vec"%len(vts[0])
  222. chunks = []
  223. for d in retrievaler.chunk_list(row["doc_id"], row["tenant_id"], fields=["content_with_weight", vctr_nm]):
  224. chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
  225. raptor = Raptor(
  226. row["parser_config"]["raptor"].get("max_cluster", 64),
  227. chat_mdl,
  228. embd_mdl,
  229. row["parser_config"]["raptor"]["prompt"],
  230. row["parser_config"]["raptor"]["max_token"],
  231. row["parser_config"]["raptor"]["threshold"]
  232. )
  233. original_length = len(chunks)
  234. raptor(chunks, row["parser_config"]["raptor"]["random_seed"], callback)
  235. doc = {
  236. "doc_id": row["doc_id"],
  237. "kb_id": [str(row["kb_id"])],
  238. "docnm_kwd": row["name"],
  239. "title_tks": rag_tokenizer.tokenize(row["name"])
  240. }
  241. res = []
  242. tk_count = 0
  243. for content, vctr in chunks[original_length:]:
  244. d = copy.deepcopy(doc)
  245. md5 = hashlib.md5()
  246. md5.update((content + str(d["doc_id"])).encode("utf-8"))
  247. d["_id"] = md5.hexdigest()
  248. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  249. d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
  250. d[vctr_nm] = vctr.tolist()
  251. d["content_with_weight"] = content
  252. d["content_ltks"] = rag_tokenizer.tokenize(content)
  253. d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  254. res.append(d)
  255. tk_count += num_tokens_from_string(content)
  256. return res, tk_count
  257. def main():
  258. rows = collect()
  259. if len(rows) == 0:
  260. return
  261. for _, r in rows.iterrows():
  262. callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
  263. try:
  264. embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
  265. except Exception as e:
  266. callback(-1, msg=str(e))
  267. cron_logger.error(str(e))
  268. continue
  269. if r.get("task_type", "") == "raptor":
  270. try:
  271. chat_mdl = LLMBundle(r["tenant_id"], LLMType.CHAT, llm_name=r["llm_id"], lang=r["language"])
  272. cks, tk_count = run_raptor(r, chat_mdl, embd_mdl, callback)
  273. except Exception as e:
  274. callback(-1, msg=str(e))
  275. cron_logger.error(str(e))
  276. continue
  277. else:
  278. st = timer()
  279. cks = build(r)
  280. cron_logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
  281. if cks is None:
  282. continue
  283. if not cks:
  284. callback(1., "No chunk! Done!")
  285. continue
  286. # TODO: exception handler
  287. ## set_progress(r["did"], -1, "ERROR: ")
  288. callback(
  289. msg="Finished slicing files(%d). Start to embedding the content." %
  290. len(cks))
  291. st = timer()
  292. try:
  293. tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
  294. except Exception as e:
  295. callback(-1, "Embedding error:{}".format(str(e)))
  296. cron_logger.error(str(e))
  297. tk_count = 0
  298. cron_logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
  299. callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))
  300. init_kb(r)
  301. chunk_count = len(set([c["_id"] for c in cks]))
  302. st = timer()
  303. es_r = ""
  304. es_bulk_size = 16
  305. for b in range(0, len(cks), es_bulk_size):
  306. es_r = ELASTICSEARCH.bulk(cks[b:b + es_bulk_size], search.index_name(r["tenant_id"]))
  307. if b % 128 == 0:
  308. callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
  309. cron_logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
  310. if es_r:
  311. callback(-1, "Index failure!")
  312. ELASTICSEARCH.deleteByQuery(
  313. Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
  314. cron_logger.error(str(es_r))
  315. else:
  316. if TaskService.do_cancel(r["id"]):
  317. ELASTICSEARCH.deleteByQuery(
  318. Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
  319. continue
  320. callback(1., "Done!")
  321. DocumentService.increment_chunk_num(
  322. r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
  323. cron_logger.info(
  324. "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
  325. r["id"], tk_count, len(cks), timer() - st))
if __name__ == "__main__":
    # Route peewee's SQL logging through the shared database logger instead
    # of letting it propagate to the root logger.
    peewee_logger = logging.getLogger('peewee')
    peewee_logger.propagate = False
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)

    # Poll forever; collect() sleeps 1s when the queue is empty, so this
    # loop does not busy-wait.
    while True:
        main()