You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import datetime
  17. import json
  18. import logging
  19. import os
  20. import hashlib
  21. import copy
  22. import re
  23. import sys
  24. import time
  25. import traceback
  26. from concurrent.futures import ThreadPoolExecutor
  27. from functools import partial
  28. from io import BytesIO
  29. from multiprocessing.context import TimeoutError
  30. from timeit import default_timer as timer
  31. import numpy as np
  32. import pandas as pd
  33. from elasticsearch_dsl import Q
  34. from api.db import LLMType, ParserType
  35. from api.db.services.dialog_service import keyword_extraction, question_proposal
  36. from api.db.services.document_service import DocumentService
  37. from api.db.services.llm_service import LLMBundle
  38. from api.db.services.task_service import TaskService
  39. from api.db.services.file2document_service import File2DocumentService
  40. from api.settings import retrievaler
  41. from api.utils.file_utils import get_project_base_directory
  42. from api.db.db_models import close_connection
  43. from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, knowledge_graph, email
  44. from rag.nlp import search, rag_tokenizer
  45. from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
  46. from rag.settings import database_logger, SVR_QUEUE_NAME
  47. from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
  48. from rag.utils import rmSpace, num_tokens_from_string
  49. from rag.utils.es_conn import ELASTICSEARCH
  50. from rag.utils.redis_conn import REDIS_CONN, Payload
  51. from rag.utils.storage_factory import STORAGE_IMPL
# Number of chunks handled per processing batch.
BATCH_SIZE = 64

# Maps a task row's parser_id to the chunker module implementing that
# document layout. "general" is an alias for the naive parser.
FACTORY = {
    "general": naive,
    ParserType.NAIVE.value: naive,
    ParserType.PAPER.value: paper,
    ParserType.BOOK.value: book,
    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
    ParserType.AUDIO.value: audio,
    ParserType.EMAIL.value: email,
    ParserType.KG.value: knowledge_graph
}

# Unique consumer name for this worker; the optional first CLI argument
# distinguishes concurrent workers (defaults to "0").
CONSUMER_NAME = "task_consumer_" + ("0" if len(sys.argv) < 2 else sys.argv[1])
# Redis message currently being processed; acked and cleared when the task
# finishes or is canceled.
PAYLOAD: Payload | None = None
  72. def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
  73. global PAYLOAD
  74. if prog is not None and prog < 0:
  75. msg = "[ERROR]" + msg
  76. cancel = TaskService.do_cancel(task_id)
  77. if cancel:
  78. msg += " [Canceled]"
  79. prog = -1
  80. if to_page > 0:
  81. if msg:
  82. msg = f"Page({from_page + 1}~{to_page + 1}): " + msg
  83. d = {"progress_msg": msg}
  84. if prog is not None:
  85. d["progress"] = prog
  86. try:
  87. TaskService.update_progress(task_id, d)
  88. except Exception as e:
  89. cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
  90. close_connection()
  91. if cancel:
  92. if PAYLOAD:
  93. PAYLOAD.ack()
  94. PAYLOAD = None
  95. os._exit(0)
  96. def collect():
  97. global CONSUMER_NAME, PAYLOAD
  98. try:
  99. PAYLOAD = REDIS_CONN.get_unacked_for(CONSUMER_NAME, SVR_QUEUE_NAME, "rag_flow_svr_task_broker")
  100. if not PAYLOAD:
  101. PAYLOAD = REDIS_CONN.queue_consumer(SVR_QUEUE_NAME, "rag_flow_svr_task_broker", CONSUMER_NAME)
  102. if not PAYLOAD:
  103. time.sleep(1)
  104. return pd.DataFrame()
  105. except Exception as e:
  106. cron_logger.error("Get task event from queue exception:" + str(e))
  107. return pd.DataFrame()
  108. msg = PAYLOAD.get_message()
  109. if not msg:
  110. return pd.DataFrame()
  111. if TaskService.do_cancel(msg["id"]):
  112. cron_logger.info("Task {} has been canceled.".format(msg["id"]))
  113. return pd.DataFrame()
  114. tasks = TaskService.get_tasks(msg["id"])
  115. if not tasks:
  116. cron_logger.warn("{} empty task!".format(msg["id"]))
  117. return []
  118. tasks = pd.DataFrame(tasks)
  119. if msg.get("type", "") == "raptor":
  120. tasks["task_type"] = "raptor"
  121. return tasks
  122. def get_storage_binary(bucket, name):
  123. return STORAGE_IMPL.get(bucket, name)
def build(row):
    """Download, parse and chunk the document described by task *row*.

    Returns the list of chunk dicts ready for embedding/indexing, [] when
    the file exceeds the size limit, or None when fetching or chunking
    failed (the failure has already been reported through the progress
    callback).
    """
    if row["size"] > DOC_MAXIMUM_SIZE:
        set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
                     (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
        return []

    # Progress reporter pre-bound to this task's id and page range.
    callback = partial(
        set_progress,
        row["id"],
        row["from_page"],
        row["to_page"])
    # Select the chunker module matching the document's parser type.
    chunker = FACTORY[row["parser_id"].lower()]
    try:
        st = timer()
        bucket, name = File2DocumentService.get_storage_address(doc_id=row["doc_id"])
        binary = get_storage_binary(bucket, name)
        cron_logger.info(
            "From minio({}) {}/{}".format(timer() - st, row["location"], row["name"]))
    except TimeoutError:
        callback(-1, "Internal server error: Fetch file from minio timeout. Could you try it again.")
        cron_logger.error(
            "Minio {}/{}: Fetch file from minio timeout.".format(row["location"], row["name"]))
        return
    except Exception as e:
        # Distinguish "object missing" from other storage errors for the user.
        if re.search("(No such file|not found)", str(e)):
            callback(-1, "Can not find file <%s> from minio. Could you try it again?" % row["name"])
        else:
            callback(-1, "Get file from minio: %s" % str(e).replace("'", ""))
        traceback.print_exc()
        return

    try:
        cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
                            to_page=row["to_page"], lang=row["language"], callback=callback,
                            kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
        cron_logger.info(
            "Chunking({}) {}/{}".format(timer() - st, row["location"], row["name"]))
    except Exception as e:
        callback(-1, "Internal server error while chunking: %s" %
                 str(e).replace("'", ""))
        cron_logger.error(
            "Chunking {}/{}: {}".format(row["location"], row["name"], str(e)))
        traceback.print_exc()
        return

    docs = []
    # Fields shared by every chunk of this document.
    doc = {
        "doc_id": row["doc_id"],
        "kb_id": [str(row["kb_id"])]
    }
    el = 0  # cumulative seconds spent uploading chunk images to storage
    for ck in cks:
        d = copy.deepcopy(doc)
        d.update(ck)
        # Deterministic chunk id: MD5 over content plus parent doc id.
        md5 = hashlib.md5()
        md5.update((ck["content_with_weight"] +
                    str(d["doc_id"])).encode("utf-8"))
        d["_id"] = md5.hexdigest()
        d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
        d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
        if not d.get("image"):
            docs.append(d)
            continue
        # The chunk carries an image: persist it to storage under the chunk id.
        try:
            output_buffer = BytesIO()
            if isinstance(d["image"], bytes):
                output_buffer = BytesIO(d["image"])
            else:
                # presumably a PIL Image — serialized as JPEG; TODO confirm
                d["image"].save(output_buffer, format='JPEG')
            st = timer()
            STORAGE_IMPL.put(row["kb_id"], d["_id"], output_buffer.getvalue())
            el += timer() - st
        except Exception as e:
            # Best effort: a failed image upload does not drop the chunk.
            cron_logger.error(str(e))
            traceback.print_exc()
        d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
        del d["image"]
        docs.append(d)
    cron_logger.info("MINIO PUT({}):{}".format(row["name"], el))

    # Optional LLM post-processing configured per knowledge base.
    if row["parser_config"].get("auto_keywords", 0):
        callback(msg="Start to generate keywords for every chunk ...")
        chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
        for d in docs:
            d["important_kwd"] = keyword_extraction(chat_mdl, d["content_with_weight"],
                                                    row["parser_config"]["auto_keywords"]).split(",")
            d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))

    if row["parser_config"].get("auto_questions", 0):
        callback(msg="Start to generate questions for every chunk ...")
        chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
        for d in docs:
            qst = question_proposal(chat_mdl, d["content_with_weight"], row["parser_config"]["auto_questions"])
            d["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + d["content_with_weight"]
            qst = rag_tokenizer.tokenize(qst)
            if "content_ltks" in d:
                d["content_ltks"] += " " + qst
            if "content_sm_ltks" in d:
                d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
    return docs
  219. def init_kb(row):
  220. idxnm = search.index_name(row["tenant_id"])
  221. if ELASTICSEARCH.indexExist(idxnm):
  222. return
  223. return ELASTICSEARCH.createIdx(idxnm, json.load(
  224. open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
  225. def embedding(docs, mdl, parser_config=None, callback=None):
  226. if parser_config is None:
  227. parser_config = {}
  228. batch_size = 32
  229. tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
  230. re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
  231. tk_count = 0
  232. if len(tts) == len(cnts):
  233. tts_ = np.array([])
  234. for i in range(0, len(tts), batch_size):
  235. vts, c = mdl.encode(tts[i: i + batch_size])
  236. if len(tts_) == 0:
  237. tts_ = vts
  238. else:
  239. tts_ = np.concatenate((tts_, vts), axis=0)
  240. tk_count += c
  241. callback(prog=0.6 + 0.1 * (i + 1) / len(tts), msg="")
  242. tts = tts_
  243. cnts_ = np.array([])
  244. for i in range(0, len(cnts), batch_size):
  245. vts, c = mdl.encode(cnts[i: i + batch_size])
  246. if len(cnts_) == 0:
  247. cnts_ = vts
  248. else:
  249. cnts_ = np.concatenate((cnts_, vts), axis=0)
  250. tk_count += c
  251. callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="")
  252. cnts = cnts_
  253. title_w = float(parser_config.get("filename_embd_weight", 0.1))
  254. vects = (title_w * tts + (1 - title_w) *
  255. cnts) if len(tts) == len(cnts) else cnts
  256. assert len(vects) == len(docs)
  257. for i, d in enumerate(docs):
  258. v = vects[i].tolist()
  259. d["q_%d_vec" % len(v)] = v
  260. return tk_count
  261. def run_raptor(row, chat_mdl, embd_mdl, callback=None):
  262. vts, _ = embd_mdl.encode(["ok"])
  263. vctr_nm = "q_%d_vec" % len(vts[0])
  264. chunks = []
  265. for d in retrievaler.chunk_list(row["doc_id"], row["tenant_id"], fields=["content_with_weight", vctr_nm]):
  266. chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
  267. raptor = Raptor(
  268. row["parser_config"]["raptor"].get("max_cluster", 64),
  269. chat_mdl,
  270. embd_mdl,
  271. row["parser_config"]["raptor"]["prompt"],
  272. row["parser_config"]["raptor"]["max_token"],
  273. row["parser_config"]["raptor"]["threshold"]
  274. )
  275. original_length = len(chunks)
  276. raptor(chunks, row["parser_config"]["raptor"]["random_seed"], callback)
  277. doc = {
  278. "doc_id": row["doc_id"],
  279. "kb_id": [str(row["kb_id"])],
  280. "docnm_kwd": row["name"],
  281. "title_tks": rag_tokenizer.tokenize(row["name"])
  282. }
  283. res = []
  284. tk_count = 0
  285. for content, vctr in chunks[original_length:]:
  286. d = copy.deepcopy(doc)
  287. md5 = hashlib.md5()
  288. md5.update((content + str(d["doc_id"])).encode("utf-8"))
  289. d["_id"] = md5.hexdigest()
  290. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  291. d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
  292. d[vctr_nm] = vctr.tolist()
  293. d["content_with_weight"] = content
  294. d["content_ltks"] = rag_tokenizer.tokenize(content)
  295. d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
  296. res.append(d)
  297. tk_count += num_tokens_from_string(content)
  298. return res, tk_count
def main():
    """One work cycle: pull a task batch and process each row end-to-end.

    For each task: load the embedding model, then either run RAPTOR
    (task_type == "raptor") or fetch+chunk the file and embed the chunks,
    and finally bulk-index everything into Elasticsearch, reporting
    progress along the way.
    """
    rows = collect()
    if len(rows) == 0:
        return

    for _, r in rows.iterrows():
        callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
        try:
            embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
        except Exception as e:
            callback(-1, msg=str(e))
            cron_logger.error(str(e))
            continue

        if r.get("task_type", "") == "raptor":
            try:
                chat_mdl = LLMBundle(r["tenant_id"], LLMType.CHAT, llm_name=r["llm_id"], lang=r["language"])
                cks, tk_count = run_raptor(r, chat_mdl, embd_mdl, callback)
            except Exception as e:
                callback(-1, msg=str(e))
                cron_logger.error(str(e))
                continue
        else:
            st = timer()
            cks = build(r)
            cron_logger.info("Build chunks({}): {}".format(r["name"], timer() - st))
            # build() returns None on fetch/chunk failure (already reported),
            # [] when the file was rejected or produced nothing.
            if cks is None:
                continue
            if not cks:
                callback(1., "No chunk! Done!")
                continue
            # TODO: exception handler
            callback(
                msg="Finished slicing files(%d). Start to embedding the content." %
                    len(cks))
            st = timer()
            try:
                tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
            except Exception as e:
                callback(-1, "Embedding error:{}".format(str(e)))
                cron_logger.error(str(e))
                tk_count = 0
            cron_logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
            callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))

        init_kb(r)
        chunk_count = len(set([c["_id"] for c in cks]))
        st = timer()
        es_r = ""
        es_bulk_size = 4
        # Index in small bulks; es_r ends up holding the last bulk's error
        # report (falsy when all bulks succeeded).
        for b in range(0, len(cks), es_bulk_size):
            es_r = ELASTICSEARCH.bulk(cks[b:b + es_bulk_size], search.index_name(r["tenant_id"]))
            if b % 128 == 0:
                callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
        cron_logger.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
        if es_r:
            # Indexing failed: roll back everything indexed for this doc.
            callback(-1, "Insert chunk error, detail info please check ragflow-logs/api/cron_logger.log. Please also check ES status!")
            ELASTICSEARCH.deleteByQuery(
                Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
            cron_logger.error(str(es_r))
        else:
            # Late cancel check: drop the freshly indexed chunks.
            if TaskService.do_cancel(r["id"]):
                ELASTICSEARCH.deleteByQuery(
                    Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
                continue
            callback(1., "Done!")
            DocumentService.increment_chunk_num(
                r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
            cron_logger.info(
                "Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
                    r["id"], tk_count, len(cks), timer() - st))
  368. def report_status():
  369. global CONSUMER_NAME
  370. while True:
  371. try:
  372. obj = REDIS_CONN.get("TASKEXE")
  373. if not obj: obj = {}
  374. else: obj = json.loads(obj)
  375. if CONSUMER_NAME not in obj: obj[CONSUMER_NAME] = []
  376. obj[CONSUMER_NAME].append(timer())
  377. obj[CONSUMER_NAME] = obj[CONSUMER_NAME][-60:]
  378. REDIS_CONN.set_obj("TASKEXE", obj, 60*2)
  379. except Exception as e:
  380. print("[Exception]:", str(e))
  381. time.sleep(30)
if __name__ == "__main__":
    # Route peewee's log records through the database logger's handler and
    # level instead of propagating to the root logger.
    peewee_logger = logging.getLogger('peewee')
    peewee_logger.propagate = False
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)

    # Background heartbeat thread (report_status never returns).
    exe = ThreadPoolExecutor(max_workers=1)
    exe.submit(report_status)

    # Main work loop: process one task batch per iteration, then ack the
    # Redis message that delivered it.
    while True:
        main()
        if PAYLOAD:
            PAYLOAD.ack()
            PAYLOAD = None