Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import datetime
  17. import json
  18. import logging
  19. import os
  20. import hashlib
  21. import copy
  22. import re
  23. import sys
  24. import time
  25. import traceback
  26. from functools import partial
  27. from api.db.db_models import close_connection
  28. from rag.settings import database_logger
  29. from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
  30. from multiprocessing import Pool
  31. import numpy as np
  32. from elasticsearch_dsl import Q
  33. from multiprocessing.context import TimeoutError
  34. from api.db.services.task_service import TaskService
  35. from rag.utils import ELASTICSEARCH
  36. from rag.utils import MINIO
  37. from rag.utils import rmSpace, findMaxTm
  38. from rag.nlp import search
  39. from io import BytesIO
  40. import pandas as pd
  41. from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
  42. from api.db import LLMType, ParserType
  43. from api.db.services.document_service import DocumentService
  44. from api.db.services.llm_service import LLMBundle
  45. from api.utils.file_utils import get_project_base_directory
# NOTE(review): BATCH_SIZE is not referenced anywhere in this file's visible
# code — presumably consumed by an importer; confirm before removing.
BATCH_SIZE = 64

# Dispatch table: maps a document's parser_id (lower-cased) to the chunking
# module that knows how to split that document type. Used by build().
# "general" is an extra key that also resolves to the naive chunker.
FACTORY = {
    "general": naive,
    ParserType.NAIVE.value: naive,
    ParserType.PAPER.value: paper,
    ParserType.BOOK.value: book,
    ParserType.PRESENTATION.value: presentation,
    ParserType.MANUAL.value: manual,
    ParserType.LAWS.value: laws,
    ParserType.QA.value: qa,
    ParserType.TABLE.value: table,
    ParserType.RESUME.value: resume,
    ParserType.PICTURE.value: picture,
    ParserType.ONE.value: one,
}
  61. def set_progress(task_id, from_page=0, to_page=-1,
  62. prog=None, msg="Processing..."):
  63. if prog is not None and prog < 0:
  64. msg = "[ERROR]" + msg
  65. cancel = TaskService.do_cancel(task_id)
  66. if cancel:
  67. msg += " [Canceled]"
  68. prog = -1
  69. if to_page > 0:
  70. if msg:
  71. msg = f"Page({from_page+1}~{to_page+1}): " + msg
  72. d = {"progress_msg": msg}
  73. if prog is not None:
  74. d["progress"] = prog
  75. try:
  76. TaskService.update_progress(task_id, d)
  77. except Exception as e:
  78. cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
  79. if cancel:
  80. sys.exit()
  81. def collect(comm, mod, tm):
  82. tasks = TaskService.get_tasks(tm, mod, comm)
  83. if len(tasks) == 0:
  84. time.sleep(1)
  85. return pd.DataFrame()
  86. tasks = pd.DataFrame(tasks)
  87. mtm = tasks["update_time"].max()
  88. cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
  89. return tasks
  90. def get_minio_binary(bucket, name):
  91. global MINIO
  92. return MINIO.get(bucket, name)
  93. def build(row):
  94. from timeit import default_timer as timer
  95. if row["size"] > DOC_MAXIMUM_SIZE:
  96. set_progress(row["id"], prog=-1, msg="File size exceeds( <= %dMb )" %
  97. (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
  98. return []
  99. callback = partial(
  100. set_progress,
  101. row["id"],
  102. row["from_page"],
  103. row["to_page"])
  104. chunker = FACTORY[row["parser_id"].lower()]
  105. pool = Pool(processes=1)
  106. try:
  107. st = timer()
  108. thr = pool.apply_async(get_minio_binary, args=(row["kb_id"], row["location"]))
  109. binary = thr.get(timeout=90)
  110. pool.terminate()
  111. cron_logger.info(
  112. "From minio({}) {}/{}".format(timer()-st, row["location"], row["name"]))
  113. cks = chunker.chunk(row["name"], binary=binary, from_page=row["from_page"],
  114. to_page=row["to_page"], lang=row["language"], callback=callback,
  115. kb_id=row["kb_id"], parser_config=row["parser_config"], tenant_id=row["tenant_id"])
  116. cron_logger.info(
  117. "Chunkking({}) {}/{}".format(timer()-st, row["location"], row["name"]))
  118. except TimeoutError as e:
  119. callback(-1, f"Internal server error: Fetch file timeout. Could you try it again.")
  120. cron_logger.error(
  121. "Chunkking {}/{}: Fetch file timeout.".format(row["location"], row["name"]))
  122. return
  123. except Exception as e:
  124. if re.search("(No such file|not found)", str(e)):
  125. callback(-1, "Can not find file <%s>" % row["name"])
  126. else:
  127. callback(-1, f"Internal server error: %s" %
  128. str(e).replace("'", ""))
  129. pool.terminate()
  130. traceback.print_exc()
  131. cron_logger.error(
  132. "Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
  133. return
  134. docs = []
  135. doc = {
  136. "doc_id": row["doc_id"],
  137. "kb_id": [str(row["kb_id"])]
  138. }
  139. for ck in cks:
  140. d = copy.deepcopy(doc)
  141. d.update(ck)
  142. md5 = hashlib.md5()
  143. md5.update((ck["content_with_weight"] +
  144. str(d["doc_id"])).encode("utf-8"))
  145. d["_id"] = md5.hexdigest()
  146. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  147. d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
  148. if not d.get("image"):
  149. docs.append(d)
  150. continue
  151. output_buffer = BytesIO()
  152. if isinstance(d["image"], bytes):
  153. output_buffer = BytesIO(d["image"])
  154. else:
  155. d["image"].save(output_buffer, format='JPEG')
  156. MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
  157. d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
  158. del d["image"]
  159. docs.append(d)
  160. return docs
  161. def init_kb(row):
  162. idxnm = search.index_name(row["tenant_id"])
  163. if ELASTICSEARCH.indexExist(idxnm):
  164. return
  165. return ELASTICSEARCH.createIdx(idxnm, json.load(
  166. open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
  167. def embedding(docs, mdl, parser_config={}, callback=None):
  168. batch_size = 32
  169. tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
  170. re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
  171. tk_count = 0
  172. if len(tts) == len(cnts):
  173. tts_ = np.array([])
  174. for i in range(0, len(tts), batch_size):
  175. vts, c = mdl.encode(tts[i: i + batch_size])
  176. if len(tts_) == 0:
  177. tts_ = vts
  178. else:
  179. tts_ = np.concatenate((tts_, vts), axis=0)
  180. tk_count += c
  181. callback(prog=0.6 + 0.1 * (i + 1) / len(tts), msg="")
  182. tts = tts_
  183. cnts_ = np.array([])
  184. for i in range(0, len(cnts), batch_size):
  185. vts, c = mdl.encode(cnts[i: i + batch_size])
  186. if len(cnts_) == 0:
  187. cnts_ = vts
  188. else:
  189. cnts_ = np.concatenate((cnts_, vts), axis=0)
  190. tk_count += c
  191. callback(prog=0.7 + 0.2 * (i + 1) / len(cnts), msg="")
  192. cnts = cnts_
  193. title_w = float(parser_config.get("filename_embd_weight", 0.1))
  194. vects = (title_w * tts + (1 - title_w) *
  195. cnts) if len(tts) == len(cnts) else cnts
  196. assert len(vects) == len(docs)
  197. for i, d in enumerate(docs):
  198. v = vects[i].tolist()
  199. d["q_%d_vec" % len(v)] = v
  200. return tk_count
  201. def main(comm, mod):
  202. tm_fnm = os.path.join(
  203. get_project_base_directory(),
  204. "rag/res",
  205. f"{comm}-{mod}.tm")
  206. tm = findMaxTm(tm_fnm)
  207. rows = collect(comm, mod, tm)
  208. if len(rows) == 0:
  209. return
  210. tmf = open(tm_fnm, "a+")
  211. for _, r in rows.iterrows():
  212. callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
  213. try:
  214. embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
  215. except Exception as e:
  216. traceback.print_stack(e)
  217. callback(prog=-1, msg=str(e))
  218. continue
  219. cks = build(r)
  220. if cks is None:
  221. continue
  222. if not cks:
  223. tmf.write(str(r["update_time"]) + "\n")
  224. callback(1., "No chunk! Done!")
  225. continue
  226. # TODO: exception handler
  227. ## set_progress(r["did"], -1, "ERROR: ")
  228. callback(
  229. msg="Finished slicing files(%d). Start to embedding the content." %
  230. len(cks))
  231. try:
  232. tk_count = embedding(cks, embd_mdl, r["parser_config"], callback)
  233. except Exception as e:
  234. callback(-1, "Embedding error:{}".format(str(e)))
  235. cron_logger.error(str(e))
  236. tk_count = 0
  237. callback(msg="Finished embedding! Start to build index!")
  238. init_kb(r)
  239. chunk_count = len(set([c["_id"] for c in cks]))
  240. es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
  241. if es_r:
  242. callback(-1, "Index failure!")
  243. ELASTICSEARCH.deleteByQuery(
  244. Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
  245. cron_logger.error(str(es_r))
  246. else:
  247. if TaskService.do_cancel(r["id"]):
  248. ELASTICSEARCH.deleteByQuery(
  249. Q("match", doc_id=r["doc_id"]), idxnm=search.index_name(r["tenant_id"]))
  250. continue
  251. callback(1., "Done!")
  252. DocumentService.increment_chunk_num(
  253. r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
  254. cron_logger.info(
  255. "Chunk doc({}), token({}), chunks({})".format(
  256. r["id"], tk_count, len(cks)))
  257. tmf.write(str(r["update_time"]) + "\n")
  258. tmf.close()
if __name__ == "__main__":
    # Route peewee's SQL logging through the shared database logger instead
    # of letting it propagate to the root logger.
    peewee_logger = logging.getLogger('peewee')
    peewee_logger.propagate = False
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)

    from mpi4py import MPI
    # NOTE(review): `comm` is assigned but never used below — the worker
    # identity comes from argv instead; confirm whether MPI rank/size were
    # intended here.
    comm = MPI.COMM_WORLD

    # Poll forever: argv[2] and argv[1] are forwarded to main(comm, mod)
    # and ultimately to TaskService.get_tasks; presumably worker count and
    # worker id — verify against the launcher script.
    while True:
        main(int(sys.argv[2]), int(sys.argv[1]))
        # Release the (peewee) DB connection between polling passes.
        close_connection()