You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

task_executor.py 7.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import datetime
  17. import json
  18. import logging
  19. import os
  20. import hashlib
  21. import copy
  22. import re
  23. import sys
  24. from functools import partial
  25. from timeit import default_timer as timer
  26. from api.db.services.task_service import TaskService
  27. from rag.llm import EmbeddingModel, CvModel
  28. from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
  29. from rag.utils import ELASTICSEARCH
  30. from rag.utils import MINIO
  31. from rag.utils import rmSpace, findMaxTm
  32. from rag.nlp import search
  33. from io import BytesIO
  34. import pandas as pd
  35. from rag.app import laws, paper, presentation, manual
  36. from api.db import LLMType, ParserType
  37. from api.db.services.document_service import DocumentService
  38. from api.db.services.llm_service import LLMBundle
  39. from api.settings import database_logger
  40. from api.utils.file_utils import get_project_base_directory
  41. BATCH_SIZE = 64
  42. FACTORY = {
  43. ParserType.GENERAL.value: laws,
  44. ParserType.PAPER.value: paper,
  45. ParserType.PRESENTATION.value: presentation,
  46. ParserType.MANUAL.value: manual,
  47. ParserType.LAWS.value: laws,
  48. }
  49. def set_progress(task_id, from_page, to_page, prog=None, msg="Processing..."):
  50. cancel = TaskService.do_cancel(task_id)
  51. if cancel:
  52. msg = "Canceled."
  53. prog = -1
  54. if to_page > 0: msg = f"Page({from_page}~{to_page}): " + msg
  55. d = {"progress_msg": msg}
  56. if prog is not None: d["progress"] = prog
  57. try:
  58. TaskService.update_by_id(task_id, d)
  59. except Exception as e:
  60. cron_logger.error("set_progress:({}), {}".format(task_id, str(e)))
  61. if cancel:sys.exit()
  62. """
  63. def chuck_doc(name, binary, tenant_id, cvmdl=None):
  64. suff = os.path.split(name)[-1].lower().split(".")[-1]
  65. if suff.find("pdf") >= 0:
  66. return PDF(binary)
  67. if suff.find("doc") >= 0:
  68. return DOC(binary)
  69. if re.match(r"(xlsx|xlsm|xltx|xltm)", suff):
  70. return EXC(binary)
  71. if suff.find("ppt") >= 0:
  72. return PPT(binary)
  73. if cvmdl and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$",
  74. name.lower()):
  75. txt = cvmdl.describe(binary)
  76. field = TextChunker.Fields()
  77. field.text_chunks = [(txt, binary)]
  78. field.table_chunks = []
  79. return field
  80. return TextChunker()(binary)
  81. """
  82. def collect(comm, mod, tm):
  83. tasks = TaskService.get_tasks(tm, mod, comm)
  84. if len(tasks) == 0:
  85. return pd.DataFrame()
  86. tasks = pd.DataFrame(tasks)
  87. mtm = tasks["update_time"].max()
  88. cron_logger.info("TOTAL:{}, To:{}".format(len(tasks), mtm))
  89. return tasks
  90. def build(row, cvmdl):
  91. if row["size"] > DOC_MAXIMUM_SIZE:
  92. set_progress(row["id"], -1, "File size exceeds( <= %dMb )" %
  93. (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
  94. return []
  95. callback = partial(set_progress, row["id"], row["from_page"], row["to_page"])
  96. chunker = FACTORY[row["parser_id"]]
  97. try:
  98. cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
  99. cks = chunker.chunk(row["name"], MINIO.get(row["kb_id"], row["location"]), row["from_page"], row["to_page"],
  100. callback)
  101. except Exception as e:
  102. if re.search("(No such file|not found)", str(e)):
  103. callback(-1, "Can not find file <%s>" % row["doc_name"])
  104. else:
  105. callback(-1, f"Internal server error: %s" % str(e).replace("'", ""))
  106. cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
  107. return []
  108. callback(msg="Finished slicing files. Start to embedding the content.")
  109. docs = []
  110. doc = {
  111. "doc_id": row["doc_id"],
  112. "kb_id": [str(row["kb_id"])]
  113. }
  114. for ck in cks:
  115. d = copy.deepcopy(doc)
  116. d.update(ck)
  117. md5 = hashlib.md5()
  118. md5.update((ck["content_with_weight"] + str(d["doc_id"])).encode("utf-8"))
  119. d["_id"] = md5.hexdigest()
  120. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  121. if not d.get("image"):
  122. docs.append(d)
  123. continue
  124. output_buffer = BytesIO()
  125. if isinstance(d["image"], bytes):
  126. output_buffer = BytesIO(d["image"])
  127. else:
  128. d["image"].save(output_buffer, format='JPEG')
  129. MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
  130. d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
  131. docs.append(d)
  132. return docs
  133. def init_kb(row):
  134. idxnm = search.index_name(row["tenant_id"])
  135. if ELASTICSEARCH.indexExist(idxnm):
  136. return
  137. return ELASTICSEARCH.createIdx(idxnm, json.load(
  138. open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
  139. def embedding(docs, mdl):
  140. tts, cnts = [d["docnm_kwd"] for d in docs], [d["content_with_weight"] for d in docs]
  141. tk_count = 0
  142. tts, c = mdl.encode(tts)
  143. tk_count += c
  144. cnts, c = mdl.encode(cnts)
  145. tk_count += c
  146. vects = 0.1 * tts + 0.9 * cnts
  147. assert len(vects) == len(docs)
  148. for i, d in enumerate(docs):
  149. v = vects[i].tolist()
  150. d["q_%d_vec" % len(v)] = v
  151. return tk_count
  152. def main(comm, mod):
  153. tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"{comm}-{mod}.tm")
  154. tm = findMaxTm(tm_fnm)
  155. rows = collect(comm, mod, tm)
  156. if len(rows) == 0:
  157. return
  158. tmf = open(tm_fnm, "a+")
  159. for _, r in rows.iterrows():
  160. try:
  161. embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING)
  162. cv_mdl = LLMBundle(r["tenant_id"], LLMType.IMAGE2TEXT)
  163. # TODO: sequence2text model
  164. except Exception as e:
  165. set_progress(r["id"], -1, str(e))
  166. continue
  167. callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
  168. st_tm = timer()
  169. cks = build(r, cv_mdl)
  170. if not cks:
  171. tmf.write(str(r["update_time"]) + "\n")
  172. continue
  173. # TODO: exception handler
  174. ## set_progress(r["did"], -1, "ERROR: ")
  175. try:
  176. tk_count = embedding(cks, embd_mdl)
  177. except Exception as e:
  178. callback(-1, "Embedding error:{}".format(str(e)))
  179. cron_logger.error(str(e))
  180. continue
  181. callback(msg="Finished embedding! Start to build index!")
  182. init_kb(r)
  183. chunk_count = len(set([c["_id"] for c in cks]))
  184. callback(1., "Done!")
  185. es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
  186. if es_r:
  187. callback(-1, "Index failure!")
  188. cron_logger.error(str(es_r))
  189. else:
  190. DocumentService.increment_chunk_num(r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
  191. cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))
  192. tmf.write(str(r["update_time"]) + "\n")
  193. tmf.close()
  194. if __name__ == "__main__":
  195. peewee_logger = logging.getLogger('peewee')
  196. peewee_logger.propagate = False
  197. peewee_logger.addHandler(database_logger.handlers[0])
  198. peewee_logger.setLevel(database_logger.level)
  199. from mpi4py import MPI
  200. comm = MPI.COMM_WORLD
  201. main(comm.Get_size(), comm.Get_rank())