Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

parse_user_docs.py 9.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284
  1. #
  2. # Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import datetime
  17. import json
  18. import logging
  19. import os
  20. import hashlib
  21. import copy
  22. import time
  23. import random
  24. import re
  25. from timeit import default_timer as timer
  26. from rag.llm import EmbeddingModel, CvModel
  27. from rag.settings import cron_logger, DOC_MAXIMUM_SIZE
  28. from rag.utils import ELASTICSEARCH
  29. from rag.utils import MINIO
  30. from rag.utils import rmSpace, findMaxTm
  31. from rag.nlp import huchunk, huqie, search
  32. from io import BytesIO
  33. import pandas as pd
  34. from elasticsearch_dsl import Q
  35. from PIL import Image
  36. from rag.parser import (
  37. PdfParser,
  38. DocxParser,
  39. ExcelParser
  40. )
  41. from rag.nlp.huchunk import (
  42. PdfChunker,
  43. DocxChunker,
  44. ExcelChunker,
  45. PptChunker,
  46. TextChunker
  47. )
  48. from api.db import LLMType
  49. from api.db.services.document_service import DocumentService
  50. from api.db.services.llm_service import TenantLLMService
  51. from api.settings import database_logger
  52. from api.utils import get_format_time
  53. from api.utils.file_utils import get_project_base_directory
# Max number of chunks handled per embedding/indexing batch.
BATCH_SIZE = 64
# Module-level singleton chunkers: parsers are expensive to construct, so they
# are built once at import time and reused by chuck_doc() for every document.
PDF = PdfChunker(PdfParser())
DOC = DocxChunker(DocxParser())
EXC = ExcelChunker(ExcelParser())
PPT = PptChunker()
  59. def chuck_doc(name, binary, cvmdl=None):
  60. suff = os.path.split(name)[-1].lower().split(".")[-1]
  61. if suff.find("pdf") >= 0:
  62. return PDF(binary)
  63. if suff.find("doc") >= 0:
  64. return DOC(binary)
  65. if re.match(r"(xlsx|xlsm|xltx|xltm)", suff):
  66. return EXC(binary)
  67. if suff.find("ppt") >= 0:
  68. return PPT(binary)
  69. if cvmdl and re.search(r"\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico)$",
  70. name.lower()):
  71. txt = cvmdl.describe(binary)
  72. field = TextChunker.Fields()
  73. field.text_chunks = [(txt, binary)]
  74. field.table_chunks = []
  75. return TextChunker()(binary)
  76. def collect(comm, mod, tm):
  77. docs = DocumentService.get_newly_uploaded(tm, mod, comm)
  78. if len(docs) == 0:
  79. return pd.DataFrame()
  80. docs = pd.DataFrame(docs)
  81. mtm = docs["update_time"].max()
  82. cron_logger.info("TOTAL:{}, To:{}".format(len(docs), mtm))
  83. return docs
  84. def set_progress(docid, prog, msg="Processing...", begin=False):
  85. d = {"progress": prog, "progress_msg": msg}
  86. if begin:
  87. d["process_begin_at"] = get_format_time()
  88. try:
  89. DocumentService.update_by_id(
  90. docid, {"progress": prog, "progress_msg": msg})
  91. except Exception as e:
  92. cron_logger.error("set_progress:({}), {}".format(docid, str(e)))
  93. def build(row, cvmdl):
  94. if row["size"] > DOC_MAXIMUM_SIZE:
  95. set_progress(row["id"], -1, "File size exceeds( <= %dMb )" %
  96. (int(DOC_MAXIMUM_SIZE / 1024 / 1024)))
  97. return []
  98. # res = ELASTICSEARCH.search(Q("term", doc_id=row["id"]))
  99. # if ELASTICSEARCH.getTotal(res) > 0:
  100. # ELASTICSEARCH.updateScriptByQuery(Q("term", doc_id=row["id"]),
  101. # scripts="""
  102. # if(!ctx._source.kb_id.contains('%s'))
  103. # ctx._source.kb_id.add('%s');
  104. # """ % (str(row["kb_id"]), str(row["kb_id"])),
  105. # idxnm=search.index_name(row["tenant_id"])
  106. # )
  107. # set_progress(row["id"], 1, "Done")
  108. # return []
  109. random.seed(time.time())
  110. set_progress(row["id"], random.randint(0, 20) /
  111. 100., "Finished preparing! Start to slice file!", True)
  112. try:
  113. cron_logger.info("Chunkking {}/{}".format(row["location"], row["name"]))
  114. obj = chuck_doc(row["name"], MINIO.get(row["kb_id"], row["location"]), cvmdl)
  115. except Exception as e:
  116. if re.search("(No such file|not found)", str(e)):
  117. set_progress(
  118. row["id"], -1, "Can not find file <%s>" %
  119. row["doc_name"])
  120. else:
  121. set_progress(
  122. row["id"], -1, f"Internal server error: %s" %
  123. str(e).replace(
  124. "'", ""))
  125. cron_logger.warn("Chunkking {}/{}: {}".format(row["location"], row["name"], str(e)))
  126. return []
  127. if not obj.text_chunks and not obj.table_chunks:
  128. set_progress(
  129. row["id"],
  130. 1,
  131. "Nothing added! Mostly, file type unsupported yet.")
  132. return []
  133. set_progress(row["id"], random.randint(20, 60) / 100.,
  134. "Finished slicing files. Start to embedding the content.")
  135. doc = {
  136. "doc_id": row["id"],
  137. "kb_id": [str(row["kb_id"])],
  138. "docnm_kwd": os.path.split(row["location"])[-1],
  139. "title_tks": huqie.qie(row["name"])
  140. }
  141. doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
  142. output_buffer = BytesIO()
  143. docs = []
  144. md5 = hashlib.md5()
  145. for txt, img in obj.text_chunks:
  146. d = copy.deepcopy(doc)
  147. md5.update((txt + str(d["doc_id"])).encode("utf-8"))
  148. d["_id"] = md5.hexdigest()
  149. d["content_ltks"] = huqie.qie(txt)
  150. d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
  151. if not img:
  152. docs.append(d)
  153. continue
  154. if isinstance(img, bytes):
  155. output_buffer = BytesIO(img)
  156. else:
  157. img.save(output_buffer, format='JPEG')
  158. MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
  159. d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
  160. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  161. docs.append(d)
  162. for arr, img in obj.table_chunks:
  163. for i, txt in enumerate(arr):
  164. d = copy.deepcopy(doc)
  165. d["content_ltks"] = huqie.qie(txt)
  166. md5.update((txt + str(d["doc_id"])).encode("utf-8"))
  167. d["_id"] = md5.hexdigest()
  168. if not img:
  169. docs.append(d)
  170. continue
  171. img.save(output_buffer, format='JPEG')
  172. MINIO.put(row["kb_id"], d["_id"], output_buffer.getvalue())
  173. d["img_id"] = "{}-{}".format(row["kb_id"], d["_id"])
  174. d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
  175. docs.append(d)
  176. set_progress(row["id"], random.randint(60, 70) /
  177. 100., "Continue embedding the content.")
  178. return docs
  179. def init_kb(row):
  180. idxnm = search.index_name(row["tenant_id"])
  181. if ELASTICSEARCH.indexExist(idxnm):
  182. return
  183. return ELASTICSEARCH.createIdx(idxnm, json.load(
  184. open(os.path.join(get_project_base_directory(), "conf", "mapping.json"), "r")))
  185. def embedding(docs, mdl):
  186. tts, cnts = [rmSpace(d["title_tks"]) for d in docs], [rmSpace(d["content_ltks"]) for d in docs]
  187. tk_count = 0
  188. tts, c = mdl.encode(tts)
  189. tk_count += c
  190. cnts, c = mdl.encode(cnts)
  191. tk_count += c
  192. vects = 0.1 * tts + 0.9 * cnts
  193. assert len(vects) == len(docs)
  194. for i, d in enumerate(docs):
  195. v = vects[i].tolist()
  196. d["q_%d_vec"%len(v)] = v
  197. return tk_count
  198. def main(comm, mod):
  199. global model
  200. from rag.llm import HuEmbedding
  201. model = HuEmbedding()
  202. tm_fnm = os.path.join(get_project_base_directory(), "rag/res", f"{comm}-{mod}.tm")
  203. tm = findMaxTm(tm_fnm)
  204. rows = collect(comm, mod, tm)
  205. if len(rows) == 0:
  206. return
  207. tmf = open(tm_fnm, "a+")
  208. for _, r in rows.iterrows():
  209. embd_mdl = TenantLLMService.model_instance(r["tenant_id"], LLMType.EMBEDDING)
  210. if not embd_mdl:
  211. set_progress(r["id"], -1, "Can't find embedding model!")
  212. cron_logger.error("Tenant({}) can't find embedding model!".format(r["tenant_id"]))
  213. continue
  214. cv_mdl = TenantLLMService.model_instance(r["tenant_id"], LLMType.IMAGE2TEXT)
  215. st_tm = timer()
  216. cks = build(r, cv_mdl)
  217. if not cks:
  218. tmf.write(str(r["update_time"]) + "\n")
  219. continue
  220. # TODO: exception handler
  221. ## set_progress(r["did"], -1, "ERROR: ")
  222. try:
  223. tk_count = embedding(cks, embd_mdl)
  224. except Exception as e:
  225. set_progress(r["id"], -1, "Embedding error:{}".format(str(e)))
  226. cron_logger.error(str(e))
  227. continue
  228. set_progress(r["id"], random.randint(70, 95) / 100.,
  229. "Finished embedding! Start to build index!")
  230. init_kb(r)
  231. es_r = ELASTICSEARCH.bulk(cks, search.index_name(r["tenant_id"]))
  232. if es_r:
  233. set_progress(r["id"], -1, "Index failure!")
  234. cron_logger.error(str(es_r))
  235. else:
  236. set_progress(r["id"], 1., "Done!")
  237. DocumentService.increment_chunk_num(r["id"], r["kb_id"], tk_count, len(cks), timer()-st_tm)
  238. cron_logger.info("Chunk doc({}), token({}), chunks({})".format(r["id"], tk_count, len(cks)))
  239. tmf.write(str(r["update_time"]) + "\n")
  240. tmf.close()
if __name__ == "__main__":
    # Route peewee's ORM logs through the configured database logger instead
    # of letting them propagate to the root logger.
    peewee_logger = logging.getLogger('peewee')
    peewee_logger.propagate = False
    peewee_logger.addHandler(database_logger.handlers[0])
    peewee_logger.setLevel(database_logger.level)
    # Each MPI rank processes its own shard: main(world_size, rank).
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    main(comm.Get_size(), comm.Get_rank())