import copy
import re
from collections import Counter

import numpy as np

from api.db import ParserType
from rag.nlp import huqie
from rag.parser import tokenize
from rag.parser.pdf_parser import HuParser
from rag.utils import num_tokens_from_string
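
# Paper-specific chunker: `Pdf` extends HuParser with layout analysis for
# academic papers (title, authors, abstract, tables, body text); chunk() below
# turns the parsed structure into indexable documents.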


class Pdf(HuParser):
    def __init__(self):
        # mark this parser as the paper model
        self.model_speciess = ParserType.PAPER.value
        super().__init__()
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.47, "Layout analysis finished.")
        print("paddle layouts:", timer() - start)

        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished.")

        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False)
        # clean mess
        if column_width < self.page_images[0].size[0] / zoomin / 2:
            # median box width is less than half the page: two-column layout
            print("two_column...................", column_width,
                  self.page_images[0].size[0] / zoomin / 2)
            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())

        # drop text that repeats on most pages (running headers/footers)
        freq = Counter([b["text"] for b in self.boxes])
        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["text"] in garbage \
                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
                self.boxes.pop(i)
            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
                # merge consecutive boxes that belong to the same layout element
                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
                self.boxes.pop(i)
            else:
                i += 1
        def _begin(txt):
            return re.match(
                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
                txt.lower().strip())

        if from_page > 0:
            # continuation pages carry no title/authors/abstract of their own
            return {
                "title": "",
                "authors": "",
                "abstract": "",
                "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                          for b in self.boxes if re.match(r"(text|title)", b.get("layoutno", "text"))],
                "tables": tbls
            }
        # get title and authors
        title = ""
        authors = []
        i = 0
        while i < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            if b.get("layoutno", "").find("title") >= 0:
                title = b["text"]
                if _begin(title):
                    title = ""
                    break
                # take the first non-heading line below the title as the authors
                for j in range(3):
                    if i + j >= len(self.boxes) or _begin(self.boxes[i + j]["text"]):
                        break
                    authors.append(self.boxes[i + j]["text"])
                    break
                break
        # get abstract
        abstr = ""
        i = 0
        while i + 1 < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            txt = b["text"].lower().strip()
            if re.match("(abstract|摘要)", txt):
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    # abstract body sits in the same box as its heading
                    abstr = txt + self._line_tag(b, zoomin)
                    i += 1
                    break
                # otherwise look ahead for a box long enough to be the abstract
                txt = self.boxes[i + 1]["text"].lower().strip()
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
                    i += 1
                    break
        if not abstr:
            i = 0

        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
        for b in self.boxes:
            print(b["text"], b.get("layoutno"))
        print(tbls)
        return {
            "title": title if title else filename,
            "authors": " ".join(authors),
            "abstract": abstr,
            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                      for b in self.boxes[i:] if re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }
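
# chunk() is the module's entry point: it runs the Pdf parser, then emits one
# searchable dict per chunk (table slices, the abstract, colon-introduced item
# lists, and roughly 128-token windows of body text), each carrying tokenized
# fields and a cropped image of its source region.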


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        paper = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else:
        raise NotImplementedError("file type not supported yet (pdf supported)")

    doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
    # is it English?
    eng = pdf_parser.is_english
    print("It's English.....", eng)
    res = []
    # add tables, batching rows so each chunk stays a manageable size
    for img, rows in paper["tables"]:
        bs = 10
        de = ";" if eng else "；"
        for i in range(0, len(rows), bs):
            d = copy.deepcopy(doc)
            r = de.join(rows[i:i + bs])
            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
            tokenize(d, r, eng)
            d["image"] = img
            res.append(d)
    if paper["abstract"]:
        d = copy.deepcopy(doc)
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
        d["image"] = pdf_parser.crop(paper["abstract"])
        tokenize(d, txt, eng)
        res.append(d)
    readed = [0] * len(paper["lines"])
    # find colons first: a line ending with a colon introduces a list of items
    i = 0
    while i + 1 < len(paper["lines"]):
        txt = pdf_parser.remove_tag(paper["lines"][i][0])
        j = i
        if not txt.strip("\n").strip() or txt.strip("\n").strip()[-1] not in ":：":
            i += 1
            continue
        i += 1
        while i < len(paper["lines"]) and not paper["lines"][i][0]:
            i += 1
        if i >= len(paper["lines"]):
            break
        proj = [paper["lines"][i][0].strip()]
        i += 1
        # collect the following lines that start with the same leading character
        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
            proj.append(paper["lines"][i][0].strip())
            i += 1
        for k in range(j, i):
            readed[k] = True
        # keep only the last sentence before the colon (search the reversed text)
        txt = txt[::-1]
        if eng:
            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        else:
            r = re.search(r"(.*?) ([。?;!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        for p in proj:
            d = copy.deepcopy(doc)
            txt += "\n" + pdf_parser.remove_tag(p)
            d["image"] = pdf_parser.crop(p)
            tokenize(d, txt, eng)
            res.append(d)
    i = 0
    chunk = []
    tk_cnt = 0

    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
        ck = "\n".join(chunk)
        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
        d["image"] = pdf_parser.crop(ck)
        res.append(d)
        chunk = []
        tk_cnt = 0

    while i < len(paper["lines"]):
        if tk_cnt > 128:
            add_chunk()
        if readed[i]:
            i += 1
            continue
        readed[i] = True
        txt, layouts = paper["lines"][i]
        txt_ = pdf_parser.remove_tag(txt)
        i += 1
        cnt = num_tokens_from_string(txt_)
        # start a new chunk at a title, or once the current one passes ~128 tokens
        if any([
            layouts.find("title") >= 0 and chunk,
            cnt + tk_cnt > 128 and tk_cnt > 32,
        ]):
            add_chunk()
            chunk = [txt]
            tk_cnt = cnt
        else:
            chunk.append(txt)
            tk_cnt += cnt
    if chunk:
        add_chunk()

    for i, d in enumerate(res):
        print(d)
        # d["image"].save(f"./logs/{i}.jpg")
    return res


if __name__ == "__main__":
    import sys

    def dummy(a, b):
        pass

    chunk(sys.argv[1], callback=dummy)