paper.py

import copy
import re
from collections import Counter

import numpy as np

from api.db import ParserType
from rag.cv.ppdetection import PPDet
from rag.nlp import huqie
from rag.parser import tokenize
from rag.parser.pdf_parser import HuParser
from rag.utils import num_tokens_from_string
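
# Glue code for chunking academic papers: the Pdf parser below runs OCR,
# layout analysis and table detection over a PDF, extracts the title, authors,
# abstract and body lines, and chunk() packs everything into indexable chunks.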


class Pdf(HuParser):
    def __init__(self):
        self.model_speciess = ParserType.PAPER.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback(0.47, "Layout analysis finished.")
        print("paddle layouts:", timer() - start)

        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished.")

        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False)

        # Clean up: if the median box width indicates a two-column layout,
        # re-sort boxes column by column; then collapse runs of whitespace.
        if column_width < self.page_images[0].size[0] / zoomin / 2:
            print("two_column...................", column_width,
                  self.page_images[0].size[0] / zoomin / 2)
            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
        # Drop garbage boxes: text that repeats on most pages (headers/footers),
        # stray alphanumerics with no layout label, and consecutive duplicates.
        freq = Counter([b["text"] for b in self.boxes])
        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["text"] in garbage \
                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
                self.boxes.pop(i)
            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
                # Merge adjacent boxes that belong to the same layout region.
                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
                self.boxes.pop(i)
            else:
                i += 1

        def _begin(txt):
            # Does this line start a front-matter/heading section?
            return re.match(
                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
                txt.lower().strip())

        # Continuation pages carry no front matter; return the body lines only.
        if from_page > 0:
            return {
                "title": "",
                "authors": "",
                "abstract": "",
                "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                          for b in self.boxes
                          if re.match(r"(text|title)", b.get("layoutno", "text"))],
                "tables": tbls
            }

        # Get the title and authors: the first "title" box is the title, and
        # up to three boxes right after it are taken as author lines, stopping
        # at the first one that begins a section.
        title = ""
        authors = []
        i = 0
        while i < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            if b.get("layoutno", "").find("title") >= 0:
                title = b["text"]
                if _begin(title):
                    title = ""
                    break
                for j in range(3):
                    if i + j >= len(self.boxes) or _begin(self.boxes[i + j]["text"]):
                        break
                    authors.append(self.boxes[i + j]["text"])
                break

        # Get the abstract: find a box starting with "abstract"/"摘要"; use it
        # if it is long enough, otherwise fall back to the box after the next.
        abstr = ""
        i = 0
        while i + 1 < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            txt = b["text"].lower().strip()
            if re.match("(abstract|摘要)", txt):
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(b, zoomin)
                    i += 1
                    break
                if i + 1 >= len(self.boxes):
                    break
                txt = self.boxes[i + 1]["text"].lower().strip()
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
                    i += 1
                    break
        if not abstr:
            i = 0

        callback(0.8, "Page {}~{}: Text merging finished".format(
            from_page, min(to_page, self.total_page)))
        for b in self.boxes:
            print(b["text"], b.get("layoutno"))
        print(tbls)

        return {
            "title": title if title else filename,
            "authors": " ".join(authors),
            "abstract": abstr,
            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                      for b in self.boxes[i:]
                      if re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }
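
# Each entry of the returned "lines" is (text + positional tag, layout label).
# The tag appended by _line_tag() is what remove_tag() strips and crop() uses
# in chunk() below to cut the matching image region out of the page.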


def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        paper = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else:
        raise NotImplementedError("file type not supported yet (only pdf is supported)")

    doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
    eng = pdf_parser.is_english  # is the document English?
    print("It's English.....", eng)
    res = []

    # Add tables: one chunk per batch of rows, with the table image attached.
    for img, rows in paper["tables"]:
        bs = 10  # rows per chunk
        de = ";" if eng else "；"
        for i in range(0, len(rows), bs):
            d = copy.deepcopy(doc)
            r = de.join(rows[i:i + bs])
            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
            tokenize(d, r, eng)
            d["image"] = img
            res.append(d)

    # Add the abstract as its own chunk, boosted with summary-related keywords.
    if paper["abstract"]:
        d = copy.deepcopy(doc)
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
        d["image"] = pdf_parser.crop(paper["abstract"])
        tokenize(d, txt, eng)
        res.append(d)

    readed = [False] * len(paper["lines"])
    # First pass: a line ending with a colon introduces an item list; emit one
    # chunk per item, prefixed with the last sentence before the colon.
    i = 0
    while i + 1 < len(paper["lines"]):
        txt = pdf_parser.remove_tag(paper["lines"][i][0])
        j = i
        if not txt.strip("\n").strip() or txt.strip("\n").strip()[-1] not in ":：":
            i += 1
            continue
        i += 1
        while i < len(paper["lines"]) and not paper["lines"][i][0]:
            i += 1
        if i >= len(paper["lines"]):
            break
        proj = [paper["lines"][i][0].strip()]
        i += 1
        # Items belong to the list while they start with the same character
        # (e.g. the same bullet) as the previous item.
        while i < len(paper["lines"]) and paper["lines"][i][0].strip() \
                and paper["lines"][i][0].strip()[0] == proj[-1][0]:
            proj.append(paper["lines"][i][0].strip())
            i += 1
        for k in range(j, i):
            readed[k] = True
        # Keep only the last sentence before the colon (search the reversed
        # string for the first sentence-ending punctuation mark).
        txt = txt[::-1]
        if eng:
            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        else:
            r = re.search(r"(.*?) ([。？；！]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        for p in proj:
            d = copy.deepcopy(doc)
            txt += "\n" + pdf_parser.remove_tag(p)
            d["image"] = pdf_parser.crop(p)
            tokenize(d, txt, eng)
            res.append(d)

    # Second pass: pack the remaining lines into chunks of roughly 128 tokens.
    i = 0
    chunk = []
    tk_cnt = 0

    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
        ck = "\n".join(chunk)
        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
        d["image"] = pdf_parser.crop(ck)
        res.append(d)
        chunk = []
        tk_cnt = 0
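
    # Chunking policy below: a "title" line always starts a new chunk, and a
    # chunk is flushed once adding a line would push it past ~128 tokens while
    # it already holds more than 32.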
    while i < len(paper["lines"]):
        if tk_cnt > 128:
            add_chunk()
        if readed[i]:
            i += 1
            continue
        readed[i] = True
        txt, layouts = paper["lines"][i]
        txt_ = pdf_parser.remove_tag(txt)
        i += 1
        cnt = num_tokens_from_string(txt_)
        if any([
            layouts.find("title") >= 0 and chunk,
            cnt + tk_cnt > 128 and tk_cnt > 32,
        ]):
            add_chunk()
            chunk = [txt]
            tk_cnt = cnt
        else:
            chunk.append(txt)
            tk_cnt += cnt
    if chunk:
        add_chunk()

    for i, d in enumerate(res):
        print(d)
        # d["image"].save(f"./logs/{i}.jpg")
    return res


if __name__ == "__main__":
    import sys

    def dummy(a, b):
        pass

    chunk(sys.argv[1], callback=dummy)
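
# A fuller smoke test than the dummy callback above is sketched here as a
# comment; `progress` is a hypothetical callback that just prints the
# (fraction, message) pairs Pdf.__call__ reports:
#
#     def progress(prog, msg):
#         print("[{:.0%}] {}".format(prog, msg))
#
#     for ck in chunk(sys.argv[1], callback=progress):
#         print(ck)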