paper.py

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from collections import Counter

import numpy as np

from api.db import ParserType
from deepdoc.parser import PdfParser, tokenize
from rag.nlp import huqie
from rag.utils import num_tokens_from_string
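

# Pdf runs the paper-specific PDF pipeline on top of PdfParser: OCR the pages,
# recognize layouts and tables, merge and de-duplicate text boxes, then pick out
# the title, authors and abstract before handing the remaining lines and tables
# to chunk() below.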
class Pdf(PdfParser):
    def __init__(self):
        self.model_speciess = ParserType.PAPER.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.47, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False)

        # clean mess
        if column_width < self.page_images[0].size[0] / zoomin / 2:
            print("two_column...................", column_width,
                  self.page_images[0].size[0] / zoomin / 2)
            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
        freq = Counter([b["text"] for b in self.boxes])
        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["text"] in garbage \
                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
                self.boxes.pop(i)
            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == \
                    self.boxes[i + 1].get("layoutno", '1'):
                # merge within same layouts
                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
                self.boxes.pop(i)
            else:
                i += 1
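
        # _begin() flags front-matter headings (abstract, introduction, keywords,
        # contents, ...) so the title/author scan below knows where to stop.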
        def _begin(txt):
            return re.match(
                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
                txt.lower().strip())

        if from_page > 0:
            return {
                "title": "",
                "authors": "",
                "abstract": "",
                "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                          for b in self.boxes
                          if re.match(r"(text|title)", b.get("layoutno", "text"))],
                "tables": tbls
            }

        # get title and authors
        title = ""
        authors = []
        i = 0
        while i < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            if b.get("layoutno", "").find("title") >= 0:
                title = b["text"]
                if _begin(title):
                    title = ""
                    break
                for j in range(3):
                    if _begin(self.boxes[i + j]["text"]):
                        break
                    authors.append(self.boxes[i + j]["text"])
                break
            break

        # get abstract
        abstr = ""
        i = 0
        while i + 1 < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            txt = b["text"].lower().strip()
            if re.match("(abstract|摘要)", txt):
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(b, zoomin)
                    i += 1
                    break
                txt = self.boxes[i + 1]["text"].lower().strip()
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
                i += 1
                break
        if not abstr:
            i = 0

        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
        for b in self.boxes:
            print(b["text"], b.get("layoutno"))
        print(tbls)

        return {
            "title": title if title else filename,
            "authors": " ".join(authors),
            "abstract": abstr,
            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                      for b in self.boxes[i:]
                      if re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }
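

# chunk() turns the parsed paper into retrieval chunks: tables in batches of rows,
# the abstract as one whole chunk, colon-introduced lists grouped with their lead-in
# sentence, and the remaining lines packed into chunks of roughly 128 tokens.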
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
    """
    Only PDF is supported.
    The abstract of the paper is kept as a single chunk and is never split.
    """
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        paper = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else:
        raise NotImplementedError("file type not supported yet (only PDF is supported)")

    doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
    # is it English?
    eng = pdf_parser.is_english
    print("It's English.....", eng)

    res = []
    # add tables: every 10 rows become one chunk
    for img, rows in paper["tables"]:
        bs = 10
        de = ";" if eng else "；"
        for i in range(0, len(rows), bs):
            d = copy.deepcopy(doc)
            r = de.join(rows[i:i + bs])
            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
            tokenize(d, r)
            d["image"] = img
            res.append(d)

    # the abstract becomes one chunk with extra keywords attached
    if paper["abstract"]:
        d = copy.deepcopy(doc)
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
        d["image"] = pdf_parser.crop(paper["abstract"])
        tokenize(d, txt, eng)
        res.append(d)
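
    # Lines ending with a colon introduce a list; the pass below groups each such
    # list with its lead-in sentence and flags the consumed lines in `readed` so
    # the token-budget pass further down does not emit them twice.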
    readed = [0] * len(paper["lines"])
    # find lines ending with a colon first
    i = 0
    while i + 1 < len(paper["lines"]):
        txt = pdf_parser.remove_tag(paper["lines"][i][0])
        j = i
        if txt.strip("\n").strip()[-1] not in ":：":
            i += 1
            continue
        i += 1
        while i < len(paper["lines"]) and not paper["lines"][i][0]:
            i += 1
        if i >= len(paper["lines"]):
            break
        proj = [paper["lines"][i][0].strip()]
        i += 1
        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
            proj.append(paper["lines"][i][0].strip())  # keep only the text, matching the first entry
            i += 1
        for k in range(j, i):
            readed[k] = True
        # keep only the last sentence before the colon (search on the reversed string)
        txt = txt[::-1]
        if eng:
            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        else:
            r = re.search(r"(.*?) ([。？；！]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        for p in proj:
            d = copy.deepcopy(doc)
            txt += "\n" + pdf_parser.remove_tag(p)
            d["image"] = pdf_parser.crop(p)
            tokenize(d, txt)
            res.append(d)
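
    # Second pass: accumulate the remaining text/title lines into chunks of roughly
    # 128 tokens, starting a new chunk at title lines or when the budget is exceeded.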
    i = 0
    chunk = []
    tk_cnt = 0

    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
        ck = "\n".join(chunk)
        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
        d["image"] = pdf_parser.crop(ck)
        res.append(d)
        chunk = []
        tk_cnt = 0

    while i < len(paper["lines"]):
        if tk_cnt > 128:
            add_chunk()
        if readed[i]:
            i += 1
            continue
        readed[i] = True
        txt, layouts = paper["lines"][i]
        txt_ = pdf_parser.remove_tag(txt)
        i += 1
        cnt = num_tokens_from_string(txt_)
        if any([
            layouts.find("title") >= 0 and chunk,
            cnt + tk_cnt > 128 and tk_cnt > 32,
        ]):
            add_chunk()
            chunk = [txt]
            tk_cnt = cnt
        else:
            chunk.append(txt)
            tk_cnt += cnt
    if chunk:
        add_chunk()

    for i, d in enumerate(res):
        print(d)
        # d["image"].save(f"./logs/{i}.jpg")
    return res
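

# The __main__ block below uses a no-op callback; in practice `callback` is any
# callable taking a progress fraction and a message, e.g. (hypothetical caller):
#
#     def progress(prog, msg):
#         print(f"{prog:.0%} {msg}")
#
#     chunks = chunk("some_paper.pdf", callback=progress)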
if __name__ == "__main__":
    import sys

    def dummy(a, b):
        pass

    chunk(sys.argv[1], callback=dummy)