
paper.py

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from collections import Counter

import numpy as np

from api.db import ParserType
from deepdoc.parser import PdfParser
from rag.nlp import huqie, tokenize, tokenize_table, add_positions
from rag.utils import num_tokens_from_string


class Pdf(PdfParser):
    def __init__(self):
        self.model_speciess = ParserType.PAPER.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")
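
        # Layout analysis, table detection, and text-box merging on the OCR output.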
        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.47, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False, True)

        # clean mess
        if column_width < self.page_images[0].size[0] / zoomin / 2:
            print("two_column...................", column_width,
                  self.page_images[0].size[0] / zoomin / 2)
            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
        freq = Counter([b["text"] for b in self.boxes])
        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
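        # Drop noise boxes (text repeated on most pages, bare alphanumerics with
        # no layout tag, consecutive duplicates) and merge neighbouring boxes
        # that share a layout number.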
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["text"] in garbage \
                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
                self.boxes.pop(i)
            elif i + 1 < len(self.boxes) and \
                    self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
                # merge within same layouts
                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
                self.boxes.pop(i)
            else:
                i += 1
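
        # Heuristic: does a line open a front-matter section
        # (abstract, introduction, keywords, contents, ...)?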
        def _begin(txt):
            return re.match(
                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
                txt.lower().strip())

        if from_page > 0:
            return {
                "title": "",
                "authors": "",
                "abstract": "",
                "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                          for b in self.boxes[i:]
                          if re.match(r"(text|title)", b.get("layoutno", "text"))],
                "tables": tbls
            }

        # get title and authors
        title = ""
        authors = []
        i = 0
        while i < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            if b.get("layoutno", "").find("title") >= 0:
                title = b["text"]
                if _begin(title):
                    title = ""
                    break
                for j in range(3):
                    if _begin(self.boxes[i + j]["text"]):
                        break
                    authors.append(self.boxes[i + j]["text"])
                    break
                break

        # get abstract
        abstr = ""
        i = 0
        while i + 1 < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            txt = b["text"].lower().strip()
            if re.match("(abstract|摘要)", txt):
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(b, zoomin)
                    i += 1
                    break
                txt = self.boxes[i + 1]["text"].lower().strip()
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
                    i += 1
                break
        if not abstr:
            i = 0
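
        # Boxes before index i (the front matter handled above) are excluded
        # from the returned "lines".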
        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
        for b in self.boxes:
            print(b["text"], b.get("layoutno"))
        print(tbls)

        return {
            "title": title if title else filename,
            "authors": " ".join(authors),
            "abstract": abstr,
            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                      for b in self.boxes[i:]
                      if re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Only pdf is supported.
    The abstract of the paper is kept as a single chunk and is never split.
    """
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        paper = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else:
        raise NotImplementedError("file type not supported yet(pdf supported)")

    doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])

    # is it English
    eng = lang.lower() == "english"  # or: pdf_parser.is_english
    print("It's English.....", eng)
    res = tokenize_table(paper["tables"], doc, eng)

    if paper["abstract"]:
        d = copy.deepcopy(doc)
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
        d["image"], poss = pdf_parser.crop(paper["abstract"], need_position=True)
        add_positions(d, poss)
        tokenize(d, txt, eng)
        res.append(d)

    readed = [0] * len(paper["lines"])
    # Find lines ending with a colon first: treat the following, similarly
    # prefixed lines as items of that list and emit one chunk per item.
    i = 0
    while i + 1 < len(paper["lines"]):
        txt = pdf_parser.remove_tag(paper["lines"][i][0])
        j = i
        if txt.strip("\n").strip()[-1] not in ":：":
            i += 1
            continue
        i += 1
        while i < len(paper["lines"]) and not paper["lines"][i][0]:
            i += 1
        if i >= len(paper["lines"]):
            break
        proj = [paper["lines"][i][0].strip()]
        i += 1
        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
            # append the text only (not the (text, layout) tuple), so the
            # prefix comparison above and remove_tag/crop below get strings
            proj.append(paper["lines"][i][0].strip())
            i += 1
        for k in range(j, i):
            readed[k] = True
        # trim the lead-in text back to its last sentence boundary
        txt = txt[::-1]
        if eng:
            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        else:
            r = re.search(r"(.*?) ([。？；！]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        for p in proj:
            d = copy.deepcopy(doc)
            txt += "\n" + pdf_parser.remove_tag(p)
            d["image"], poss = pdf_parser.crop(p, need_position=True)
            add_positions(d, poss)
            tokenize(d, txt, eng)
            res.append(d)

    i = 0
    chunk = []
    tk_cnt = 0

    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
        ck = "\n".join(chunk)
        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
        d["image"], poss = pdf_parser.crop(ck, need_position=True)
        add_positions(d, poss)
        res.append(d)
        chunk = []
        tk_cnt = 0
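
    # Pack the remaining lines into chunks of roughly 128 tokens, starting a
    # fresh chunk at a title box or when the token budget is exceeded.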
    while i < len(paper["lines"]):
        if tk_cnt > 128:
            add_chunk()
        if readed[i]:
            i += 1
            continue
        readed[i] = True
        txt, layouts = paper["lines"][i]
        txt_ = pdf_parser.remove_tag(txt)
        i += 1
        cnt = num_tokens_from_string(txt_)
        if any([
            layouts.find("title") >= 0 and chunk,
            cnt + tk_cnt > 128 and tk_cnt > 32,
        ]):
            add_chunk()
            chunk = [txt]
            tk_cnt = cnt
        else:
            chunk.append(txt)
            tk_cnt += cnt
    if chunk:
        add_chunk()

    for i, d in enumerate(res):
        print(d)
        # d["image"].save(f"./logs/{i}.jpg")
    return res


if __name__ == "__main__":
    import sys

    # accept both the keyword-only msg call and positional progress updates
    def dummy(prog=None, msg=""):
        pass
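
    # e.g. python paper.py path/to/paper.pdf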
    chunk(sys.argv[1], callback=dummy)