
paper.py 9.6KB

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from collections import Counter

from api.db import ParserType
from rag.nlp import huqie, tokenize, tokenize_table
from deepdoc.parser import PdfParser
import numpy as np
from rag.utils import num_tokens_from_string

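
# Paper-specific PDF parser: reuses PdfParser's OCR, layout recognition and
# table detection, drops text that repeats on most pages (headers/footers and
# duplicates), and returns the paper's title, authors, abstract, remaining
# text lines and tables.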
class Pdf(PdfParser):
    def __init__(self):
        self.model_speciess = ParserType.PAPER.value
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback(0.2, "OCR finished.")

        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.47, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, False)

        # clean mess
        if column_width < self.page_images[0].size[0] / zoomin / 2:
            print("two_column...................", column_width,
                  self.page_images[0].size[0] / zoomin / 2)
            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
        freq = Counter([b["text"] for b in self.boxes])
        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["text"] in garbage \
                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
                self.boxes.pop(i)
            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
                # merge within same layouts
                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
                self.boxes.pop(i)
            else:
                i += 1
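
        # A line "begins" a standard paper section (introduction, abstract,
        # keywords, contents, ...); used below to reject such lines as the
        # title and to stop collecting author lines.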
        def _begin(txt):
            return re.match(
                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
                txt.lower().strip())

        if from_page > 0:
            return {
                "title": "",
                "authors": "",
                "abstract": "",
                "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                          for b in self.boxes[i:]
                          if re.match(r"(text|title)", b.get("layoutno", "text"))],
                "tables": tbls
            }

        # get title and authors
        title = ""
        authors = []
        i = 0
        while i < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            if b.get("layoutno", "").find("title") >= 0:
                title = b["text"]
                if _begin(title):
                    title = ""
                    break
                for j in range(3):
                    if _begin(self.boxes[i + j]["text"]):
                        break
                    authors.append(self.boxes[i + j]["text"])
                break
            break

        # get abstract
        abstr = ""
        i = 0
        while i + 1 < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            txt = b["text"].lower().strip()
            if re.match("(abstract|摘要)", txt):
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(b, zoomin)
                    i += 1
                    break
                txt = self.boxes[i + 1]["text"].lower().strip()
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
                    i += 1
                    break
        if not abstr:
            i = 0

        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
        for b in self.boxes:
            print(b["text"], b.get("layoutno"))
        print(tbls)

        return {
            "title": title if title else filename,
            "authors": " ".join(authors),
            "abstract": abstr,
            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                      for b in self.boxes[i:]
                      if re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Only pdf is supported.
    The abstract of the paper will be sliced as an entire chunk, and will not be sliced partly.
    """
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        paper = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else:
        raise NotImplementedError("file type not supported yet (pdf supported)")

    doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
    # is it English
    eng = lang.lower() == "english"  # pdf_parser.is_english
    print("It's English.....", eng)
    res = tokenize_table(paper["tables"], doc, eng)

    if paper["abstract"]:
        d = copy.deepcopy(doc)
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
        d["image"] = pdf_parser.crop(paper["abstract"])
        tokenize(d, txt, eng)
        res.append(d)
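
    # Lines ending with a colon are treated as a lead-in for the short lines
    # that follow (lines starting with the same character, e.g. bullet items);
    # each such line is tokenized together with its lead-in so the context is
    # kept, and all of them are marked as read for the generic chunking below.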
    readed = [0] * len(paper["lines"])
    # find colon firstly
    i = 0
    while i + 1 < len(paper["lines"]):
        txt = pdf_parser.remove_tag(paper["lines"][i][0])
        j = i
        if txt.strip("\n").strip()[-1] not in "::":
            i += 1
            continue
        i += 1
        while i < len(paper["lines"]) and not paper["lines"][i][0]:
            i += 1
        if i >= len(paper["lines"]):
            break
        proj = [paper["lines"][i][0].strip()]
        i += 1
        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
            proj.append(paper["lines"][i][0].strip())
            i += 1
        for k in range(j, i):
            readed[k] = True
        txt = txt[::-1]
        if eng:
            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        else:
            r = re.search(r"(.*?) ([。?;!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        for p in proj:
            d = copy.deepcopy(doc)
            txt += "\n" + pdf_parser.remove_tag(p)
            d["image"] = pdf_parser.crop(p)
            tokenize(d, txt, eng)
            res.append(d)

    i = 0
    chunk = []
    tk_cnt = 0
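
    # Flush the accumulated lines into one chunk record. The loop below packs
    # consecutive text lines until roughly 128 tokens are collected, and starts
    # a new chunk whenever a title line is reached.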
    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
        ck = "\n".join(chunk)
        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
        d["image"] = pdf_parser.crop(ck)
        res.append(d)
        chunk = []
        tk_cnt = 0

    while i < len(paper["lines"]):
        if tk_cnt > 128:
            add_chunk()
        if readed[i]:
            i += 1
            continue
        readed[i] = True
        txt, layouts = paper["lines"][i]
        txt_ = pdf_parser.remove_tag(txt)
        i += 1
        cnt = num_tokens_from_string(txt_)
        if any([
            layouts.find("title") >= 0 and chunk,
            cnt + tk_cnt > 128 and tk_cnt > 32,
        ]):
            add_chunk()
            chunk = [txt]
            tk_cnt = cnt
        else:
            chunk.append(txt)
            tk_cnt += cnt

    if chunk:
        add_chunk()
    for i, d in enumerate(res):
        print(d)
        # d["image"].save(f"./logs/{i}.jpg")
    return res


if __name__ == "__main__":
    import sys

    def dummy(a, b):
        pass

    chunk(sys.argv[1], callback=dummy)