paper.py
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import copy
import re
from collections import Counter

import numpy as np

from api.db import ParserType
from deepdoc.parser import PdfParser
from rag.nlp import huqie, tokenize, tokenize_table, add_positions
from rag.utils import num_tokens_from_string


class Pdf(PdfParser):
    def __init__(self):
        self.model_speciess = ParserType.PAPER.value
        super().__init__()
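
    # Parse one PDF (or a page range of it): OCR, layout analysis, table
    # extraction, and text cleanup. Returns a dict with "title", "authors",
    # "abstract", "lines" and "tables".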
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        callback(msg="OCR is running...")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished.")
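
        # Layout recognition, table detection, and box merging all run on the
        # page images produced by the OCR step above.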
        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis finished")
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback(0.75, "Text merging finished.")
        tbls = self._extract_table_figure(True, zoomin, True, True)
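
        # Clean mess: if the median text-box width is under half the page
        # width, the layout is assumed to be two-column and boxes are re-sorted
        # column by column; runs of whitespace inside each box are collapsed.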
        if column_width < self.page_images[0].size[0] / zoomin / 2:
            print("two_column...................", column_width,
                  self.page_images[0].size[0] / zoomin / 2)
            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())

        freq = Counter([b["text"] for b in self.boxes])
        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
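
        # Drop page furniture (text repeated on more than 60% of pages), bare
        # alphanumeric fragments with no layout label, and consecutive
        # duplicates; merge adjacent boxes that share a layout number.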
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["text"] in garbage \
                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
                self.boxes.pop(i)
            elif i + 1 < len(self.boxes) and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
                # merge within same layouts
                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
                self.boxes.pop(i)
            else:
                i += 1

        def _begin(txt):
            # Does this line look like the start of a standard section?
            return re.match(
                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
                txt.lower().strip())

        if from_page > 0:
            # Pages after the first batch carry no title/authors/abstract.
            return {
                "title": "",
                "authors": "",
                "abstract": "",
                "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes if
                          re.match(r"(text|title)", b.get("layoutno", "text"))],
                "tables": tbls
            }
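
        # Scan the first 32 boxes for a layout-tagged title; the box right
        # after the title is taken as the author line, unless it already looks
        # like a section heading.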
        # get title and authors
        title = ""
        authors = []
        i = 0
        while i < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            if b.get("layoutno", "").find("title") >= 0:
                title = b["text"]
                if _begin(title):
                    title = ""
                    break
                for j in range(3):
                    if _begin(self.boxes[i + j]["text"]):
                        break
                    authors.append(self.boxes[i + j]["text"])
                    break
                break
        # get abstract
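        # A box starting with "abstract"/"摘要" that is already long (over 32
        # words or 64 characters) is taken as the abstract itself; otherwise
        # the box following the heading is tried. If nothing matches, i is
        # reset so that no body lines are skipped.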
        abstr = ""
        i = 0
        while i + 1 < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            txt = b["text"].lower().strip()
            if re.match("(abstract|摘要)", txt):
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(b, zoomin)
                    i += 1
                    break
                txt = self.boxes[i + 1]["text"].lower().strip()
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
                    i += 1
                    break
        if not abstr:
            i = 0

        callback(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)))
        for b in self.boxes:
            print(b["text"], b.get("layoutno"))
        print(tbls)

        return {
            "title": title if title else filename,
            "authors": " ".join(authors),
            "abstract": abstr,
            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes[i:] if
                      re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
        Only PDF is supported.
        The abstract of the paper is kept as one whole chunk and is never split.
    """
    pdf_parser = None
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        paper = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else:
        raise NotImplementedError("file type not supported yet(pdf supported)")

    doc = {"docnm_kwd": filename, "authors_tks": paper["authors"],
           "title_tks": huqie.qie(paper["title"] if paper["title"] else filename)}
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])
    # is it English
    eng = lang.lower() == "english"  # pdf_parser.is_english
    print("It's English.....", eng)

    res = tokenize_table(paper["tables"], doc, eng)

    if paper["abstract"]:
        d = copy.deepcopy(doc)
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
        d["image"], poss = pdf_parser.crop(paper["abstract"], need_position=True)
        add_positions(d, poss)
        tokenize(d, txt, eng)
        res.append(d)
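
    # First pass: a line ending with a colon introduces an enumeration
    # ("projection"), e.g. a list of contributions. Each enumerated item
    # becomes its own chunk, prefixed with the sentence that introduced it,
    # and every line consumed here is marked as read.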
    readed = [0] * len(paper["lines"])
    # find colon firstly
    i = 0
    while i + 1 < len(paper["lines"]):
        txt = pdf_parser.remove_tag(paper["lines"][i][0])
        j = i
        tail = txt.strip("\n").strip()
        if not tail or tail[-1] not in ":：":
            i += 1
            continue
        i += 1
        while i < len(paper["lines"]) and not paper["lines"][i][0]:
            i += 1
        if i >= len(paper["lines"]):
            break
        proj = [paper["lines"][i][0].strip()]
        i += 1
        # Items that start with the same character as the first one (the same
        # bullet marker or numbering style) belong to the same enumeration.
        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
            proj.append(paper["lines"][i][0].strip())
            i += 1
        for k in range(j, i):
            readed[k] = True
        # Keep only the last sentence of the introducing line: reverse the
        # text, match up to the first sentence boundary, then reverse back.
        txt = txt[::-1]
        if eng:
            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        else:
            r = re.search(r"(.*?)([。?;!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        for p in proj:
            d = copy.deepcopy(doc)
            txt += "\n" + pdf_parser.remove_tag(p)
            d["image"], poss = pdf_parser.crop(p, need_position=True)
            add_positions(d, poss)
            tokenize(d, txt, eng)
            res.append(d)

    i = 0
    chunk = []
    tk_cnt = 0

    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
        ck = "\n".join(chunk)
        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
        d["image"], poss = pdf_parser.crop(ck, need_position=True)
        add_positions(d, poss)
        res.append(d)
        chunk = []
        tk_cnt = 0
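
    # Second pass: accumulate the remaining lines into chunks of roughly 128
    # tokens. A title line always starts a new chunk, and a long line is only
    # split off once the current chunk already holds more than 32 tokens.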
    while i < len(paper["lines"]):
        if tk_cnt > 128:
            add_chunk()
        if readed[i]:
            i += 1
            continue
        readed[i] = True
        txt, layouts = paper["lines"][i]
        txt_ = pdf_parser.remove_tag(txt)
        i += 1
        cnt = num_tokens_from_string(txt_)
        if any([
            layouts.find("title") >= 0 and chunk,
            cnt + tk_cnt > 128 and tk_cnt > 32,
        ]):
            add_chunk()
            chunk = [txt]
            tk_cnt = cnt
        else:
            chunk.append(txt)
            tk_cnt += cnt

    if chunk:
        add_chunk()

    for i, d in enumerate(res):
        print(d)
        # d["image"].save(f"./logs/{i}.jpg")
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], callback=dummy)
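
# A minimal usage sketch (the file name below is hypothetical): parse the
# first pages of an English paper with a real progress callback. The callback
# must accept both callback(prog, msg) and callback(msg=...) calls, which is
# how Pdf.__call__ invokes it.
#
#     def progress(prog=None, msg=""):
#         print(prog, msg)
#
#     chunks = chunk("some_paper.pdf", from_page=0, to_page=8,
#                    lang="English", callback=progress)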