paper.py

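# Parse an academic paper (PDF) into title, authors, abstract, body lines and
# tables, then split the result into retrieval chunks for indexing.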
import copy
import re
from collections import Counter
from rag.app import callback__, bullets_category, BULLET_PATTERN, is_english, tokenize
from rag.nlp import huqie, stemmer
from rag.parser.docx_parser import HuDocxParser
from rag.parser.pdf_parser import HuParser
from nltk.tokenize import word_tokenize
import numpy as np
from rag.utils import num_tokens_from_string


class Pdf(HuParser):
    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page)
        callback__(0.2, "OCR finished.", callback)

        from timeit import default_timer as timer
        start = timer()
        self._layouts_paddle(zoomin)
        callback__(0.47, "Layout analysis finished", callback)
        print("paddle layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback__(0.68, "Table analysis finished", callback)
        self._text_merge()
        column_width = np.median([b["x1"] - b["x0"] for b in self.boxes])
        self._concat_downward(concat_between_pages=False)
        self._filter_forpages()
        callback__(0.75, "Text merging finished.", callback)
        tbls = self._extract_table_figure(True, zoomin, False)

        # clean mess
        if column_width < self.page_images[0].size[0] / zoomin / 2:
            print("two_column...................", column_width,
                  self.page_images[0].size[0] / zoomin / 2)
            self.boxes = self.sort_X_by_page(self.boxes, column_width / 2)
        for b in self.boxes:
            b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
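
        # Drop recurring headers/footers (text repeated on more than 60% of
        # pages), bare alphanumeric fragments with no layout label (typically
        # page numbers), and consecutive duplicates; then merge adjacent boxes
        # that belong to the same layout element.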
        freq = Counter([b["text"] for b in self.boxes])
        garbage = set([k for k, v in freq.items() if v > self.total_page * 0.6])
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["text"] in garbage \
                    or (re.match(r"[a-zA-Z0-9]+$", self.boxes[i]["text"]) and not self.boxes[i].get("layoutno")) \
                    or (i + 1 < len(self.boxes) and self.boxes[i]["text"] == self.boxes[i + 1]["text"]):
                self.boxes.pop(i)
            elif i + 1 < len(self.boxes) \
                    and self.boxes[i].get("layoutno", '0') == self.boxes[i + 1].get("layoutno", '1'):
                # merge within same layouts
                self.boxes[i + 1]["top"] = self.boxes[i]["top"]
                self.boxes[i + 1]["x0"] = min(self.boxes[i]["x0"], self.boxes[i + 1]["x0"])
                self.boxes[i + 1]["x1"] = max(self.boxes[i]["x1"], self.boxes[i + 1]["x1"])
                self.boxes[i + 1]["text"] = self.boxes[i]["text"] + " " + self.boxes[i + 1]["text"]
                self.boxes.pop(i)
            else:
                i += 1
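
        # _begin(): does a line open a front-matter section such as
        # "Introduction", "Abstract", "Keywords" or a Chinese equivalent?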
        def _begin(txt):
            return re.match(
                "[0-9. 一、i]*(introduction|abstract|摘要|引言|keywords|key words|关键词|background|背景|目录|前言|contents)",
                txt.lower().strip())

        # get title and authors
        title = ""
        authors = []
        i = 0
        while i < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            if b.get("layoutno", "").find("title") >= 0:
                title = b["text"]
                if _begin(title):
                    title = ""
                    break
                # the few lines right below the title are taken as authors,
                # stopping as soon as a section heading begins
                for j in range(min(3, len(self.boxes) - i)):
                    if _begin(self.boxes[i + j]["text"]):
                        break
                    authors.append(self.boxes[i + j]["text"])
                break

        # get abstract
        abstr = ""
        i = 0
        while i + 1 < min(32, len(self.boxes)):
            b = self.boxes[i]
            i += 1
            txt = b["text"].lower().strip()
            if re.match("(abstract|摘要)", txt):
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    # the heading line already carries the abstract text
                    abstr = txt + self._line_tag(b, zoomin)
                    i += 1
                    break
                txt = self.boxes[i + 1]["text"].lower().strip()
                if len(txt.split(" ")) > 32 or len(txt) > 64:
                    abstr = txt + self._line_tag(self.boxes[i + 1], zoomin)
                    i += 1
                    break
        if not abstr:
            i = 0
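
        # `i` now marks where the body text starts (just past the abstract,
        # or 0 if none was found); the returned `lines` begin there.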
        callback__(0.8, "Page {}~{}: Text merging finished".format(from_page, min(to_page, self.total_page)), callback)
        for b in self.boxes:
            print(b["text"], b.get("layoutno"))
        print(tbls)

        return {
            "title": title if title else filename,
            "authors": " ".join(authors),
            "abstract": abstr,
            "lines": [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                      for b in self.boxes[i:]
                      if re.match(r"(text|title)", b.get("layoutno", "text"))],
            "tables": tbls
        }
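

# Entry point: parse `filename` (or the in-memory `binary`) and return a list
# of chunk dicts ready for indexing. Only PDF input is handled for now.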
def chunk(filename, binary=None, from_page=0, to_page=100000, callback=None):
    pdf_parser = None
    paper = {}
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
        paper = pdf_parser(filename if not binary else binary,
                           from_page=from_page, to_page=to_page, callback=callback)
    else:
        raise NotImplementedError("file type not supported yet (only PDF is supported)")

    doc = {
        "docnm_kwd": paper["title"] if paper["title"] else filename,
        "authors_tks": paper["authors"]
    }
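    # huqie.qie: word segmentation; huqie.qieqie: fine-grained re-segmentation
    # of those tokens for partial matching (naming follows the huqie tokenizer).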
    doc["title_tks"] = huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
    doc["authors_sm_tks"] = huqie.qieqie(doc["authors_tks"])

    # is it English
    eng = pdf_parser.is_english
    print("It's English.....", eng)

    res = []
    # add tables
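    # Tables are chunked in batches of bs rows; rows are joined with ";" for
    # English text and "；" otherwise, and trailing source-attribution tails
    # ("——来自…" / "— in …") are stripped.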
    for img, rows in paper["tables"]:
        bs = 10
        de = ";" if eng else "；"
        for i in range(0, len(rows), bs):
            d = copy.deepcopy(doc)
            r = de.join(rows[i:i + bs])
            r = re.sub(r"\t——(来自| in ).*”%s" % de, "", r)
            tokenize(d, r, eng)
            d["image"] = img
            res.append(d)

    if paper["abstract"]:
        d = copy.deepcopy(doc)
        txt = pdf_parser.remove_tag(paper["abstract"])
        d["important_kwd"] = ["abstract", "总结", "概括", "summary", "summarize"]
        d["important_tks"] = " ".join(d["important_kwd"])
        d["image"] = pdf_parser.crop(paper["abstract"])
        tokenize(d, txt, eng)
        res.append(d)
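
    # First pass over the body: a line ending with a colon introduces an
    # enumeration; each following item is paired with the introducing sentence
    # so that every item becomes a self-contained chunk.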
    readed = [0] * len(paper["lines"])
    # find colon first
    i = 0
    while i + 1 < len(paper["lines"]):
        txt = pdf_parser.remove_tag(paper["lines"][i][0])
        j = i
        tail = txt.strip("\n").strip()
        if not tail or tail[-1] not in ":：":  # guard against empty lines
            i += 1
            continue
        i += 1
        while i < len(paper["lines"]) and not paper["lines"][i][0]:
            i += 1
        if i >= len(paper["lines"]):
            break
        proj = [paper["lines"][i][0].strip()]
        i += 1
        while i < len(paper["lines"]) and paper["lines"][i][0].strip()[0] == proj[-1][0]:
            proj.append(paper["lines"][i][0].strip())
            i += 1
        for k in range(j, i):
            readed[k] = True
        # keep only the last sentence of the introducing line
        txt = txt[::-1]
        if eng:
            r = re.search(r"(.*?) ([\.;?!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        else:
            r = re.search(r"(.*?) ([。?;!]|$)", txt)
            txt = r.group(1)[::-1] if r else txt[::-1]
        for p in proj:
            d = copy.deepcopy(doc)
            txt += "\n" + pdf_parser.remove_tag(p)
            d["image"] = pdf_parser.crop(p)
            tokenize(d, txt, eng)
            res.append(d)
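
    # Second pass: accumulate the remaining lines into roughly 128-token
    # chunks, starting a new one at a title line or when the budget is spent.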
    i = 0
    chunk = []
    tk_cnt = 0

    def add_chunk():
        nonlocal chunk, res, doc, pdf_parser, tk_cnt
        d = copy.deepcopy(doc)
        ck = "\n".join(chunk)
        tokenize(d, pdf_parser.remove_tag(ck), pdf_parser.is_english)
        d["image"] = pdf_parser.crop(ck)
        res.append(d)
        chunk = []
        tk_cnt = 0

    while i < len(paper["lines"]):
        if tk_cnt > 128:
            add_chunk()
        if readed[i]:
            i += 1
            continue
        readed[i] = True
        txt, layouts = paper["lines"][i]
        txt_ = pdf_parser.remove_tag(txt)
        i += 1
        cnt = num_tokens_from_string(txt_)
        if any([
            layouts.find("title") >= 0 and chunk,
            cnt + tk_cnt > 128 and tk_cnt > 32,
        ]):
            add_chunk()
            chunk = [txt]
            tk_cnt = cnt
        else:
            chunk.append(txt)
            tk_cnt += cnt
    if chunk:
        add_chunk()

    for i, d in enumerate(res):
        print(d)
        # d["image"].save(f"./logs/{i}.jpg")
    return res


if __name__ == "__main__":
    import sys

    chunk(sys.argv[1])
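    # usage: python paper.py <path-to-pdf>
    # A progress hook can also be passed (assuming the hook receives the
    # (progress, message) pair that callback__ forwards), e.g.:
    #   chunk(sys.argv[1], callback=lambda prog, msg: print(prog, msg))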