
pdf_parser.py

# -*- coding: utf-8 -*-
import random
import fitz
import xgboost as xgb
from io import BytesIO
import torch
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
import numpy as np
from PyPDF2 import PdfReader as pdf2_read
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie
from copy import deepcopy
from huggingface_hub import hf_hub_download

logging.getLogger("pdfminer").setLevel(logging.WARNING)


class HuParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                       filename="updown_concat_xgb.model"))
        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!

        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com

        For Windows:
        Good luck
        ^_-
        """
    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // len(c["text"])

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\((][零一二三四五六七八九十百]+[)\)]",
            r"[\((][0-9]+[)\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
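
    # Feature vector consumed by the XGBoost "updown" model: given two
    # vertically adjacent boxes, emit layout/punctuation/geometry signals
    # that predict whether they belong to the same logical paragraph.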
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1
            else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0,
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # bubble up boxes whose x0 is within the threshold but
                # whose top is smaller, restoring top-down reading order
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True
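
    # Table detection: crop every "table" layout region (with a margin),
    # run TableStructureRecognizer over the crops, map the detected
    # rows/headers/columns/spans back to page coordinates, and tag the
    # text boxes that fall inside them (keys R, H, C, SP).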
    def _table_transformer_job(self, ZM):
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))
        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]
            ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii
            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]
            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )
        # merge chars in the same rect
        for c in Recognizer.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

    def _text_merge(self):
        # merge adjusted boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") \
                    or b.get("layout_type", "") in ["table", "figure", "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue
            # NOTE: everything below is unreachable (the `continue` above
            # always fires); it is a disabled distance-based merge.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
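
    # Vertically merge boxes that read as one paragraph, using simple
    # punctuation and spacing heuristics (no model involved).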
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("%s %s %s %s %s", b["text"], b_["text"],
                              any(feats), any(concatting_feats), any(detach_feats))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
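
    # Model-driven downward concatenation: group boxes into blocks with a
    # DFS, asking the XGBoost model (via _updown_concat_features) whether
    # each candidate pair should be joined, then flatten each block into
    # a single box.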
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
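
    # Drop table-of-contents / acknowledgement material: find a ToC
    # heading, pop the entries that share its prefix; failing that, drop
    # pages dominated by dot-leader lines.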
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1
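
    # Join consecutive boxes that start with the same bullet character
    # into a single multi-line box.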
    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or huqie.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
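
    # Pull table/figure boxes out of self.boxes, merge tables that run
    # across pages, attach the nearest caption to each, then crop the
    # regions out of the page images (and build HTML for tables).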
    def _extract_table_figure(self, need_image, ZM, return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) \
                    or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                        "figure caption", "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables that continue across pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug("TABLE: " + self.boxes[i]["text"] + "; Cap: " + tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug("FIGURE: " + self.boxes[i]["text"] + "; Cap: " + fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning("Missing layout match: %d, %s" %
                                    (pn + 1, bxs[0].get("layoutno", "")))

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figure out and add caption
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[::??]$", 9),
            (r"[0-9]+)", 10),
            (r"[\((][0-9]+[)\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
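
    # Position tags: each line gets an "@@pages\tx0\tx1\ttop\tbottom##"
    # suffix (page numbers joined with "-") so the text can later be
    # traced back to, and cropped from, its page region; crop() and
    # remove_tag() below consume these tags.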
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def useful(b):
            if b.get("layout_type"):
                return True
            if width(b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(boxes[0]["text"]) \
                or boxes[0].get("layout_type", "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(line["text"]) \
                    or line.get("layout_type", "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not useful(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if useful(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception:
            pdf = fitz.open(fnm) if not binary else fitz.open(stream=binary, filetype="pdf")
            return len(pdf)

    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception:
            self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
            self.page_images = []
            self.page_chars = []
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for i, page in enumerate(self.pdf):
                if i < page_from:
                    continue
                if i >= page_to:
                    break
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
                self.page_images.append(img)
                self.page_chars.append([])

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")

        logging.info("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1
            # if i > 0:
            #     if not chars:
            #         self.page_cum_height.append(img.size[1] / zoomin)
            #     else:
            #         self.page_cum_height.append(
            #             np.max([c["bottom"] for c in chars]))
            self.__ocr(i + 1, img, chars, zoomin)
            if callback:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")

        if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.info("Is it English: %s", bool(self.is_english))

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
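
    # End-to-end pipeline: render pages + OCR, recognize layouts, tag
    # table components, merge text boxes (heuristics, then model-driven),
    # drop ToC pages, split out tables/figures, and filter the scraps.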
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
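
    # Inverse of _line_tag: remove_tag() strips the position markers from
    # text; crop() parses them and stitches the referenced page regions
    # (plus dimmed context strips above and below) into one image.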
    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)

    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = np.max([right - left for (_, left, right, _, _) in poss])
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM,
                                               min(bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP
        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss


class PlainParser(object):
    def __call__(self, filename, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename))
            outlines = self.pdf.outline
            for page in self.pdf.pages:
                lines.extend([t for t in page.extract_text().split("\n")])

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")
        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError


if __name__ == "__main__":
    pass
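    # Minimal usage sketch (an assumption, not part of the original module):
    # expects a PDF path on the command line, and that the OCR/layout/XGBoost
    # models loaded in HuParser.__init__ are reachable (network access to
    # HuggingFace, or a warm local cache).
    import sys
    if len(sys.argv) > 1:
        parser = HuParser()
        txt, tbls = parser(sys.argv[1])
        print(parser.remove_tag(txt))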