Du kannst nicht mehr als 25 Themen auswählen Themen müssen mit entweder einem Buchstaben oder einer Ziffer beginnen. Sie können Bindestriche („-“) enthalten und bis zu 35 Zeichen lang sein.

pdf_parser.py 38KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977
  1. # -*- coding: utf-8 -*-
  2. import random
  3. import fitz
  4. import xgboost as xgb
  5. from io import BytesIO
  6. import torch
  7. import re
  8. import pdfplumber
  9. import logging
  10. from PIL import Image
  11. import numpy as np
  12. from api.db import ParserType
  13. from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
  14. from rag.nlp import huqie
  15. from copy import deepcopy
  16. from huggingface_hub import hf_hub_download
  17. logging.getLogger("pdfminer").setLevel(logging.WARNING)
class HuParser:
    # PDF parser combining OCR, layout recognition, table-structure
    # recognition and an XGBoost model that decides whether two vertically
    # adjacent text boxes should be concatenated.

    def __init__(self):
        """Load the OCR engine, layout/table recognizers and the up/down
        text-concatenation XGBoost model (downloaded from HuggingFace)."""
        self.ocr = OCR()
        # Subclasses may pre-set ``model_speciess`` to select a
        # domain-specific layout model; default is the general one.
        if not hasattr(self, "model_speciess"):
            self.model_speciess = ParserType.GENERAL.value
        self.layouter = LayoutRecognizer("layout."+self.model_speciess)
        self.tbl_det = TableStructureRecognizer()
        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                       filename="updown_concat_xgb.model"))
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
  38. def __char_width(self, c):
  39. return (c["x1"] - c["x0"]) // len(c["text"])
  40. def __height(self, c):
  41. return c["bottom"] - c["top"]
  42. def _x_dis(self, a, b):
  43. return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
  44. abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
  45. def _y_dis(
  46. self, a, b):
  47. return (
  48. b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
  49. def _match_proj(self, b):
  50. proj_patt = [
  51. r"第[零一二三四五六七八九十百]+章",
  52. r"第[零一二三四五六七八九十百]+[条节]",
  53. r"[零一二三四五六七八九十百]+[、是  ]",
  54. r"[\((][零一二三四五六七八九十百]+[)\)]",
  55. r"[\((][0-9]+[)\)]",
  56. r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
  57. r"[0-9]+\.[0-9.]+(、|\.[  ])",
  58. r"[⚫•➢①② ]",
  59. ]
  60. return any([re.match(p, b["text"]) for p in proj_patt])
    def _updown_concat_features(self, up, down):
        """Build the feature vector fed to ``updown_cnt_mdl`` (XGBoost) to
        decide whether box ``up`` should be concatenated with box ``down``.

        Mixes geometry (distances normalised by char width/height), layout
        labels, punctuation cues at the join point and ``huqie``
        tokenisation statistics. The feature ORDER must match the layout
        the model was trained with — do not reorder.
        """
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6  # number of characters inspected on each side of the join
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        # Tokenise the join region (tail of `up` + head of `down`), inserting
        # a space when both sides are alphanumeric (western word boundary).
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),   # same table row
            y_dis / h,                              # normalised vertical gap
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            # `up` ends like a complete sentence
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            # `up` ends mid-clause (comma, digit, opening bracket, ...)
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            # `down` begins with closing/continuation punctuation
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            # `up` is a fully parenthesised fragment
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # NOTE(review): identical to the previous feature — presumably a
            # training-time duplicate; kept as-is to match the trained model.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # open bracket in `up` that `down` appears to close
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),                 # `down` looks like a heading/bullet
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            # both boxes end with the same two characters
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
                                                                        ) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],                  # `down` entirely left of `up`
            # relative height difference
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            # token count of joined text vs the two halves (word split/merge signal)
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            # single noun token on either side (huqie POS tag contains "n")
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
  117. @staticmethod
  118. def sort_X_by_page(arr, threashold):
  119. # sort using y1 first and then x1
  120. arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
  121. for i in range(len(arr) - 1):
  122. for j in range(i, -1, -1):
  123. # restore the order using th
  124. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  125. and arr[j + 1]["top"] < arr[j]["top"]\
  126. and arr[j + 1]["page_number"] == arr[j]["page_number"]:
  127. tmp = arr[j]
  128. arr[j] = arr[j + 1]
  129. arr[j + 1] = tmp
  130. return arr
  131. def _has_color(self, o):
  132. if o.get("ncs", "") == "DeviceGray":
  133. if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
  134. o["non_stroking_color"][0] == 1:
  135. if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
  136. return False
  137. return True
    def _table_transformer_job(self, ZM):
        """Detect table sub-structure (rows, headers, columns, spanning
        cells) inside every table layout region and tag the text boxes that
        fall in tables with R/H/C/SP indices pointing at those components.

        Args:
            ZM: zoom factor between PDF coordinates and page-image pixels.
        """
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]      # per-page table counts; cumsum'ed below
        MARGIN = 10      # padding (PDF units) around each table crop
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    # shift from crop-local pixels back to page pixels ...
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    # ... then into document-global Y coordinates
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            # Collect components whose label matches `kwd`, de-duplicate
            # against existing boxes, return them in reading order.
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_overlapped_with_threashold(b, clmns, thr=0.3)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                # NOTE(review): spanning-cell geometry overwrites the H_*
                # fields set from headers above — looks intentional
                # (spanning cell wins); confirm against construct_table.
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr(self, pagenum, img, chars, ZM=3):
        """OCR one page image and fuse the result with the PDF's own chars.

        OCR boxes supply the geometry; pdfplumber chars that fall inside an
        OCR box supply the text (chars that fit nowhere are collected in
        ``self.lefted_chars``). Boxes whose text stays empty fall back to
        the raw OCR transcription. Appends the page's box list to
        ``self.boxes``.
        """
        bxs = self.ocr(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]  # (corner points, text)
        # keep only geometrically sane boxes, scale pixels back to PDF units
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            # char height wildly different from the box => not the same line
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                # keep a space only after latin/numeric/punctuation chars
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                # no embedded chars matched: use the OCR transcription
                b["text"] = b["txt"]
            del b["txt"]
        if self.mean_height[-1] == 0:
            # no char-derived line height for this page: estimate from OCR boxes
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
    def _layouts_rec(self, ZM):
        """Run the layout recognizer over all pages, then shift every box's
        vertical coordinates into the document-global coordinate system
        using the per-page cumulative heights.

        Args:
            ZM: zoom factor between PDF coordinates and page-image pixels.
        """
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM)
        # cumulative Y: page-local "top" -> document-global "top"
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
    def _text_merge(self):
        """Horizontally merge adjacent boxes that belong to the same layout
        element (same ``layoutno``) and sit on (roughly) the same baseline.
        Operates in place on ``self.boxes``.
        """
        bxs = self.boxes

        def end_with(b, txt):
            # True when b's stripped text ends with `txt`
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            # True when b's stripped text starts with any entry of `txts`
            # (note: a plain string argument is iterated per character)
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent box with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            # differing defaults ("0" vs "1") make missing layoutno never match
            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
                i += 1
                continue
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                # non-text (e.g. table cells): only merge across a trailing
                # comma / leading bracket, and tolerate slight overlap (-8)
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            # same baseline (within a fifth of the page's mean line height),
            # acceptable gap, and b_ extends further right than b
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
  304. def _naive_vertical_merge(self):
  305. bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
  306. i = 0
  307. while i + 1 < len(bxs):
  308. b = bxs[i]
  309. b_ = bxs[i + 1]
  310. if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
  311. bxs.pop(i)
  312. continue
  313. concatting_feats = [
  314. b["text"].strip()[-1] in ",;:'\",、‘“;:-",
  315. len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
  316. b["text"].strip()[0] in "。;?!?”)),,、:",
  317. ]
  318. # features for not concating
  319. feats = [
  320. b.get("layoutno", 0) != b.get("layoutno", 0),
  321. b["text"].strip()[-1] in "。?!?",
  322. self.is_english and b["text"].strip()[-1] in ".!?",
  323. b["page_number"] == b_["page_number"] and b_["top"] - \
  324. b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
  325. b["page_number"] < b_["page_number"] and abs(
  326. b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
  327. ]
  328. if any(feats) and not any(concatting_feats):
  329. i += 1
  330. continue
  331. # merge up and down
  332. b["bottom"] = b_["bottom"]
  333. b["text"] += b_["text"]
  334. b["x0"] = min(b["x0"], b_["x0"])
  335. b["x1"] = max(b["x1"], b_["x1"])
  336. bxs.pop(i + 1)
  337. self.boxes = bxs
    def _concat_downward(self, concat_between_pages=True):
        """Merge boxes top-to-bottom into logical blocks.

        First computes an ``in_row`` feature (how many nearby boxes share a
        row), then greedily chains boxes downward — via simple layout rules
        for nearby same-layout text, or the XGBoost up/down model otherwise —
        and finally flattens every chain into a single box. The result
        replaces ``self.boxes``, re-sorted into reading order.

        Args:
            concat_between_pages: allow a chain to cross page boundaries.
        """
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    # boxes are Y-sorted; once a full line below, stop scanning
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                # follow `up` downward through `boxes`, accumulating the
                # chain into `chunks`; pops each consumed box from `boxes`
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break

                    # different table row and no trailing comma: not a continuation
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue

                    # "NN/NNN" page-footer artifacts never merge
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue

                    if not down["text"].strip():
                        i += 1
                        continue

                    # horizontally far apart (over 10 char widths): skip
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue

                    # nearby text in the same layout merges without the model
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    # otherwise ask the XGBoost up/down concat model
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # insert a space at alnum-to-alnum joins (western words)
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
    def _filter_forpages(self):
        """Drop table-of-contents / acknowledgement content.

        Finds a box whose (whitespace-stripped) text is a ToC heading,
        removes the following entry lines sharing the same prefix, then — if
        no ToC heading was found at all — removes whole pages dominated by
        dot-leader lines ("····").
        """
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            # English ToC entries are identified by their first two words,
            # Chinese ones by their first three characters.
            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes): break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes): break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix: break
            # remove everything up to the next box carrying the same prefix
            # NOTE(review): `prefix` is used as a regex pattern; special
            # characters in a ToC line could mis-match or raise — consider
            # re.escape. Left unchanged here.
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j): self.boxes.pop(i)
                break
        if findit: return

        # no explicit ToC heading found: drop pages full of dot leaders
        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"]-1] += 1
        page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty: return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1
  472. def _merge_with_same_bullet(self):
  473. i = 0
  474. while i + 1 < len(self.boxes):
  475. b = self.boxes[i]
  476. b_ = self.boxes[i + 1]
  477. if not b["text"].strip():
  478. self.boxes.pop(i)
  479. continue
  480. if not b_["text"].strip():
  481. self.boxes.pop(i+1)
  482. continue
  483. if b["text"].strip()[0] != b_["text"].strip()[0] \
  484. or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
  485. or huqie.is_chinese(b["text"].strip()[0]) \
  486. or b["top"] > b_["bottom"]:
  487. i += 1
  488. continue
  489. b_["text"] = b["text"] + "\n" + b_["text"]
  490. b_["x0"] = min(b["x0"], b_["x0"])
  491. b_["x1"] = max(b["x1"], b_["x1"])
  492. b_["top"] = b["top"]
  493. self.boxes.pop(i)
    def _extract_table_figure(self, need_image, ZM, return_html):
        """Pull table and figure boxes out of ``self.boxes`` and return them
        as (cropped image, content) pairs.

        Groups boxes per layout id, merges tables that continue across
        adjacent pages, attaches the nearest caption box to each table or
        figure, crops the regions from the page images, and renders table
        content via the table-structure recognizer (HTML when
        ``return_html`` is set).

        Args:
            need_image: also extract figure regions (tables are always kept).
            ZM: zoom factor between PDF coordinates and page-image pixels.
            return_html: emit table/figure content as HTML fragments.
        """
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            # a caption/title/reference after a table closes it: never merge
            # the preceding layout across it
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                                                                      "figure caption", "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                # drop "data/figure source: ..." credit lines
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge table on different pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            # True when the two boxes share any horizontal span
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs[:10]:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            if min(tv, fv) > 2000:
                i += 1
                continue
            if tv < fv:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            else:
                figures[fk].insert(0, c)
                # NOTE(review): logs `tk` although the caption was attached
                # to figure `fk` — log-message-only inconsistency.
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            self.boxes.pop(i)

        res = []

        def cropout(bxs, ltype):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                # single page: prefer the matching layout region; fall back
                # to the boxes' bounding rectangle when none overlaps
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warn(
                        f"Missing layout match: {pn + 1},%s" %
                        (bxs[0].get(
                            "layoutno", "")))

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            # multi-page region: crop each page separately and stack the
            # crops vertically onto one canvas
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figure out and add caption
        for k, bxs in figures.items():
            txt = "\n".join(
                [b["text"] for b in bxs
                 if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
                 and len(b["text"].strip()) >= 4
                 ]
            )
            if not txt:
                continue
            res.append(
                (cropout(
                    bxs,
                    "figure"),
                 [txt] if not return_html else [f"<p>{txt}</p>"]))

        for k, bxs in tables.items():
            if not bxs:
                continue
            res.append((cropout(bxs, "table"),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))

        return res
  663. def proj_match(self, line):
  664. if len(line) <= 2:
  665. return
  666. if re.match(r"[0-9 ().,%%+/-]+$", line):
  667. return False
  668. for p, j in [
  669. (r"第[零一二三四五六七八九十百]+章", 1),
  670. (r"第[零一二三四五六七八九十百]+[条节]", 2),
  671. (r"[零一二三四五六七八九十百]+[、  ]", 3),
  672. (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
  673. (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
  674. (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
  675. (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
  676. (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
  677. (r".{,48}[::??]$", 9),
  678. (r"[0-9]+)", 10),
  679. (r"[\((][0-9]+[)\)]", 11),
  680. (r"[零一二三四五六七八九十百]+是", 12),
  681. (r"[⚫•➢✓]", 12)
  682. ]:
  683. if re.match(p, line):
  684. return j
  685. return
  686. def _line_tag(self, bx, ZM):
  687. pn = [bx["page_number"]]
  688. top = bx["top"] - self.page_cum_height[pn[0] - 1]
  689. bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
  690. while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
  691. bott -= self.page_images[pn[-1] - 1].size[1] / ZM
  692. pn.append(pn[-1] + 1)
  693. return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
  694. .format("-".join([str(p) for p in pn]),
  695. bx["x0"], bx["x1"], top, bott)
    def __filterout_scraps(self, boxes, ZM):
        """Assemble the final text: chain boxes into paragraph groups, keep
        the wide/meaningful groups, tag each kept line with its position via
        ``_line_tag`` and discard narrow scraps.

        Args:
            boxes: (deep-copied) box list; consumed destructively.
            ZM: zoom factor between PDF coordinates and page-image pixels.
        Returns:
            Document text, groups separated by blank lines.
        """
        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            # a box is worth keeping when it carries a layout label, spans
            # over a third of the page width, or is taller than the page's
            # mean line height
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                # follow `line` through same-page boxes, consuming chained ones
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break

                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10): \
                            # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break
            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception as e:
                # NOTE(review): silently swallows recursion/lookup errors;
                # the head box is dropped below either way.
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            # keep headings and groups that are wide enough
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)
  759. @staticmethod
  760. def total_page_number(fnm, binary=None):
  761. try:
  762. pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
  763. return len(pdf.pages)
  764. except Exception as e:
  765. pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
  766. return len(pdf)
  767. def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
  768. self.lefted_chars = []
  769. self.mean_height = []
  770. self.mean_width = []
  771. self.boxes = []
  772. self.garbages = {}
  773. self.page_cum_height = [0]
  774. self.page_layout = []
  775. try:
  776. self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
  777. self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
  778. enumerate(self.pdf.pages[page_from:page_to])]
  779. self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
  780. self.total_page = len(self.pdf.pages)
  781. except Exception as e:
  782. self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
  783. self.page_images = []
  784. self.page_chars = []
  785. mat = fitz.Matrix(zoomin, zoomin)
  786. self.total_page = len(self.pdf)
  787. for i, page in enumerate(self.pdf):
  788. if i < page_from:continue
  789. if i >= page_to:break
  790. pix = page.get_pixmap(matrix=mat)
  791. img = Image.frombytes("RGB", [pix.width, pix.height],
  792. pix.samples)
  793. self.page_images.append(img)
  794. self.page_chars.append([])
  795. logging.info("Images converted.")
  796. self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
  797. if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
  798. self.is_english = True
  799. else:
  800. self.is_english = False
  801. for i, img in enumerate(self.page_images):
  802. chars = self.page_chars[i] if not self.is_english else []
  803. self.mean_height.append(
  804. np.median(sorted([c["height"] for c in chars])) if chars else 0
  805. )
  806. self.mean_width.append(
  807. np.median(sorted([c["width"] for c in chars])) if chars else 8
  808. )
  809. self.page_cum_height.append(img.size[1] / zoomin)
  810. j = 0
  811. while j + 1 < len(chars):
  812. if chars[j]["text"] and chars[j + 1]["text"] \
  813. and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
  814. and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
  815. chars[j]["width"]) / 2:
  816. chars[j]["text"] += " "
  817. j += 1
  818. # if i > 0:
  819. # if not chars:
  820. # self.page_cum_height.append(img.size[1] / zoomin)
  821. # else:
  822. # self.page_cum_height.append(
  823. # np.max([c["bottom"] for c in chars]))
  824. self.__ocr(i + 1, img, chars, zoomin)
  825. if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
  826. bxes = [b for bxs in self.boxes for b in bxs]
  827. self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
  828. logging.info("Is it English:", self.is_english)
  829. self.page_cum_height = np.cumsum(self.page_cum_height)
  830. assert len(self.page_cum_height) == len(self.page_images) + 1
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        """Run the full parsing pipeline on one PDF.

        Steps: rasterise + OCR pages, detect layouts, detect table
        structure, merge text boxes horizontally then downward, drop ToC
        pages, extract tables/figures, and emit position-tagged text.

        Args:
            fnm: file path or raw PDF bytes.
            need_image: also extract figure images.
            zoomin: render zoom factor.
            return_html: render table content as HTML.
        Returns:
            (text, tbls): concatenated tagged text and the list of
            (cropped image, content) pairs from ``_extract_table_figure``.
        """
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(need_image, zoomin, return_html)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
  840. def remove_tag(self, txt):
  841. return re.sub(r"@@[\t0-9.-]+?##", "", txt)
    def crop(self, text, ZM=3):
        """Crop the page-image region(s) referenced by the ``@@...##`` tags
        embedded in ``text`` and stack them vertically into one image.

        Args:
            text: tagged text produced by this parser.
            ZM: zoom factor between PDF coordinates and page-image pixels.
        Returns:
            A stacked PIL image, or None when ``text`` carries no tags.
        """
        imgs = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            bottom *= ZM
            pns = [int(p) - 1 for p in pn.split("-")]
            # `bottom` was stored local to the first page; extend it by the
            # pixel heights of the intermediate pages the region spans
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            # crop from the first page (clamped to the page bottom) ...
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right *
                                               ZM, min(
                                                   bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            bottom -= self.page_images[pns[0]].size[1]
            # ... then the remainder from each following page, top-aligned
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                bottom -= self.page_images[pn].size[1]
        if not imgs:
            return
        GAP = 2  # pixels between stacked crops
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        pic = Image.new("RGB",
                        (int(np.max([i.size[0] for i in imgs])), height),
                        (245, 245, 245))
        height = 0
        for img in imgs:
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP
        return pic
if __name__ == "__main__":
    # No standalone CLI behaviour; this module is meant to be imported.
    pass