# pdf_parser.py

# -*- coding: utf-8 -*-
import random
import fitz
import xgboost as xgb
from io import BytesIO
import torch
import re
import pdfplumber
import logging
from PIL import Image
import numpy as np
from api.db import ParserType
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie
from copy import deepcopy
from huggingface_hub import hf_hub_download

logging.getLogger("pdfminer").setLevel(logging.WARNING)


class HuParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        self.updown_cnt_mdl.load_model(hf_hub_download(
            repo_id="InfiniFlow/text_concat_xgb_v1.0",
            filename="updown_concat_xgb.model"))
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
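
    # --- small geometry helpers ---
    # Boxes are dicts with "x0"/"x1" (left/right), "top"/"bottom" and "text".
    # _y_dis is signed: positive when box `b` sits below box `a`.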
    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // len(c["text"])

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\((][零一二三四五六七八九十百]+[)\)]",
            r"[\((][0-9]+[)\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
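
    # Feature vector for the XGBoost up/down concatenation model: row/layout
    # agreement, punctuation at the join, numbering patterns, and the relative
    # geometry of the upper and lower boxes.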
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # note: the line above is repeated in the original feature set; it is
            # kept so the feature-vector length matches the trained model
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1
            else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) /
            min(self.__height(up), self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
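
    # Column-wise ordering: sort by (page, x0, top), then bubble boxes whose
    # x0 differ by less than `threashold` back into vertical order.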
    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order when two boxes are horizontally close
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    arr[j], arr[j + 1] = arr[j + 1], arr[j]
        return arr
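
    # Treat white DeviceGray characters as colorless (likely invisible or
    # watermark text) so callers can drop them from the char stream.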
    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True
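
    # Crop every detected table region (with a margin), run the table
    # structure recognizer on the crops, map components back to page
    # coordinates, and tag the boxes inside tables with R/H/C/SP indices.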
    def _table_transformer_job(self, ZM):
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tags to boxes within the table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
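
    # OCR one page image, then pour the PDF's own characters into the detected
    # text rectangles; characters that fit nowhere go to self.lefted_chars.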
    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                b["text"] = b["txt"]
            del b["txt"]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
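
    # Run the layout recognizer over all pages and lift each box into
    # document-level coordinates using the cumulative page heights.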
    def _layouts_rec(self, ZM):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM)
        # shift to cumulative Y coordinates across pages
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
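
    # Horizontal pass: merge boxes that share a layout region and sit on the
    # same visual line.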
    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
                i += 1
                continue
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
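
    # Vertical pass: concatenate a box with the next one below unless
    # punctuation, layout, or spacing argues against it.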
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
            ]
            if any(feats) and not any(concatting_feats):
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
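
    # Model-driven vertical merge: group boxes into blocks by walking downward
    # and asking the XGBoost model whether two boxes continue one another,
    # then flatten each block into a single box.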
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
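
    # Drop table-of-contents / acknowledgement pages: remove the heading, the
    # entries that share its prefix, and any page dominated by dotted leader
    # lines.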
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1
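
    # Merge consecutive boxes that start with the same (non-alphabetic,
    # non-Chinese) bullet character.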
    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuiopasdfghjklzxcvbnm") \
                    or huqie.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
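
    # Pull table and figure boxes out of self.boxes, merge tables that
    # continue across pages, attach the nearest captions, and return
    # (cropped image, content) pairs.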
    def _extract_table_figure(self, need_image, ZM, return_html):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) \
                    or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                        "figure caption", "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables split across pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop them out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs[:10]:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            if min(tv, fv) > 2000:
                i += 1
                continue
            if tv < fv:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" + self.boxes[i]["text"] + "; Cap: " + tk)
            else:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" + self.boxes[i]["text"] + "; Cap: " + fk)
            self.boxes.pop(i)

        res = []

        def cropout(bxs, ltype):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1}, {bxs[0].get('layoutno', '')}")
                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))

            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and attach captions
        for k, bxs in figures.items():
            txt = "\n".join(
                [b["text"] for b in bxs
                 if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
                 and len(b["text"].strip()) >= 4
                 ]
            )
            if not txt:
                continue
            res.append(
                (cropout(bxs, "figure"),
                 [txt] if not return_html else [f"<p>{txt}</p>"]))

        for k, bxs in tables.items():
            if not bxs:
                continue
            res.append((cropout(bxs, "table"),
                        self.tbl_det.construct_table(bxs, html=return_html,
                                                     is_english=self.is_english)))
        return res
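
    # Classify a line against common Chinese/numbered heading patterns;
    # returns the pattern's rank, False for pure numbers/punctuation, or
    # None when nothing matches.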
    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[::??]$", 9),
            (r"[0-9]+)", 10),
            (r"[\((][0-9]+[)\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
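
    # Append a position tag "@@<pages>\t<x0>\t<x1>\t<top>\t<bottom>##" to each
    # line; crop() parses these tags and remove_tag() strips them.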
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)
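
    # Assemble surviving boxes into text: greedily chain boxes that look like
    # the same paragraph and keep chains that are wide enough or start with a
    # heading-like line.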
    def __filterout_scraps(self, boxes, ZM):
        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            if b.get("layout_type"):
                return True
            if width(b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type", "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)  # used by the commented-out width check below
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type", "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not usefull(boxes[i]):
                        continue
                    if mmj or self._x_dis(boxes[i], line) < pw / 10:
                        # and abs(width(boxes[i]) - width_mean) / max(width(boxes[i]), width_mean) < 0.5
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)
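
    # Page count via pdfplumber, falling back to PyMuPDF (fitz) for files
    # pdfplumber cannot open.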
    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception:
            pdf = fitz.open(fnm) if not binary else fitz.open(stream=binary, filetype="pdf")
            return len(pdf)
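
    # Render pages to images (pdfplumber first, PyMuPDF as fallback), collect
    # colored characters, guess whether the document is English, and OCR each
    # page; mean char height/width and cumulative page heights feed the later
    # geometry heuristics.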
    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[c for c in page.chars if self._has_color(c)]
                               for page in self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception:
            self.pdf = fitz.open(fnm) if isinstance(
                fnm, str) else fitz.open(stream=fnm, filetype="pdf")
            self.page_images = []
            self.page_chars = []
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for i, page in enumerate(self.pdf):
                if i < page_from:
                    continue
                if i >= page_to:
                    break
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
                self.page_images.append(img)
                self.page_chars.append([])

        logging.info("Images converted.")
        self.is_english = [
            re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                      "".join(random.choices([c["text"] for c in self.page_chars[i]],
                                             k=min(100, len(self.page_chars[i])))))
            for i in range(len(self.page_chars))
        ]
        if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1
            # if i > 0:
            #     if not chars:
            #         self.page_cum_height.append(img.size[1] / zoomin)
            #     else:
            #         self.page_cum_height.append(
            #             np.max([c["bottom"] for c in chars]))
            self.__ocr(i + 1, img, chars, zoomin)

        if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(
                r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.info("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
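
    # Full pipeline: images/OCR -> layout -> table structure -> horizontal and
    # model-driven vertical merges -> ToC filtering -> table/figure extraction
    # -> final text assembly.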
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(need_image, zoomin, return_html)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
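
    # Rebuild the image region a tagged line came from, stitching together
    # slices when the tag spans multiple pages.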
    def crop(self, text, ZM=3):
        imgs = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            bottom *= ZM
            pns = [int(p) - 1 for p in pn.split("-")]
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop(
                    (left * ZM, top * ZM,
                     right * ZM,
                     min(bottom, self.page_images[pns[0]].size[1])))
            )
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop(
                        (left * ZM, 0,
                         right * ZM,
                         min(bottom, self.page_images[pn].size[1])))
                )
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            return
        GAP = 2
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        pic = Image.new("RGB",
                        (int(np.max([i.size[0] for i in imgs])), height),
                        (245, 245, 245))
        height = 0
        for img in imgs:
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP
        return pic


if __name__ == "__main__":
    pass
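    # A minimal usage sketch (hypothetical file name), kept as comments so the
    # module's behavior is unchanged:
    #   parser = HuParser()
    #   text, tbls = parser("sample.pdf", need_image=True, return_html=True)
    #   print(parser.remove_tag(text))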