Nelze vybrat více než 25 témat Téma musí začínat písmenem nebo číslem, může obsahovat pomlčky („-“) a může být dlouhé až 35 znaků.

pdf_parser.py 63KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657
  1. # -*- coding: utf-8 -*-
  2. import fitz
  3. import xgboost as xgb
  4. from io import BytesIO
  5. import torch
  6. import re
  7. import pdfplumber
  8. import logging
  9. from PIL import Image
  10. import numpy as np
  11. from rag.nlp import huqie
  12. from collections import Counter
  13. from copy import deepcopy
  14. from rag.cv.table_recognize import TableTransformer
  15. from rag.cv.ppdetection import PPDet
  16. from huggingface_hub import hf_hub_download
  17. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  18. class HuParser:
    def __init__(self):
        """Build the full PDF-parsing pipeline: OCR engine, layout detector,
        table-structure detector, and the up/down text-concatenation classifier.

        Downloads the xgboost concat model from HuggingFace Hub on first use,
        so construction needs network access (or a warm HF cache).
        """
        # Deferred import: paddleocr is heavy, only pay for it when a parser
        # is actually constructed.
        from paddleocr import PaddleOCR
        logging.getLogger("ppocr").setLevel(logging.ERROR)
        # OCR engine for Chinese documents; text-angle classification disabled here
        # (the per-call `cls=True` flag in __ocr_paddle still requests it at run time).
        self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")
        self.layouter = PPDet()            # page-layout detection model
        self.tbl_det = TableTransformer()  # table-structure detection model
        # Binary classifier deciding whether two vertically adjacent boxes
        # belong to the same paragraph (features built by _updown_concat_features).
        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                       filename="updown_concat_xgb.model"))
        # NOTE: the string below is a stray expression statement, not a docstring;
        # kept verbatim as a download-troubleshooting note.
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
  38. def __char_width(self, c):
  39. return (c["x1"] - c["x0"]) // len(c["text"])
  40. def __height(self, c):
  41. return c["bottom"] - c["top"]
  42. def _x_dis(self, a, b):
  43. return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
  44. abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
  45. def _y_dis(
  46. self, a, b):
  47. return (
  48. b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
  49. def _match_proj(self, b):
  50. proj_patt = [
  51. r"第[零一二三四五六七八九十百]+章",
  52. r"第[零一二三四五六七八九十百]+[条节]",
  53. r"[零一二三四五六七八九十百]+[、是  ]",
  54. r"[\((][零一二三四五六七八九十百]+[)\)]",
  55. r"[\((][0-9]+[)\)]",
  56. r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
  57. r"[0-9]+\.[0-9.]+(、|\.[  ])",
  58. r"[⚫•➢①② ]",
  59. ]
  60. return any([re.match(p, b["text"]) for p in proj_patt])
    def _updown_concat_features(self, up, down):
        """Feature vector fed to ``updown_cnt_mdl`` to decide whether box *up*
        should be concatenated with box *down* directly below it.

        WARNING: the order and count of entries form the model's input schema —
        do not reorder, insert, or remove features without retraining.
        """
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6  # number of leading/trailing characters sampled for tokenization
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        # Tokenize the junction of the two texts; a space is inserted when both
        # sides are alphanumeric so latin words are not glued together.
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            # same table-row assignment (see _table_transformer_job)
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            # upper text ends like a finished sentence
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            # upper text ends mid-clause (comma, colon, open bracket, digit...)
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            # lower text starts with closing punctuation
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            # upper text is fully parenthesized
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # NOTE(review): duplicate of the previous feature — kept verbatim
            # because the model was trained with this exact input width.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # bracket opened above and closed below
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            # identical 2-character suffixes suggest repeated boilerplate lines
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
                                                                        ) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            # how many tokens the junction saves/creates when texts are joined
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            # single noun-like token on either side (POS tag contains "n")
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
  117. @staticmethod
  118. def sort_Y_firstly(arr, threashold):
  119. # sort using y1 first and then x1
  120. arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
  121. for i in range(len(arr) - 1):
  122. for j in range(i, -1, -1):
  123. # restore the order using th
  124. if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
  125. and arr[j + 1]["x0"] < arr[j]["x0"]:
  126. tmp = deepcopy(arr[j])
  127. arr[j] = deepcopy(arr[j + 1])
  128. arr[j + 1] = deepcopy(tmp)
  129. return arr
  130. @staticmethod
  131. def sort_R_firstly(arr, thr=0):
  132. # sort using y1 first and then x1
  133. # sorted(arr, key=lambda r: (r["top"], r["x0"]))
  134. arr = HuParser.sort_Y_firstly(arr, thr)
  135. for i in range(len(arr) - 1):
  136. for j in range(i, -1, -1):
  137. if "R" not in arr[j] or "R" not in arr[j + 1]:
  138. continue
  139. if arr[j + 1]["R"] < arr[j]["R"] \
  140. or (
  141. arr[j + 1]["R"] == arr[j]["R"]
  142. and arr[j + 1]["x0"] < arr[j]["x0"]
  143. ):
  144. tmp = arr[j]
  145. arr[j] = arr[j + 1]
  146. arr[j + 1] = tmp
  147. return arr
  148. @staticmethod
  149. def sort_X_firstly(arr, threashold, copy=True):
  150. # sort using y1 first and then x1
  151. arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
  152. for i in range(len(arr) - 1):
  153. for j in range(i, -1, -1):
  154. # restore the order using th
  155. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  156. and arr[j + 1]["top"] < arr[j]["top"]:
  157. tmp = deepcopy(arr[j]) if copy else arr[j]
  158. arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
  159. arr[j + 1] = deepcopy(tmp) if copy else tmp
  160. return arr
  161. @staticmethod
  162. def sort_C_firstly(arr, thr=0):
  163. # sort using y1 first and then x1
  164. # sorted(arr, key=lambda r: (r["x0"], r["top"]))
  165. arr = HuParser.sort_X_firstly(arr, thr)
  166. for i in range(len(arr) - 1):
  167. for j in range(i, -1, -1):
  168. # restore the order using th
  169. if "C" not in arr[j] or "C" not in arr[j + 1]:
  170. continue
  171. if arr[j + 1]["C"] < arr[j]["C"] \
  172. or (
  173. arr[j + 1]["C"] == arr[j]["C"]
  174. and arr[j + 1]["top"] < arr[j]["top"]
  175. ):
  176. tmp = arr[j]
  177. arr[j] = arr[j + 1]
  178. arr[j + 1] = tmp
  179. return arr
  180. return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
  181. def _has_color(self, o):
  182. if o.get("ncs", "") == "DeviceGray":
  183. if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
  184. o["non_stroking_color"][0] == 1:
  185. if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
  186. return False
  187. return True
  188. def __overlapped_area(self, a, b, ratio=True):
  189. tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
  190. if b["x0"] > x1 or b["x1"] < x0:
  191. return 0
  192. if b["bottom"] < tp or b["top"] > btm:
  193. return 0
  194. x0_ = max(b["x0"], x0)
  195. x1_ = min(b["x1"], x1)
  196. assert x0_ <= x1_, "Fuckedup! T:{},B:{},X0:{},X1:{} ==> {}".format(
  197. tp, btm, x0, x1, b)
  198. tp_ = max(b["top"], tp)
  199. btm_ = min(b["bottom"], btm)
  200. assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
  201. tp, btm, x0, x1, b)
  202. ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
  203. x0 != 0 and btm - tp != 0 else 0
  204. if ov > 0 and ratio:
  205. ov /= (x1 - x0) * (btm - tp)
  206. return ov
  207. def __find_overlapped_with_threashold(self, box, boxes, thr=0.3):
  208. if not boxes:
  209. return
  210. max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
  211. s, e = 0, len(boxes)
  212. for i in range(s, e):
  213. ov = self.__overlapped_area(box, boxes[i])
  214. _ov = self.__overlapped_area(boxes[i], box)
  215. if (ov, _ov) < (max_overlaped, _max_overlaped):
  216. continue
  217. max_overlaped_i = i
  218. max_overlaped = ov
  219. _max_overlaped = _ov
  220. return max_overlaped_i
    def __find_overlapped(self, box, boxes_sorted_by_y, naive=False):
        """Index of the entry in *boxes_sorted_by_y* with the largest overlap
        with *box* (overlap measured relative to the candidate), or None.

        A binary search narrows the candidate window [s, e) to boxes whose
        vertical span can intersect *box*; naive=True skips the narrowing and
        scans the whole list.
        """
        if not boxes_sorted_by_y:
            return
        bxs = boxes_sorted_by_y
        s, e, ii = 0, len(bxs), 0
        # Binary search for any box vertically intersecting *box*.
        while s < e and not naive:
            ii = (e + s) // 2
            pv = bxs[ii]
            if box["bottom"] < pv["top"]:
                e = ii
                continue
            if box["top"] > pv["bottom"]:
                s = ii + 1
                continue
            break
        # NOTE(review): each loop below executes at most one iteration because
        # of the unconditional break — it trims the window by a single box at
        # each end. Indentation reconstructed; confirm the break placement is
        # intended at loop level rather than inside the if.
        while s < ii:
            if box["top"] > bxs[s]["bottom"]:
                s += 1
            break
        while e - 1 > ii:
            if box["bottom"] < bxs[e - 1]["top"]:
                e -= 1
            break
        # Linear scan of the (narrowed) window for the best overlap.
        max_overlaped_i, max_overlaped = None, 0
        for i in range(s, e):
            ov = self.__overlapped_area(bxs[i], box)
            if ov <= max_overlaped:
                continue
            max_overlaped_i = i
            max_overlaped = ov
        return max_overlaped_i
  252. def _is_garbage(self, b):
  253. patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
  254. r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
  255. "(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
  256. "\\(cid *: *[0-9]+ *\\)"
  257. ]
  258. return any([re.search(p, b["text"]) for p in patt])
    def __layouts_cleanup(self, boxes, layouts, far=2, thr=0.7):
        """Remove duplicated layout regions that heavily overlap a near
        neighbour of the same type, keeping the better one.

        *far* bounds how far ahead a duplicate is searched for; *thr* is the
        mutual-overlap ratio above which two regions are considered duplicates.
        Preference: higher detection score; lacking scores, the region covering
        more text-box area (computed from *boxes*).  Mutates and returns
        *layouts*.
        """
        def notOverlapped(a, b):
            # True when the two rectangles are disjoint on either axis.
            return any([a["x1"] < b["x0"],
                        a["x0"] > b["x1"],
                        a["bottom"] < b["top"],
                        a["top"] > b["bottom"]])
        i = 0
        while i + 1 < len(layouts):
            j = i + 1
            # Advance j to the first nearby layout of the same type that overlaps i.
            while j < min(i + far, len(layouts)) \
                    and (layouts[i].get("type", "") != layouts[j].get("type", "")
                         or notOverlapped(layouts[i], layouts[j])):
                j += 1
            if j >= min(i + far, len(layouts)):
                i += 1
                continue
            # Not a near-duplicate unless the overlap is large in BOTH directions.
            if self.__overlapped_area(layouts[i], layouts[j]) < thr \
                    and self.__overlapped_area(layouts[j], layouts[i]) < thr:
                i += 1
                continue
            # Prefer the detection with the higher confidence score.
            if layouts[i].get("score") and layouts[j].get("score"):
                if layouts[i]["score"] > layouts[j]["score"]:
                    layouts.pop(j)
                else:
                    layouts.pop(i)
                continue
            # No scores: keep whichever region covers more text-box area.
            area_i, area_i_1 = 0, 0
            for b in boxes:
                if not notOverlapped(b, layouts[i]):
                    area_i += self.__overlapped_area(b, layouts[i], False)
                if not notOverlapped(b, layouts[j]):
                    area_i_1 += self.__overlapped_area(b, layouts[j], False)
            if area_i > area_i_1:
                layouts.pop(j)
            else:
                layouts.pop(i)
        return layouts
    def __table_paddle(self, images):
        """Run table-structure detection on cropped table *images* and align
        the detected components into a clean grid.

        Returns one list of component dicts (label/score/x0/x1/top/bottom) per
        input image.  Rows and headers are stretched to a common left/right
        edge; columns to a common top/bottom edge, using the median when there
        are enough samples (>4) and the extreme otherwise.
        """
        tbls = self.tbl_det([img for img in images], threshold=0.5)
        res = []
        # align left&right for rows, align top&bottom for columns
        for tbl in tbls:
            lts = [{"label": b["type"],
                    "score": b["score"],
                    "x0": b["bbox"][0], "x1": b["bbox"][2],
                    "top": b["bbox"][1], "bottom": b["bbox"][-1]
                    } for b in tbl]
            if not lts:
                continue
            # Common horizontal extent for row/header components.
            left = [b["x0"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            right = [b["x1"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            if not left:
                continue
            left = np.median(left) if len(left) > 4 else np.min(left)
            right = np.median(right) if len(right) > 4 else np.max(right)
            for b in lts:
                if b["label"].find("row") > 0 or b["label"].find("header") > 0:
                    if b["x0"] > left:
                        b["x0"] = left
                    if b["x1"] < right:
                        b["x1"] = right
            # Common vertical extent for column components.
            top = [b["top"] for b in lts if b["label"] == "table column"]
            bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
            if not top:
                res.append(lts)
                continue
            top = np.median(top) if len(top) > 4 else np.min(top)
            bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
            for b in lts:
                if b["label"] == "table column":
                    if b["top"] > top:
                        b["top"] = top
                    if b["bottom"] < bottom:
                        b["bottom"] = bottom
            res.append(lts)
        return res
    def _table_transformer_job(self, ZM):
        """Detect table structure on every table layout region and tag the text
        boxes inside tables with row (R), header (H), column (C) and spanning
        (SP) indices.

        *ZM* is the page-image zoom factor: layout coordinates are multiplied
        by it to crop from the rendered images, and detection results are
        divided by it to come back to layout space.  Populates ``self.tb_cpns``
        and mutates ``self.boxes`` in place.
        """
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]      # number of tables per page (prefix-summed below)
        MARGIN = 10      # padding (in layout units) around each table crop
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))
        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.__table_paddle(imgs)
        # tbcnt[i]..tbcnt[i+1] now indexes the tables belonging to page i.
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    # Shift back from crop-local to page coordinates, undo zoom,
                    # then move into the document-cumulative Y coordinate system.
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            # Collect components whose label matches *kwd*, de-duplicate them,
            # and return them in strict reading order.
            eles = self.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
            return self.sort_Y_firstly(eles, 0)
        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]
            ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii
            ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]
            ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                # Spanning cells reuse the H_* fields for their extent.
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr_paddle(self, pagenum, img, chars, ZM=3):
        """OCR one rendered page image and merge the PDF's native characters
        into the recognized boxes; appends the page's box list to self.boxes.

        *pagenum* is the 1-based page number; *img* the rendered page (zoomed
        by *ZM*); *chars* the pdfplumber character dicts for that page.
        """
        bxs = self.ocr.ocr(np.array(img), cls=True)[0]
        if not bxs:
            self.boxes.append([])
            return
        # Keep only (quad, text); drop recognition confidences.
        bxs = [(line[0], line[1][0]) for line in bxs]
        # Convert quads back to layout coordinates (undo zoom) and sort in
        # reading order; degenerate quads are filtered out.
        bxs = self.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )
        # merge chars in the same rect
        for c in self.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = self.__find_overlapped(c, bxs)
            if ii is None:
                # No OCR box claims this native char; keep it for later.
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            # Height mismatch => the char probably belongs to another line.
            if abs(ch - bh) / max(ch, bh) >= 0.7:
                self.lefted_chars.append(c)
                continue
            bxs[ii]["text"] += c["text"]
        # Prefer native PDF text; fall back to the OCR transcription.
        for b in bxs:
            if not b["text"]:
                b["text"] = b["txt"]
            del b["txt"]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
    def _layouts_paddle(self, ZM):
        """Detect per-page layout regions, tag every text box with its layout
        type, drop garbage (headers/footers/references/noise), and flatten
        self.boxes from per-page lists into one document-ordered list with
        cumulative Y coordinates.  *ZM* is the page-image zoom factor.
        """
        assert len(self.page_images) == len(self.boxes)
        # Tag layout type
        boxes = []
        layouts = self.layouter([np.array(img) for img in self.page_images])
        assert len(self.page_images) == len(layouts)
        for pn, lts in enumerate(layouts):
            bxs = self.boxes[pn]
            # Detection results -> layout dicts in layout coordinates.
            lts = [{"type": b["type"],
                    "score": float(b["score"]),
                    "x0": b["bbox"][0] / ZM, "x1": b["bbox"][2] / ZM,
                    "top": b["bbox"][1] / ZM, "bottom": b["bbox"][-1] / ZM,
                    "page_number": pn,
                    } for b in lts]
            lts = self.sort_Y_firstly(lts, self.mean_height[pn] / 2)
            lts = self.__layouts_cleanup(bxs, lts)
            self.page_layout.append(lts)

            # Tag layout type, layouts are ready
            def findLayout(ty):
                """Assign layout type *ty* to the untagged boxes it covers;
                boxes falling into footer/header/reference layouts are recorded
                as garbage candidates and removed."""
                nonlocal bxs, lts
                lts_ = [lt for lt in lts if lt["type"] == ty]
                i = 0
                while i < len(bxs):
                    if bxs[i].get("layout_type"):
                        i += 1
                        continue
                    if self._is_garbage(bxs[i]):
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    ii = self.__find_overlapped_with_threashold(bxs[i], lts_,
                                                                thr=0.4)
                    if ii is None:  # belong to nothing
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue
                    lts_[ii]["visited"] = True
                    if lts_[ii]["type"] in ["footer", "header", "reference"]:
                        if lts_[ii]["type"] not in self.garbages:
                            self.garbages[lts_[ii]["type"]] = []
                        self.garbages[lts_[ii]["type"]].append(bxs[i]["text"])
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    bxs[i]["layoutno"] = f"{ty}-{ii}"
                    bxs[i]["layout_type"] = lts_[ii]["type"]
                    i += 1
            # Tag in priority order: page furniture first, content last.
            for lt in ["footer", "header", "reference", "figure caption",
                       "table caption", "title", "text", "table", "figure"]:
                findLayout(lt)
            # add box to figure layouts which has not text box
            for i, lt in enumerate(
                    [lt for lt in lts if lt["type"] == "figure"]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                del lt["type"]
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)
            boxes.extend(bxs)
        self.boxes = boxes
        # A garbage string that appears on more than one page is real
        # boilerplate; drop every box carrying it.
        garbage = set()
        for k in self.garbages.keys():
            self.garbages[k] = Counter(self.garbages[k])
            for g, c in self.garbages[k].items():
                if c > 1:
                    garbage.add(g)
        logging.debug("GARBAGE:" + ",".join(garbage))
        self.boxes = [b for b in self.boxes if b["text"].strip() not in garbage]
        # cumlative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
    def _text_merge(self):
        """Horizontally merge adjacent boxes that belong to the same layout
        region and sit on the same visual line.  Mutates self.boxes in place.
        """
        # merge adjusted boxes
        bxs = self.boxes

        def end_with(b, txt):
            # True when the stripped text of *b* ends with *txt*.
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            # True when the stripped text of *b* starts with any entry of *txts*
            # (a plain string argument is iterated character by character).
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent box with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            # Distinct defaults so two boxes both MISSING "layoutno" never merge.
            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
                i += 1
                continue
            dis_thr = 1
            # Positive = horizontal overlap, negative = gap between the boxes.
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                # Non-text boxes merge only across an obvious punctuation
                # continuation, and then tolerate a small gap.
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue
            # Same line (y-midlines within 1/5 of the page's mean line height),
            # close/overlapping horizontally, and b_ extends further right.
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
    def _concat_downward(self):
        """Vertically concatenate boxes that continue the same paragraph.

        Pass 1 computes an "in_row" feature (number of neighbours on the same
        visual line).  Pass 2 chains boxes downward — by layout identity for
        nearby text boxes, otherwise by the xgboost up/down classifier — into
        blocks.  Pass 3 flattens each block into a single box.  Rewrites
        self.boxes in reading order.
        """
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            # Only a +/-12 neighbourhood is examined.
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    # Boxes are Y-sorted: once a neighbour is clearly below,
                    # stop scanning.
                    break
                j += 1
        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                # Greedy chain: append *up* to the current chunk, then look for
                # its continuation among the next few boxes starting at *dp*.
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    # Too far below (4 line heights same page, 16 across pages).
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    # Different table rows never merge unless a trailing comma
                    # marks an explicit continuation.
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    # Skip page-number-like fragments ("12/345").
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    # No horizontal proximity => different column.
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    # Near text boxes in the SAME layout region chain directly,
                    # without consulting the model.
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    # Otherwise ask the classifier whether to concatenate.
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return
            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)
        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # Insert a space at alphanumeric junctions so latin words do
                # not fuse together.
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)
        self.boxes = self.sort_Y_firstly(boxes, 0)
  662. def __filter_forpages(self):
  663. if not self.boxes:
  664. return
  665. to = min(7, len(self.page_images) // 5)
  666. pg_hits = [0 for _ in range(to)]
  667. def possible(c):
  668. if c.get("layout_type", "") == "reference":
  669. return True
  670. if c["bottom"] - c["top"] >= 2 * \
  671. self.mean_height[c["page_number"] - 1]:
  672. return False
  673. if c["text"].find("....") >= 0 \
  674. or (c["x1"] - c["x0"] > 250 and re.search(r"[0-9]+$",
  675. c["text"].strip())):
  676. return True
  677. return self.is_caption(c) and re.search(
  678. r"[0-9]+$", c["text"].strip())
  679. for c in self.boxes:
  680. if c["page_number"] >= to:
  681. break
  682. if possible(c):
  683. pg_hits[c["page_number"] - 1] += 1
  684. st, ed = -1, -1
  685. for i in range(len(self.boxes)):
  686. c = self.boxes[i]
  687. if c["page_number"] >= to:
  688. break
  689. if pg_hits[c["page_number"] - 1] >= 3 and possible(c):
  690. if st < 0:
  691. st = i
  692. else:
  693. ed = i
  694. for _ in range(st, ed + 1):
  695. self.boxes.pop(st)
  696. def _blockType(self, b):
  697. patt = [
  698. ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  699. (r"^(20|19)[0-9]{2}年$", "Dt"),
  700. (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
  701. ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
  702. (r"^第*[一二三四1-4]季度$", "Dt"),
  703. (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
  704. (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
  705. ("^[0-9.,+%/ -]+$", "Nu"),
  706. (r"^[0-9A-Z/\._~-]+$", "Ca"),
  707. (r"^[A-Z]*[a-z' -]+$", "En"),
  708. (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
  709. (r"^.{1}$", "Sg")
  710. ]
  711. for p, n in patt:
  712. if re.search(p, b["text"].strip()):
  713. return n
  714. tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
  715. if len(tks) > 3:
  716. if len(tks) < 12:
  717. return "Tx"
  718. else:
  719. return "Lx"
  720. if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
  721. return "Nr"
  722. return "Ot"
    def __cal_spans(self, boxes, rows, cols, tbl, html=True):
        """Resolve row/column spans for cells flagged as spanning ("SP").

        Args:
            boxes: all cell boxes of the table (dicts with geometry keys).
            rows, cols: boxes grouped per detected table row / column.
            tbl: 2-D grid (row x col) of lists of boxes.
            html: when True, cells absorbed into a span are set to None so
                the HTML renderer can skip them; when False they alias the
                merged cell's list.

        Returns:
            tbl, mutated in place.
        """
        # Representative edges per column/row: mean of the detector-supplied
        # edges ("C_*"/"R_*"), falling back to the raw box geometry.
        clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
                for cln in cols]
        crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
                for cln in cols]
        rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
                for row in rows]
        rbtm = [np.mean([c.get("R_btm", c["bottom"])
                for c in row]) for row in rows]
        for b in boxes:
            if "SP" not in b:
                continue
            b["colspan"] = [b["cn"]]
            b["rowspan"] = [b["rn"]]
            # Column span: a column belongs to the span when its midpoint
            # lies inside the spanning header box [H_left, H_right].
            for j in range(0, len(clft)):
                if j == b["cn"]:
                    continue
                if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
                    continue
                if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
                    continue
                b["colspan"].append(j)
            # Row span: same midpoint test against [H_top, H_bott].
            for j in range(0, len(rtop)):
                if j == b["rn"]:
                    continue
                if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
                    continue
                if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
                    continue
                b["rowspan"].append(j)

        def join(arr):
            # Concatenated text of a cell, "" for an empty cell.
            if not arr:
                return ""
            return "".join([t["text"] for t in arr])

        # Merge the spanned cells into the top-left cell of each span.
        for i in range(len(tbl)):
            for j, arr in enumerate(tbl[i]):
                if not arr:
                    continue
                if all(["rowspan" not in a and "colspan" not in a for a in arr]):
                    continue
                # Union of all span index lists carried by boxes in the cell.
                rowspan, colspan = [], []
                for a in arr:
                    if isinstance(a.get("rowspan", 0), list):
                        rowspan.extend(a["rowspan"])
                    if isinstance(a.get("colspan", 0), list):
                        colspan.extend(a["colspan"])
                rowspan, colspan = set(rowspan), set(colspan)
                if len(rowspan) < 2 and len(colspan) < 2:
                    # Degenerate span: drop the markers and leave the cell.
                    for a in arr:
                        if "rowspan" in a:
                            del a["rowspan"]
                        if "colspan" in a:
                            del a["colspan"]
                    continue
                # Make the span a contiguous index range covering the union.
                rowspan, colspan = sorted(rowspan), sorted(colspan)
                rowspan = list(range(rowspan[0], rowspan[-1] + 1))
                colspan = list(range(colspan[0], colspan[-1] + 1))
                assert i in rowspan, rowspan
                assert j in colspan, colspan
                # Collect the distinct contents of every covered cell,
                # skipping cells whose text duplicates what we already have.
                arr = []
                for r in rowspan:
                    for c in colspan:
                        arr_txt = join(arr)
                        if tbl[r][c] and join(tbl[r][c]) != arr_txt:
                            arr.extend(tbl[r][c])
                        tbl[r][c] = None if html else arr
                # Rewrite span markers as plain counts for rendering.
                for a in arr:
                    if len(rowspan) > 1:
                        a["rowspan"] = len(rowspan)
                    elif "rowspan" in a:
                        del a["rowspan"]
                    if len(colspan) > 1:
                        a["colspan"] = len(colspan)
                    elif "colspan" in a:
                        del a["colspan"]
                # The top-left cell of the span holds the merged content.
                tbl[rowspan[0]][colspan[0]] = arr
        return tbl
    def __construct_table(self, boxes, html=False):
        """Turn a bag of table cell boxes into HTML or descriptive text.

        Steps: strip caption boxes, classify cell content, cluster boxes
        into rows and columns, relocate stray single-cell rows/columns,
        detect header rows, then render via __html_table / __desc_table.

        Args:
            boxes: cell boxes of one table (mutated: popped, re-sorted,
                annotated with "rn"/"cn"/"btype").
            html: render as HTML string (True) or as text rows (False).
        """
        # Pull caption boxes out and accumulate their text.
        cap = ""
        i = 0
        while i < len(boxes):
            if self.is_caption(boxes[i]):
                cap += boxes[i]["text"]
                boxes.pop(i)
                i -= 1
            i += 1

        if not boxes:
            return []
        for b in boxes:
            b["btype"] = self._blockType(b)
        # Dominant content type across the table (e.g. mostly numbers).
        max_type = Counter([b["btype"] for b in boxes]).items()
        max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
        logging.debug("MAXTYPE: " + max_type)

        # --- row clustering ---
        rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
        rowh = np.min(rowh) if rowh else 0
        boxes = self.sort_R_firstly(boxes, rowh / 2)
        boxes[0]["rn"] = 0
        rows = [[boxes[0]]]
        btm = boxes[0]["bottom"]
        for b in boxes[1:]:
            b["rn"] = len(rows) - 1
            lst_r = rows[-1]
            # New row when the detected row id changes, or the box starts
            # below the running bottom and neither box carries a row id.
            if lst_r[-1].get("R", "") != b.get("R", "") \
                    or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
                        ):  # new row
                btm = b["bottom"]
                b["rn"] += 1
                rows.append([b])
                continue
            btm = (btm + b["bottom"]) / 2.
            rows[-1].append(b)

        # --- column clustering ---
        colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
        colwm = np.min(colwm) if colwm else 0
        crosspage = len(set([b["page_number"] for b in boxes])) > 1
        # Cross-page tables lack consistent column detections; fall back
        # to plain x-sorting.
        if crosspage:
            boxes = self.sort_X_firstly(boxes, colwm / 2, False)
        else:
            boxes = self.sort_C_firstly(boxes, colwm / 2)
        boxes[0]["cn"] = 0
        cols = [[boxes[0]]]
        right = boxes[0]["x1"]
        for b in boxes[1:]:
            b["cn"] = len(cols) - 1
            lst_c = cols[-1]
            # New column when the detected column id increments on the same
            # page, or the box starts right of the running edge without a
            # matching column id.
            if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
                    "page_number"]) \
                    or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")):  # new col
                right = b["x1"]
                b["cn"] += 1
                cols.append([b])
                continue
            right = (right + b["x1"]) / 2.
            cols[-1].append(b)

        tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
        for b in boxes:
            tbl[b["rn"]][b["cn"]].append(b)

        if len(rows) >= 4:
            # Remove columns holding a single value: merge that lone cell
            # into the nearer non-empty neighbor column.
            j = 0
            while j < len(tbl[0]):
                e, ii = 0, 0
                for i in range(len(tbl)):
                    if tbl[i][j]:
                        e += 1
                        ii = i
                    if e > 1:
                        break
                if e > 1:
                    j += 1
                    continue
                # f / ff: the left / right neighbor cell on the same row is
                # occupied (or there is no such neighbor).
                f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
                     [j - 1][0].get("text")) or j == 0
                ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
                      [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
                if f and ff:
                    j += 1
                    continue
                bx = tbl[ii][j][0]
                logging.debug("Relocate column single: " + bx["text"])
                # Horizontal gap to the nearest box in each neighbor column.
                left, right = 100000, 100000
                if j > 0 and not f:
                    for i in range(len(tbl)):
                        if tbl[i][j - 1]:
                            left = min(left, np.min(
                                [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
                if j + 1 < len(tbl[0]) and not ff:
                    for i in range(len(tbl)):
                        if tbl[i][j + 1]:
                            right = min(right, np.min(
                                [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
                assert left < 100000 or right < 100000
                if left < right:
                    # Merge leftwards and delete column j.
                    for jj in range(j, len(tbl[0])):
                        for i in range(len(tbl)):
                            for a in tbl[i][jj]:
                                a["cn"] -= 1
                    if tbl[ii][j - 1]:
                        tbl[ii][j - 1].extend(tbl[ii][j])
                    else:
                        tbl[ii][j - 1] = tbl[ii][j]
                    for i in range(len(tbl)):
                        tbl[i].pop(j)
                else:
                    # Merge rightwards and delete column j.
                    for jj in range(j + 1, len(tbl[0])):
                        for i in range(len(tbl)):
                            for a in tbl[i][jj]:
                                a["cn"] -= 1
                    if tbl[ii][j + 1]:
                        tbl[ii][j + 1].extend(tbl[ii][j])
                    else:
                        tbl[ii][j + 1] = tbl[ii][j]
                    for i in range(len(tbl)):
                        tbl[i].pop(j)
                cols.pop(j)
        assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (
            len(cols), len(tbl[0]))

        if len(cols) >= 4:
            # Remove rows holding a single value: merge into the vertically
            # nearer non-empty neighbor row (mirror of the column pass).
            i = 0
            while i < len(tbl):
                e, jj = 0, 0
                for j in range(len(tbl[i])):
                    if tbl[i][j]:
                        e += 1
                        jj = j
                    if e > 1:
                        break
                if e > 1:
                    i += 1
                    continue
                f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
                     [jj][0].get("text")) or i == 0
                ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
                      [jj][0].get("text")) or i + 1 >= len(tbl)
                if f and ff:
                    i += 1
                    continue

                bx = tbl[i][jj][0]
                logging.debug("Relocate row single: " + bx["text"])
                # Vertical gap to the nearest box in each neighbor row.
                up, down = 100000, 100000
                if i > 0 and not f:
                    for j in range(len(tbl[i - 1])):
                        if tbl[i - 1][j]:
                            up = min(up, np.min(
                                [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
                if i + 1 < len(tbl) and not ff:
                    for j in range(len(tbl[i + 1])):
                        if tbl[i + 1][j]:
                            down = min(down, np.min(
                                [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
                assert up < 100000 or down < 100000
                if up < down:
                    for ii in range(i, len(tbl)):
                        for j in range(len(tbl[ii])):
                            for a in tbl[ii][j]:
                                a["rn"] -= 1
                    if tbl[i - 1][jj]:
                        tbl[i - 1][jj].extend(tbl[i][jj])
                    else:
                        tbl[i - 1][jj] = tbl[i][jj]
                    tbl.pop(i)
                else:
                    for ii in range(i + 1, len(tbl)):
                        for j in range(len(tbl[ii])):
                            for a in tbl[ii][j]:
                                a["rn"] -= 1
                    if tbl[i + 1][jj]:
                        tbl[i + 1][jj].extend(tbl[i][jj])
                    else:
                        tbl[i + 1][jj] = tbl[i][jj]
                    tbl.pop(i)
                rows.pop(i)

        # Header detection: a row is a header when more than half of its
        # occupied cells are header-flagged ("H") or break the dominant
        # numeric pattern.
        hdset = set([])
        for i in range(len(tbl)):
            cnt, h = 0, 0
            for j, arr in enumerate(tbl[i]):
                if not arr:
                    continue
                cnt += 1
                if max_type == "Nu" and arr[0]["btype"] == "Nu":
                    continue
                if any([a.get("H") for a in arr]) \
                        or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                    h += 1
            # NOTE(review): if a row ends up with no occupied cells, cnt is 0
            # and this divides by zero — presumably rows always retain at
            # least one box after the merges above; confirm.
            if h / cnt > 0.5:
                hdset.add(i)

        if html:
            return [self.__html_table(cap, hdset,
                                      self.__cal_spans(boxes, rows,
                                                       cols, tbl, True)
                                      )]

        return self.__desc_table(cap, hdset,
                                 self.__cal_spans(boxes, rows, cols, tbl, False))
    def __html_table(self, cap, hdset, tbl):
        """Render the resolved table grid as an HTML string.

        Args:
            cap: caption text ("" for none).
            hdset: set of header row indices; header-row cell texts are
                added to it below so a repeated header row (e.g. a table
                continued on the next page) is emitted only once.
            tbl: grid from __cal_spans — None cells are span shadows,
                empty lists are blank cells.
        """
        html = "<table>"
        if cap:
            html += f"<caption>{cap}</caption>"
        for i in range(len(tbl)):
            row = "<tr>"
            txts = []
            for j, arr in enumerate(tbl[i]):
                if arr is None:
                    # Cell swallowed by a row/col span: emit nothing.
                    continue
                if not arr:
                    row += "<td></td>" if i not in hdset else "<th></th>"
                    continue
                txt = ""
                if arr:
                    # Order the fragments of the cell top-to-bottom before
                    # concatenating; threshold is half the smallest line
                    # height (capped by the page's mean height).
                    h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2,
                            self.mean_height[arr[0]["page_number"] - 1] / 2)
                    txt = "".join([c["text"]
                                   for c in self.sort_Y_firstly(arr, h)])
                txts.append(txt)
                sp = ""
                if arr[0].get("colspan"):
                    sp = "colspan={}".format(arr[0]["colspan"])
                if arr[0].get("rowspan"):
                    sp += " rowspan={}".format(arr[0]["rowspan"])
                if i in hdset:
                    row += f"<th {sp} >" + txt + "</th>"
                else:
                    row += f"<td {sp} >" + txt + "</td>"

            if i in hdset:
                # NOTE(review): hdset mixes row indices (ints) and header
                # texts (strs) on purpose — a later header row whose texts
                # were all seen before is dropped as a duplicate.
                if all([t in hdset for t in txts]):
                    continue
                for t in txts:
                    hdset.add(t)

            if row != "<tr>":
                row += "</tr>"
            else:
                row = ""
            html += "\n" + row
        html += "\n</table>"
        return html
    def __desc_table(self, cap, hdr_rowno, tbl):
        """Flatten the table into descriptive text rows "header:value; ...".

        Args:
            cap: caption text, appended to every row as its source.
            hdr_rowno: set of header row indices (mutated: rows whose
                header text is entirely empty are dropped).
            tbl: grid from __cal_spans(html=False).

        Returns:
            list of strings, one (or a merged few) per data row.
        """
        # Collect the header text of every header row, inheriting cells
        # from the previous header row when a cell is empty.
        clmno = len(tbl[0])
        rowno = len(tbl)
        headers = {}
        hdrset = set()
        lst_hdr = []
        for r in sorted(list(hdr_rowno)):
            headers[r] = ["" for _ in range(clmno)]
            for i in range(clmno):
                if not tbl[r][i]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[r][i]])
                headers[r][i] = txt
                hdrset.add(txt)
            if all([not t for t in headers[r]]):
                del headers[r]
                hdr_rowno.remove(r)
                continue
            for j in range(clmno):
                if headers[r][j]:
                    continue
                if j >= len(lst_hdr):
                    break
                headers[r][j] = lst_hdr[j]
            lst_hdr = headers[r]
        # Cascade consecutive header rows: concatenate the upper header
        # into the lower one ("的" joins the two parts) so the last header
        # row of a run carries the full path.
        for i in range(rowno):
            if i not in hdr_rowno:
                continue
            for j in range(i + 1, rowno):
                if j not in hdr_rowno:
                    break
                for k in range(clmno):
                    if not headers[j - 1][k]:
                        continue
                    if headers[j][k].find(headers[j - 1][k]) >= 0:
                        continue
                    if len(headers[j][k]) > len(headers[j - 1][k]):
                        headers[j][k] += ("的" if headers[j][k]
                                          else "") + headers[j - 1][k]
                    else:
                        headers[j][k] = headers[j - 1][k] \
                            + ("的" if headers[j - 1][k] else "") \
                            + headers[j][k]

        logging.debug(
            f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")

        row_txt = []
        for i in range(rowno):
            if i in hdr_rowno:
                continue
            rtxt = []

            def append(delimer):
                # Join the collected cell texts; short rows are merged into
                # the previous output row to avoid tiny fragments.
                nonlocal rtxt, row_txt
                rtxt = delimer.join(rtxt)
                if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
                    row_txt[-1] += "\n" + rtxt
                else:
                    row_txt.append(rtxt)

            # r: index of the closest header row above this data row.
            r = 0
            if len(headers.items()):
                _arr = [(i - r, r) for r, _ in headers.items() if r < i]
                if _arr:
                    _, r = min(_arr, key=lambda x: x[0])

            if r not in headers and clmno <= 2:
                # Headerless narrow table: emit "cell:cell" pairs.
                for j in range(clmno):
                    if not tbl[i][j]:
                        continue
                    txt = "".join([a["text"].strip() for a in tbl[i][j]])
                    if txt:
                        rtxt.append(txt)
                if rtxt:
                    append(":")
                continue

            for j in range(clmno):
                if not tbl[i][j]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[i][j]])
                if not txt:
                    continue
                ctt = headers[r][j] if r in headers else ""
                if ctt:
                    ctt += ":"
                ctt += txt
                if ctt:
                    rtxt.append(ctt)
            if rtxt:
                row_txt.append("; ".join(rtxt))
        if cap:
            row_txt = [t + f"\t——来自“{cap}”" for t in row_txt]
        return row_txt
  1136. @staticmethod
  1137. def is_caption(bx):
  1138. patt = [
  1139. r"[图表]+[ 0-9::]{2,}"
  1140. ]
  1141. if any([re.match(p, bx["text"].strip()) for p in patt]) \
  1142. or bx["layout_type"].find("caption") >= 0:
  1143. return True
  1144. return False
  1145. def __extract_table_figure(self, need_image, ZM, return_html):
  1146. tables = {}
  1147. figures = {}
  1148. # extract figure and table boxes
  1149. i = 0
  1150. lst_lout_no = ""
  1151. nomerge_lout_no = []
  1152. while i < len(self.boxes):
  1153. if "layoutno" not in self.boxes[i]:
  1154. i += 1
  1155. continue
  1156. lout_no = str(self.boxes[i]["page_number"]) + \
  1157. "-" + str(self.boxes[i]["layoutno"])
  1158. if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
  1159. "figure caption", "reference"]:
  1160. nomerge_lout_no.append(lst_lout_no)
  1161. if self.boxes[i]["layout_type"] == "table":
  1162. if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
  1163. self.boxes.pop(i)
  1164. continue
  1165. if lout_no not in tables:
  1166. tables[lout_no] = []
  1167. tables[lout_no].append(self.boxes[i])
  1168. self.boxes.pop(i)
  1169. lst_lout_no = lout_no
  1170. continue
  1171. if need_image and self.boxes[i]["layout_type"] == "figure":
  1172. if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
  1173. self.boxes.pop(i)
  1174. continue
  1175. if lout_no not in figures:
  1176. figures[lout_no] = []
  1177. figures[lout_no].append(self.boxes[i])
  1178. self.boxes.pop(i)
  1179. lst_lout_no = lout_no
  1180. continue
  1181. i += 1
  1182. # merge table on different pages
  1183. nomerge_lout_no = set(nomerge_lout_no)
  1184. tbls = sorted([(k, bxs) for k, bxs in tables.items()],
  1185. key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
  1186. i = len(tbls) - 1
  1187. while i - 1 >= 0:
  1188. k0, bxs0 = tbls[i - 1]
  1189. k, bxs = tbls[i]
  1190. i -= 1
  1191. if k0 in nomerge_lout_no:
  1192. continue
  1193. if bxs[0]["page_number"] == bxs0[0]["page_number"]:
  1194. continue
  1195. if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
  1196. continue
  1197. mh = self.mean_height[bxs[0]["page_number"] - 1]
  1198. if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
  1199. continue
  1200. tables[k0].extend(tables[k])
  1201. del tables[k]
  1202. def x_overlapped(a, b):
  1203. return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])
  1204. # find captions and pop out
  1205. i = 0
  1206. while i < len(self.boxes):
  1207. c = self.boxes[i]
  1208. # mh = self.mean_height[c["page_number"]-1]
  1209. if not self.is_caption(c):
  1210. i += 1
  1211. continue
  1212. # find the nearest layouts
  1213. def nearest(tbls):
  1214. nonlocal c
  1215. mink = ""
  1216. minv = 1000000000
  1217. for k, bxs in tbls.items():
  1218. for b in bxs[:10]:
  1219. if b.get("layout_type", "").find("caption") >= 0:
  1220. continue
  1221. y_dis = self._y_dis(c, b)
  1222. x_dis = self._x_dis(
  1223. c, b) if not x_overlapped(
  1224. c, b) else 0
  1225. dis = y_dis * y_dis + x_dis * x_dis
  1226. if dis < minv:
  1227. mink = k
  1228. minv = dis
  1229. return mink, minv
  1230. tk, tv = nearest(tables)
  1231. fk, fv = nearest(figures)
  1232. if min(tv, fv) > 2000:
  1233. i += 1
  1234. continue
  1235. if tv < fv:
  1236. tables[tk].insert(0, c)
  1237. logging.debug(
  1238. "TABLE:" +
  1239. self.boxes[i]["text"] +
  1240. "; Cap: " +
  1241. tk)
  1242. else:
  1243. figures[fk].insert(0, c)
  1244. logging.debug(
  1245. "FIGURE:" +
  1246. self.boxes[i]["text"] +
  1247. "; Cap: " +
  1248. tk)
  1249. self.boxes.pop(i)
  1250. res = []
  1251. def cropout(bxs, ltype):
  1252. nonlocal ZM
  1253. pn = set([b["page_number"] - 1 for b in bxs])
  1254. if len(pn) < 2:
  1255. pn = list(pn)[0]
  1256. ht = self.page_cum_height[pn]
  1257. b = {
  1258. "x0": np.min([b["x0"] for b in bxs]),
  1259. "top": np.min([b["top"] for b in bxs]) - ht,
  1260. "x1": np.max([b["x1"] for b in bxs]),
  1261. "bottom": np.max([b["bottom"] for b in bxs]) - ht
  1262. }
  1263. louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
  1264. ii = self.__find_overlapped(b, louts, naive=True)
  1265. if ii is not None:
  1266. b = louts[ii]
  1267. else:
  1268. logging.warn(
  1269. f"Missing layout match: {pn + 1},%s" %
  1270. (bxs[0].get(
  1271. "layoutno", "")))
  1272. left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
  1273. return self.page_images[pn] \
  1274. .crop((left * ZM, top * ZM,
  1275. right * ZM, bott * ZM))
  1276. pn = {}
  1277. for b in bxs:
  1278. p = b["page_number"] - 1
  1279. if p not in pn:
  1280. pn[p] = []
  1281. pn[p].append(b)
  1282. pn = sorted(pn.items(), key=lambda x: x[0])
  1283. imgs = [cropout(arr, ltype) for p, arr in pn]
  1284. pic = Image.new("RGB",
  1285. (int(np.max([i.size[0] for i in imgs])),
  1286. int(np.sum([m.size[1] for m in imgs]))),
  1287. (245, 245, 245))
  1288. height = 0
  1289. for img in imgs:
  1290. pic.paste(img, (0, int(height)))
  1291. height += img.size[1]
  1292. return pic
  1293. # crop figure out and add caption
  1294. for k, bxs in figures.items():
  1295. txt = "\n".join(
  1296. [b["text"] for b in bxs
  1297. if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
  1298. and len(b["text"].strip()) >= 4
  1299. ]
  1300. )
  1301. if not txt:
  1302. continue
  1303. res.append(
  1304. (cropout(
  1305. bxs,
  1306. "figure"),
  1307. [txt] if not return_html else [f"<p>{txt}</p>"]))
  1308. for k, bxs in tables.items():
  1309. if not bxs:
  1310. continue
  1311. res.append((cropout(bxs, "table"),
  1312. self.__construct_table(bxs, html=return_html)))
  1313. return res
  1314. def proj_match(self, line):
  1315. if len(line) <= 2:
  1316. return
  1317. if re.match(r"[0-9 ().,%%+/-]+$", line):
  1318. return False
  1319. for p, j in [
  1320. (r"第[零一二三四五六七八九十百]+章", 1),
  1321. (r"第[零一二三四五六七八九十百]+[条节]", 2),
  1322. (r"[零一二三四五六七八九十百]+[、  ]", 3),
  1323. (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
  1324. (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
  1325. (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
  1326. (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
  1327. (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
  1328. (r".{,48}[::??]$", 9),
  1329. (r"[0-9]+)", 10),
  1330. (r"[\((][0-9]+[)\)]", 11),
  1331. (r"[零一二三四五六七八九十百]+是", 12),
  1332. (r"[⚫•➢✓]", 12)
  1333. ]:
  1334. if re.match(p, line):
  1335. return j
  1336. return
  1337. def _line_tag(self, bx, ZM):
  1338. pn = [bx["page_number"]]
  1339. top = bx["top"] - self.page_cum_height[pn[0] - 1]
  1340. bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
  1341. while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
  1342. bott -= self.page_images[pn[-1] - 1].size[1] / ZM
  1343. pn.append(pn[-1] + 1)
  1344. return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
  1345. .format("-".join([str(p) for p in pn]),
  1346. bx["x0"], bx["x1"], top, bott)
    def __filterout_scraps(self, boxes, ZM):
        """Drop junk boxes and join the survivors into the final text.

        Greedily chains each useful box with nearby boxes on the same page
        (dfs), keeps chains that look like real paragraphs/titles, and
        returns them joined by blank lines, each line carrying its
        position tag from _line_tag.
        """
        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            # A box is worth keeping when it belongs to a detected layout,
            # is wider than a third of the page, or taller than the page's
            # mean character height.
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            # mj: the chain starts at a heading-like line or a title layout.
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                # Recursively absorb the next close-by useful box on the
                # same page into the current chain (mutates `boxes`).
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    # A large vertical gap after a short line ends the chain
                    # (unless the line is a heading).
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break

                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception as e:
                # NOTE(review): broad swallow — presumably guards against
                # RecursionError from deep dfs chains; a crash here would
                # lose the whole document, but real errors are hidden too.
                pass
            boxes.pop(0)
            # Keep the chain when it starts like a heading or is wide
            # enough to be body text.
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)
    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
        """Load the PDF and render per-page images and character boxes.

        Tries pdfplumber first (gives characters for text PDFs); on any
        failure falls back to PyMuPDF (fitz), which yields images only —
        those pages rely entirely on OCR.

        Args:
            fnm: file path (str) or raw PDF bytes.
            zoomin: render scale (72*zoomin dpi).
            page_from, page_to: page slice to process.
        """
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            # Keep only visible (colored) characters.
            self.page_chars = [[c for c in self.pdf.pages[i].chars if self._has_color(c)] for i in
                               range(len(self.page_images))]
            self.total_page = len(self.pdf.pages)
        except Exception as e:
            # Fallback renderer: no character information available.
            self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
            self.page_images = []
            self.page_chars = []
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for page in self.pdf[page_from:page_to]:
                pix = page.getPixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
                self.page_images.append(img)
                self.page_chars.append([])

        logging.info("Images converted.")
        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i]
            # Per-page typical glyph size; defaults (0 / 8) apply to
            # image-only pages with no extracted characters.
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            # Cumulative page heights let later code address boxes in a
            # single document-long coordinate system.
            self.page_cum_height.append(img.size[1] / zoomin)
            self.__ocr_paddle(i + 1, img, chars, zoomin)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
  1456. def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
  1457. self.__images__(fnm, zoomin)
  1458. self._layouts_paddle(zoomin)
  1459. self._table_transformer_job(zoomin)
  1460. self._text_merge()
  1461. self._concat_downward()
  1462. self.__filter_forpages()
  1463. tbls = self.__extract_table_figure(need_image, zoomin, return_html)
  1464. return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
  1465. def remove_tag(self, txt):
  1466. return re.sub(r"@@[\t0-9.-]+?##", "", txt)
    def crop(self, text, ZM=3):
        """Crop the page regions referenced by the position tags in *text*.

        Parses every "@@pages\\tx0\\tx1\\ttop\\tbottom##" tag produced by
        _line_tag, crops the corresponding strip from each page image
        (multi-page tags yield one strip per page), and stacks all strips
        vertically into one image.

        Returns:
            A PIL image, or None when *text* carries no tags.
        """
        imgs = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            # Work in pixel space for the vertical extent.
            bottom *= ZM
            pns = [int(p) - 1 for p in pn.split("-")]
            # Restore the absolute bottom across the pages the tag spans.
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right *
                                               ZM, min(
                                                   bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            bottom -= self.page_images[pns[0]].size[1]
            # Continuation pages are cropped from their top edge down to
            # whatever of the region remains.
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            return
        # Stack all crops vertically on a light-gray canvas, GAP pixels
        # apart.
        GAP = 2
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        pic = Image.new("RGB",
                        (int(np.max([i.size[0] for i in imgs])), height),
                        (245, 245, 245))
        height = 0
        for img in imgs:
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP
        return pic
if __name__ == "__main__":
    # This module is meant to be imported; no standalone behavior.
    pass