You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

pdf_parser.py 70KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import random
  4. from functools import partial
  5. import fitz
  6. import requests
  7. import xgboost as xgb
  8. from io import BytesIO
  9. import torch
  10. import re
  11. import pdfplumber
  12. import logging
  13. from PIL import Image
  14. import numpy as np
  15. from api.db import ParserType
  16. from rag.nlp import huqie
  17. from collections import Counter
  18. from copy import deepcopy
  19. from huggingface_hub import hf_hub_download
  20. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  21. class HuParser:
    def __init__(self):
        """Initialize the OCR engine, the remote layout/table detectors and
        the up/down text-concatenation XGBoost model."""
        from paddleocr import PaddleOCR  # local import: heavy optional dependency
        logging.getLogger("ppocr").setLevel(logging.ERROR)
        self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")
        # Subclasses may preset model_speciess before calling this __init__.
        if not hasattr(self, "model_speciess"):
            self.model_speciess = ParserType.GENERAL.value
        # Remote detectors: layout detection keyed by parser species, plus
        # table-component detection.
        self.layouter = partial(self.__remote_call, self.model_speciess)
        self.tbl_det = partial(self.__remote_call, "table_component")
        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                       filename="updown_concat_xgb.model"))
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
    def __remote_call(self, species, images, thr=0.7):
        """POST *images* to the INFINIFLOW layout/table detection service.

        species: model name appended to the endpoint path.
        images: iterable of PIL images; serialized to PNG for upload.
        thr: detection score threshold forwarded to the server.
        Returns the server's "data" payload.
        Raises EnvironmentError when the server/token env vars are unset and
        RuntimeError when the server reports a non-zero retcode.
        """
        url = os.environ.get("INFINIFLOW_SERVER")
        if not url:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_SERVER'")
        token = os.environ.get("INFINIFLOW_TOKEN")
        if not token:raise EnvironmentError("Please set environment variable: 'INFINIFLOW_TOKEN'")
        def convert_image_to_bytes(PILimage):
            # Serialize one PIL image to PNG bytes for the multipart upload.
            image = BytesIO()
            PILimage.save(image, format='png')
            image.seek(0)
            return image.getvalue()
        images = [convert_image_to_bytes(img) for img in images]
        def remote_call():
            nonlocal images, thr
            # NOTE(review): "threashold" (sic) is the field name the server
            # expects — do not "fix" the spelling here.
            res = requests.post(url+"/v1/layout/detect/"+species, files=[("image", img) for img in images], data={"threashold": thr},
                                headers={"Authorization": token}, timeout=len(images) * 10)
            res = res.json()
            if res["retcode"] != 0: raise RuntimeError(res["retmsg"])
            return res["data"]
        # Retry transport-level failures up to 3 times; server-side errors
        # (RuntimeError) are not retried.  A final unguarded attempt lets any
        # remaining failure propagate to the caller.
        for _ in range(3):
            try:
                return remote_call()
            except RuntimeError as e:
                raise e
            except Exception as e:
                logging.error("layout_predict:"+str(e))
        return remote_call()
  69. def __char_width(self, c):
  70. return (c["x1"] - c["x0"]) // len(c["text"])
  71. def __height(self, c):
  72. return c["bottom"] - c["top"]
  73. def _x_dis(self, a, b):
  74. return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
  75. abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
  76. def _y_dis(
  77. self, a, b):
  78. return (
  79. b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
  80. def _match_proj(self, b):
  81. proj_patt = [
  82. r"第[零一二三四五六七八九十百]+章",
  83. r"第[零一二三四五六七八九十百]+[条节]",
  84. r"[零一二三四五六七八九十百]+[、是  ]",
  85. r"[\((][零一二三四五六七八九十百]+[)\)]",
  86. r"[\((][0-9]+[)\)]",
  87. r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
  88. r"[0-9]+\.[0-9.]+(、|\.[  ])",
  89. r"[⚫•➢①② ]",
  90. ]
  91. return any([re.match(p, b["text"]) for p in proj_patt])
    def _updown_concat_features(self, up, down):
        """Build the feature vector fed to the up/down concatenation model.

        *up*/*down* are two text boxes (up above down).  Returns a fixed-length
        list of boolean/numeric features describing whether the two boxes
        should be merged into one paragraph.  The length and order of this
        list must match what the trained XGBoost model expects — do not
        reorder or remove entries.
        """
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6  # number of leading/trailing characters used for token features
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        # Tokenize the junction (tail of up + head of down); keep a space
        # between alphanumeric runs so they do not fuse into one token.
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),  # same table row
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            # up ends like a complete sentence
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            # up ends with a "continuation" character (comma, quote, digit…)
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            # down starts with a closing/continuation character
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # NOTE(review): duplicate of the previous feature — kept as-is
            # because the trained model expects this exact vector length.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # up opens a bracket that down closes
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
                                                                        ) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            # single noun-ish token at the junction (huqie POS tag contains "n")
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
  148. @staticmethod
  149. def sort_Y_firstly(arr, threashold):
  150. # sort using y1 first and then x1
  151. arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
  152. for i in range(len(arr) - 1):
  153. for j in range(i, -1, -1):
  154. # restore the order using th
  155. if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
  156. and arr[j + 1]["x0"] < arr[j]["x0"]:
  157. tmp = deepcopy(arr[j])
  158. arr[j] = deepcopy(arr[j + 1])
  159. arr[j + 1] = deepcopy(tmp)
  160. return arr
  161. @staticmethod
  162. def sort_X_by_page(arr, threashold):
  163. # sort using y1 first and then x1
  164. arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
  165. for i in range(len(arr) - 1):
  166. for j in range(i, -1, -1):
  167. # restore the order using th
  168. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  169. and arr[j + 1]["top"] < arr[j]["top"]\
  170. and arr[j + 1]["page_number"] == arr[j]["page_number"]:
  171. tmp = arr[j]
  172. arr[j] = arr[j + 1]
  173. arr[j + 1] = tmp
  174. return arr
  175. @staticmethod
  176. def sort_R_firstly(arr, thr=0):
  177. # sort using y1 first and then x1
  178. # sorted(arr, key=lambda r: (r["top"], r["x0"]))
  179. arr = HuParser.sort_Y_firstly(arr, thr)
  180. for i in range(len(arr) - 1):
  181. for j in range(i, -1, -1):
  182. if "R" not in arr[j] or "R" not in arr[j + 1]:
  183. continue
  184. if arr[j + 1]["R"] < arr[j]["R"] \
  185. or (
  186. arr[j + 1]["R"] == arr[j]["R"]
  187. and arr[j + 1]["x0"] < arr[j]["x0"]
  188. ):
  189. tmp = arr[j]
  190. arr[j] = arr[j + 1]
  191. arr[j + 1] = tmp
  192. return arr
  193. @staticmethod
  194. def sort_X_firstly(arr, threashold, copy=True):
  195. # sort using y1 first and then x1
  196. arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
  197. for i in range(len(arr) - 1):
  198. for j in range(i, -1, -1):
  199. # restore the order using th
  200. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  201. and arr[j + 1]["top"] < arr[j]["top"]:
  202. tmp = deepcopy(arr[j]) if copy else arr[j]
  203. arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
  204. arr[j + 1] = deepcopy(tmp) if copy else tmp
  205. return arr
  206. @staticmethod
  207. def sort_C_firstly(arr, thr=0):
  208. # sort using y1 first and then x1
  209. # sorted(arr, key=lambda r: (r["x0"], r["top"]))
  210. arr = HuParser.sort_X_firstly(arr, thr)
  211. for i in range(len(arr) - 1):
  212. for j in range(i, -1, -1):
  213. # restore the order using th
  214. if "C" not in arr[j] or "C" not in arr[j + 1]:
  215. continue
  216. if arr[j + 1]["C"] < arr[j]["C"] \
  217. or (
  218. arr[j + 1]["C"] == arr[j]["C"]
  219. and arr[j + 1]["top"] < arr[j]["top"]
  220. ):
  221. tmp = arr[j]
  222. arr[j] = arr[j + 1]
  223. arr[j + 1] = tmp
  224. return arr
  225. return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
  226. def _has_color(self, o):
  227. if o.get("ncs", "") == "DeviceGray":
  228. if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
  229. o["non_stroking_color"][0] == 1:
  230. if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
  231. return False
  232. return True
  233. def __overlapped_area(self, a, b, ratio=True):
  234. tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
  235. if b["x0"] > x1 or b["x1"] < x0:
  236. return 0
  237. if b["bottom"] < tp or b["top"] > btm:
  238. return 0
  239. x0_ = max(b["x0"], x0)
  240. x1_ = min(b["x1"], x1)
  241. assert x0_ <= x1_, "Fuckedup! T:{},B:{},X0:{},X1:{} ==> {}".format(
  242. tp, btm, x0, x1, b)
  243. tp_ = max(b["top"], tp)
  244. btm_ = min(b["bottom"], btm)
  245. assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
  246. tp, btm, x0, x1, b)
  247. ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
  248. x0 != 0 and btm - tp != 0 else 0
  249. if ov > 0 and ratio:
  250. ov /= (x1 - x0) * (btm - tp)
  251. return ov
    def __find_overlapped_with_threashold(self, box, boxes, thr=0.3):
        """Index in *boxes* of the candidate best overlapping *box*, or None.

        A candidate qualifies when its overlap ratio relative to *box* reaches
        *thr*; candidates are ranked by the pair (overlap w.r.t. box, overlap
        w.r.t. candidate) using lexicographic tuple comparison, so the second
        ratio only breaks ties.
        """
        if not boxes:
            return
        max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
        s, e = 0, len(boxes)
        for i in range(s, e):
            ov = self.__overlapped_area(box, boxes[i])
            _ov = self.__overlapped_area(boxes[i], box)
            # Tuple comparison: keep the candidate when (ov, _ov) >= current best.
            if (ov, _ov) < (max_overlaped, _max_overlaped):
                continue
            max_overlaped_i = i
            max_overlaped = ov
            _max_overlaped = _ov
        return max_overlaped_i
    def __find_overlapped(self, box, boxes_sorted_by_y, naive=False):
        """Index of the box in *boxes_sorted_by_y* overlapping *box* the most,
        or None when nothing overlaps.

        Binary-searches the Y-sorted list for any vertically intersecting box
        (skipped when *naive* is True), then performs a single-step window
        refinement on each side before scanning the window linearly.
        """
        if not boxes_sorted_by_y:
            return
        bxs = boxes_sorted_by_y
        s, e, ii = 0, len(bxs), 0
        # Binary search for some box that vertically intersects *box*.
        while s < e and not naive:
            ii = (e + s) // 2
            pv = bxs[ii]
            if box["bottom"] < pv["top"]:
                e = ii
                continue
            if box["top"] > pv["bottom"]:
                s = ii + 1
                continue
            break
        # One-step refinements only: the unconditional break makes each loop
        # shrink its bound by at most one element.
        while s < ii:
            if box["top"] > bxs[s]["bottom"]:
                s += 1
            break
        while e - 1 > ii:
            if box["bottom"] < bxs[e - 1]["top"]:
                e -= 1
            break
        # Linear scan of the remaining window for the largest overlap.
        max_overlaped_i, max_overlaped = None, 0
        for i in range(s, e):
            ov = self.__overlapped_area(bxs[i], box)
            if ov <= max_overlaped:
                continue
            max_overlaped_i = i
            max_overlaped = ov
        return max_overlaped_i
  297. def _is_garbage(self, b):
  298. patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
  299. r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
  300. "(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
  301. "\\(cid *: *[0-9]+ *\\)"
  302. ]
  303. return any([re.search(p, b["text"]) for p in patt])
    def __layouts_cleanup(self, boxes, layouts, far=2, thr=0.7):
        """Remove duplicated layout regions of the same type, in place.

        Scans *layouts* with a lookahead window of *far* elements; when two
        same-typed regions overlap by at least *thr* (in both directions), the
        lower-scoring one — or, lacking scores, the one covering less text-box
        area from *boxes* — is popped.  Returns the pruned *layouts* list.
        """
        def notOverlapped(a, b):
            # True when the two rectangles are disjoint on either axis.
            return any([a["x1"] < b["x0"],
                        a["x0"] > b["x1"],
                        a["bottom"] < b["top"],
                        a["top"] > b["bottom"]])
        i = 0
        while i + 1 < len(layouts):
            j = i + 1
            # Find the next nearby layout of the same type overlapping i.
            while j < min(i + far, len(layouts)) \
                    and (layouts[i].get("type", "") != layouts[j].get("type", "")
                         or notOverlapped(layouts[i], layouts[j])):
                j += 1
            if j >= min(i + far, len(layouts)):
                i += 1
                continue
            # Require strong mutual overlap before treating them as duplicates.
            if self.__overlapped_area(layouts[i], layouts[j]) < thr \
                    and self.__overlapped_area(layouts[j], layouts[i]) < thr:
                i += 1
                continue
            if layouts[i].get("score") and layouts[j].get("score"):
                # Both scored: keep the higher-scoring region.
                if layouts[i]["score"] > layouts[j]["score"]:
                    layouts.pop(j)
                else:
                    layouts.pop(i)
                continue
            # Unscored: keep the region covering more absolute text-box area.
            area_i, area_i_1 = 0, 0
            for b in boxes:
                if not notOverlapped(b, layouts[i]):
                    area_i += self.__overlapped_area(b, layouts[i], False)
                if not notOverlapped(b, layouts[j]):
                    area_i_1 += self.__overlapped_area(b, layouts[j], False)
            if area_i > area_i_1:
                layouts.pop(j)
            else:
                layouts.pop(i)
        return layouts
    def __table_paddle(self, images):
        """Run remote table-component detection on *images* and normalize the
        geometry: rows/headers are stretched to a shared left/right edge and
        columns to a shared top/bottom edge.  Returns one component list per
        table image.
        """
        tbls = self.tbl_det(images, thr=0.5)
        res = []
        # align left&right for rows, align top&bottom for columns
        for tbl in tbls:
            lts = [{"label": b["type"],
                    "score": b["score"],
                    "x0": b["bbox"][0], "x1": b["bbox"][2],
                    "top": b["bbox"][1], "bottom": b["bbox"][-1]
                    } for b in tbl]
            if not lts:
                continue
            # NOTE(review): find(...) > 0 assumes labels like "table row"
            # where the keyword never starts the string — confirm against the
            # detector's label set.
            left = [b["x0"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            right = [b["x1"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            if not left:
                continue
            # Shared edges: median when enough samples, else the extreme value.
            left = np.median(left) if len(left) > 4 else np.min(left)
            right = np.median(right) if len(right) > 4 else np.max(right)
            for b in lts:
                if b["label"].find("row") > 0 or b["label"].find("header") > 0:
                    if b["x0"] > left:
                        b["x0"] = left
                    if b["x1"] < right:
                        b["x1"] = right
            top = [b["top"] for b in lts if b["label"] == "table column"]
            bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
            if not top:
                res.append(lts)
                continue
            top = np.median(top) if len(top) > 4 else np.min(top)
            bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
            for b in lts:
                if b["label"] == "table column":
                    if b["top"] > top:
                        b["top"] = top
                    if b["bottom"] < bottom:
                        b["bottom"] = bottom
            res.append(lts)
        return res
    def _table_transformer_job(self, ZM):
        """Detect table components (rows, headers, columns, spans) inside every
        "table" layout region and tag overlapping text boxes with R/H/C/SP keys.

        ZM: zoom factor of self.page_images relative to PDF coordinates.
        Populates self.tb_cpns and annotates self.boxes in place.
        """
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]  # tables per page; cumsum'ed below into slice offsets
        MARGIN = 10  # PDF-unit margin of context kept around each table crop
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                # Scale to image pixels before cropping.
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))
        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.__table_paddle(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    # Shift crop-local coordinates back to page pixels, undo
                    # the zoom, then apply the cumulative page Y offset.
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)
        def gather(kwd, fzy=10, ption=0.6):
            # Collect components whose label matches kwd and dedup overlaps.
            eles = self.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
            return self.sort_Y_firstly(eles, 0)
        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]
            ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii
            ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]
            ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                # NOTE(review): spanning cells reuse the H_* keys, overwriting
                # header geometry when a box matched both — confirm intended.
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr_paddle(self, pagenum, img, chars, ZM=3):
        """OCR one page image and merge pdfplumber *chars* into the OCR boxes.

        pagenum: 1-based page number.
        img: PIL page image rendered at zoom factor ZM.
        chars: pdfplumber character objects (PDF coordinates).
        Appends this page's box list to self.boxes; characters that fit no OCR
        box are collected into self.lefted_chars.
        """
        bxs = self.ocr.ocr(np.array(img), cls=True)[0]
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]  # (quad points, text)
        # Convert image pixels back to PDF units and drop malformed quads.
        bxs = self.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )
        # merge chars in the same rect
        for c in self.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = self.__find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            # Char height wildly different from the box -> not the same line.
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                # Keep spaces only after latin/numeric/punctuation characters.
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]
        for b in bxs:
            # Fall back to the raw OCR text when no PDF chars were merged in.
            if not b["text"]:
                b["text"] = b["txt"]
            del b["txt"]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
    def _layouts_paddle(self, ZM):
        """Run remote layout detection on every page, tag each text box with a
        layout type, drop garbage (recurring headers/footers/references), and
        flatten per-page boxes into self.boxes with cumulative Y offsets.

        ZM: zoom factor of page images relative to PDF coordinates.
        """
        assert len(self.page_images) == len(self.boxes)
        # Tag layout type
        boxes = []
        layouts = self.layouter(self.page_images)
        assert len(self.page_images) == len(layouts)
        for pn, lts in enumerate(layouts):
            bxs = self.boxes[pn]
            # NOTE(review): pn here is 0-based while OCR boxes carry 1-based
            # page numbers; figure pseudo-boxes appended below inherit the
            # 0-based value — confirm offset handling downstream.
            lts = [{"type": b["type"],
                    "score": float(b["score"]),
                    "x0": b["bbox"][0] / ZM, "x1": b["bbox"][2] / ZM,
                    "top": b["bbox"][1] / ZM, "bottom": b["bbox"][-1] / ZM,
                    "page_number": pn,
                    } for b in lts]
            lts = self.sort_Y_firstly(lts, self.mean_height[pn] / 2)
            lts = self.__layouts_cleanup(bxs, lts)
            self.page_layout.append(lts)
            # Tag layout type, layouts are ready
            def findLayout(ty):
                # Assign type ty to every still-untagged box overlapping a
                # region of that type; header/footer/reference text is recorded
                # as candidate garbage instead.
                nonlocal bxs, lts
                lts_ = [lt for lt in lts if lt["type"] == ty]
                i = 0
                while i < len(bxs):
                    if bxs[i].get("layout_type"):
                        i += 1
                        continue
                    if self._is_garbage(bxs[i]):
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    ii = self.__find_overlapped_with_threashold(bxs[i], lts_,
                                                                thr=0.4)
                    if ii is None:  # belong to nothing
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue
                    lts_[ii]["visited"] = True
                    if lts_[ii]["type"] in ["footer", "header", "reference"]:
                        if lts_[ii]["type"] not in self.garbages:
                            self.garbages[lts_[ii]["type"]] = []
                        self.garbages[lts_[ii]["type"]].append(bxs[i]["text"])
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    bxs[i]["layoutno"] = f"{ty}-{ii}"
                    bxs[i]["layout_type"] = lts_[ii]["type"]
                    i += 1
            # Earlier passes take precedence: a box claimed by "footer" is
            # never re-tagged by a later type.
            for lt in ["footer", "header", "reference", "figure caption",
                       "table caption", "title", "text", "table", "figure"]:
                findLayout(lt)
            # add box to figure layouts which has not text box
            for i, lt in enumerate(
                    [lt for lt in lts if lt["type"] == "figure"]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                del lt["type"]
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)
            boxes.extend(bxs)
        self.boxes = boxes
        # Text seen as header/footer/reference on more than one page is
        # recurring page furniture: remove it everywhere.
        garbage = set()
        for k in self.garbages.keys():
            self.garbages[k] = Counter(self.garbages[k])
            for g, c in self.garbages[k].items():
                if c > 1:
                    garbage.add(g)
        logging.debug("GARBAGE:" + ",".join(garbage))
        self.boxes = [b for b in self.boxes if b["text"].strip() not in garbage]
        # cumlative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
  578. def _text_merge(self):
  579. # merge adjusted boxes
  580. bxs = self.boxes
  581. def end_with(b, txt):
  582. txt = txt.strip()
  583. tt = b.get("text", "").strip()
  584. return tt and tt.find(txt) == len(tt) - len(txt)
  585. def start_with(b, txts):
  586. tt = b.get("text", "").strip()
  587. return tt and any([tt.find(t.strip()) == 0 for t in txts])
  588. # horizontally merge adjacent box with the same layout
  589. i = 0
  590. while i < len(bxs) - 1:
  591. b = bxs[i]
  592. b_ = bxs[i + 1]
  593. if b.get("layoutno", "0") != b_.get("layoutno", "1"):
  594. i += 1
  595. continue
  596. dis_thr = 1
  597. dis = b["x1"] - b_["x0"]
  598. if b.get("layout_type", "") != "text" or b_.get(
  599. "layout_type", "") != "text":
  600. if end_with(b, ",") or start_with(b_, "(,"):
  601. dis_thr = -8
  602. else:
  603. i += 1
  604. continue
  605. if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
  606. and dis >= dis_thr and b["x1"] < b_["x1"]:
  607. # merge
  608. bxs[i]["x1"] = b_["x1"]
  609. bxs[i]["top"] = (b["top"] + b_["top"]) / 2
  610. bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
  611. bxs[i]["text"] += b_["text"]
  612. bxs.pop(i + 1)
  613. continue
  614. i += 1
  615. self.boxes = bxs
  616. def _naive_vertical_merge(self):
  617. bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
  618. i = 0
  619. while i + 1 < len(bxs):
  620. b = bxs[i]
  621. b_ = bxs[i + 1]
  622. if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
  623. bxs.pop(i)
  624. continue
  625. concatting_feats = [
  626. b["text"].strip()[-1] in ",;:'\",、‘“;:-",
  627. len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
  628. b["text"].strip()[0] in "。;?!?”)),,、:",
  629. ]
  630. # features for not concating
  631. feats = [
  632. b.get("layoutno", 0) != b.get("layoutno", 0),
  633. b["text"].strip()[-1] in "。?!?",
  634. self.is_english and b["text"].strip()[-1] in ".!?",
  635. b["page_number"] == b_["page_number"] and b_["top"] - \
  636. b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
  637. b["page_number"] < b_["page_number"] and abs(
  638. b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
  639. ]
  640. if any(feats) and not any(concatting_feats):
  641. i += 1
  642. continue
  643. # merge up and down
  644. b["bottom"] = b_["bottom"]
  645. b["text"] += b_["text"]
  646. b["x0"] = min(b["x0"], b_["x0"])
  647. b["x1"] = max(b["x1"], b_["x1"])
  648. bxs.pop(i + 1)
  649. self.boxes = bxs
    def _concat_downward(self, concat_between_pages=True):
        """Merge boxes vertically into paragraph blocks using the XGBoost
        up/down concatenation model.

        First annotates every box with "in_row" (how many of its neighbours
        share its visual row), then greedily chains boxes downward via DFS,
        and finally concatenates each chain into a single box.  Rebuilds
        self.boxes (re-sorted by Y).
        """
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    # boxes are Y-sorted; once clearly below, stop scanning
                    break
                j += 1
        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []
            def dfs(up, dp):
                # Chain *up* with the best continuation found at index >= dp,
                # consuming chained boxes from *boxes*.
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    # different table rows only chain across an explicit comma
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    # skip "12/345"-style page-number artifacts
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    # too far apart horizontally to be one paragraph
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    # nearby text boxes in the same layout region chain
                    # directly, bypassing the model
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    # otherwise ask the model whether to concatenate
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return
            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)
        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # keep a space between alphanumeric junctions
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)
        self.boxes = self.sort_Y_firstly(boxes, 0)
def _filter_forpages(self):
    """Drop table-of-contents / acknowledgement boxes from ``self.boxes``.

    First pass: find a box whose whole text is a ToC-like heading
    ("contents", "目录", …), then delete the following boxes that share the
    ToC entry prefix. If no ToC heading is found, fall back to removing
    whole pages dominated by dot-leader ("····") lines.
    """
    if not self.boxes:
        return
    findit = False
    i = 0
    while i < len(self.boxes):
        # Match the heading with all (half/full-width) spaces removed.
        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
            i += 1
            continue
        findit = True
        # An English-looking heading changes how the entry prefix is taken.
        eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
        self.boxes.pop(i)
        if i >= len(self.boxes):
            break
        # Prefix of the first ToC entry: first 3 chars (CJK) or first two
        # words (English).
        prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
        while not prefix:
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
        self.boxes.pop(i)
        if i >= len(self.boxes) or not prefix:
            break
        # Delete everything up to the next box starting with that prefix.
        # NOTE(review): `prefix` is raw text used as a regex here — chars
        # like '(' or '.' would be interpreted as metacharacters; confirm
        # whether re.escape was intended.
        for j in range(i, min(i + 128, len(self.boxes))):
            if not re.match(prefix, self.boxes[j]["text"]):
                continue
            for k in range(i, j):
                self.boxes.pop(i)
            break
    if findit:
        return

    # Fallback: count dot-leader lines per page; pages with more than 3
    # are considered ToC-like and dropped entirely.
    page_dirty = [0] * len(self.page_images)
    for b in self.boxes:
        if re.search(r"(··|··|··)", b["text"]):
            page_dirty[b["page_number"]-1] += 1
    page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
    if not page_dirty:
        return
    i = 0
    while i < len(self.boxes):
        if self.boxes[i]["page_number"] in page_dirty:
            self.boxes.pop(i)
            continue
        i += 1
  784. def _merge_with_same_bullet(self):
  785. i = 0
  786. while i + 1 < len(self.boxes):
  787. b = self.boxes[i]
  788. b_ = self.boxes[i + 1]
  789. if not b["text"].strip():
  790. self.boxes.pop(i)
  791. continue
  792. if not b_["text"].strip():
  793. self.boxes.pop(i+1)
  794. continue
  795. if b["text"].strip()[0] != b_["text"].strip()[0] \
  796. or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
  797. or huqie.is_chinese(b["text"].strip()[0]) \
  798. or b["top"] > b_["bottom"]:
  799. i += 1
  800. continue
  801. b_["text"] = b["text"] + "\n" + b_["text"]
  802. b_["x0"] = min(b["x0"], b_["x0"])
  803. b_["x1"] = max(b["x1"], b_["x1"])
  804. b_["top"] = b["top"]
  805. self.boxes.pop(i)
  806. def _blockType(self, b):
  807. patt = [
  808. ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  809. (r"^(20|19)[0-9]{2}年$", "Dt"),
  810. (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
  811. ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
  812. (r"^第*[一二三四1-4]季度$", "Dt"),
  813. (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
  814. (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
  815. ("^[0-9.,+%/ -]+$", "Nu"),
  816. (r"^[0-9A-Z/\._~-]+$", "Ca"),
  817. (r"^[A-Z]*[a-z' -]+$", "En"),
  818. (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
  819. (r"^.{1}$", "Sg")
  820. ]
  821. for p, n in patt:
  822. if re.search(p, b["text"].strip()):
  823. return n
  824. tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
  825. if len(tks) > 3:
  826. if len(tks) < 12:
  827. return "Tx"
  828. else:
  829. return "Lx"
  830. if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
  831. return "Nr"
  832. return "Ot"
def __cal_spans(self, boxes, rows, cols, tbl, html=True):
    """Compute row/col spans for cells marked "SP" and fold spanned cells.

    Uses the mean extents of each row/column cluster to decide which grid
    positions a spanning cell covers, then collapses the covered grid
    cells of ``tbl`` into the span's top-left cell.

    Args:
        boxes: all cell boxes (each with grid coords "rn"/"cn").
        rows, cols: row/column clusters of boxes.
        tbl: grid of lists of boxes, tbl[row][col].
        html: when True, cells swallowed by a span become None (so the
            HTML renderer can skip them); otherwise they alias the merged
            list.
    Returns:
        The mutated ``tbl``.
    """
    # Calculate mean left/right edge per column and top/bottom per row.
    clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
            for cln in cols]
    crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
            for cln in cols]
    rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
            for row in rows]
    rbtm = [np.mean([c.get("R_btm", c["bottom"])
                     for c in row]) for row in rows]
    for b in boxes:
        if "SP" not in b:
            continue
        b["colspan"] = [b["cn"]]
        b["rowspan"] = [b["rn"]]
        # Column span: column j is covered when its midpoint lies inside
        # the cell's horizontal extent [H_left, H_right].
        for j in range(0, len(clft)):
            if j == b["cn"]:
                continue
            if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
                continue
            if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
                continue
            b["colspan"].append(j)
        # Row span: same midpoint test on the vertical extent.
        for j in range(0, len(rtop)):
            if j == b["rn"]:
                continue
            if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
                continue
            if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
                continue
            b["rowspan"].append(j)

    def join(arr):
        # Concatenated text of a cell's boxes (used to detect duplicates).
        if not arr:
            return ""
        return "".join([t["text"] for t in arr])

    # Remove/merge the spanned cells.
    for i in range(len(tbl)):
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            if all(["rowspan" not in a and "colspan" not in a for a in arr]):
                continue
            # Union of all span index lists carried by boxes in this cell.
            rowspan, colspan = [], []
            for a in arr:
                if isinstance(a.get("rowspan", 0), list):
                    rowspan.extend(a["rowspan"])
                if isinstance(a.get("colspan", 0), list):
                    colspan.extend(a["colspan"])
            rowspan, colspan = set(rowspan), set(colspan)
            if len(rowspan) < 2 and len(colspan) < 2:
                # Degenerate span: strip the markers and move on.
                for a in arr:
                    if "rowspan" in a:
                        del a["rowspan"]
                    if "colspan" in a:
                        del a["colspan"]
                continue
            # Make spans contiguous ranges covering min..max index.
            rowspan, colspan = sorted(rowspan), sorted(colspan)
            rowspan = list(range(rowspan[0], rowspan[-1] + 1))
            colspan = list(range(colspan[0], colspan[-1] + 1))
            assert i in rowspan, rowspan
            assert j in colspan, colspan
            # Collect the distinct content of every covered cell.
            arr = []
            for r in rowspan:
                for c in colspan:
                    arr_txt = join(arr)
                    if tbl[r][c] and join(tbl[r][c]) != arr_txt:
                        arr.extend(tbl[r][c])
                    tbl[r][c] = None if html else arr
            # Rewrite span markers as plain integer counts for rendering.
            for a in arr:
                if len(rowspan) > 1:
                    a["rowspan"] = len(rowspan)
                elif "rowspan" in a:
                    del a["rowspan"]
                if len(colspan) > 1:
                    a["colspan"] = len(colspan)
                elif "colspan" in a:
                    del a["colspan"]
            # The merged content lives at the span's top-left position.
            tbl[rowspan[0]][colspan[0]] = arr
    return tbl
def __construct_table(self, boxes, html=False):
    """Turn a bag of table cell boxes into rendered table output.

    Pipeline: pull out caption boxes → classify cell content types →
    cluster boxes into rows and columns → build the tbl[row][col] grid →
    relocate columns/rows that contain a single stray value → detect
    header rows → render via __html_table (html=True) or __desc_table.

    Returns a list with one HTML string, or the list of per-row
    description strings.
    """
    # Collect caption text and remove caption boxes from the table body.
    cap = ""
    i = 0
    while i < len(boxes):
        if self.is_caption(boxes[i]):
            cap += boxes[i]["text"]
            boxes.pop(i)
            i -= 1
        i += 1

    if not boxes:
        return []
    for b in boxes:
        b["btype"] = self._blockType(b)
    # Dominant cell content type of the whole table.
    max_type = Counter([b["btype"] for b in boxes]).items()
    max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
    logging.debug("MAXTYPE: " + max_type)

    # --- Row clustering ---------------------------------------------------
    rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
    rowh = np.min(rowh) if rowh else 0
    # boxes = self.sort_Y_firstly(boxes, rowh/5)
    boxes = self.sort_R_firstly(boxes, rowh / 2)
    boxes[0]["rn"] = 0
    rows = [[boxes[0]]]
    btm = boxes[0]["bottom"]
    for b in boxes[1:]:
        b["rn"] = len(rows) - 1
        lst_r = rows[-1]
        # Start a new row on a row-label ("R") change, or when the box
        # starts below the running row bottom and neither side has a label.
        if lst_r[-1].get("R", "") != b.get("R", "") \
                or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
                    ):  # new row
            btm = b["bottom"]
            b["rn"] += 1
            rows.append([b])
            continue
        btm = (btm + b["bottom"]) / 2.
        rows[-1].append(b)

    # --- Column clustering ------------------------------------------------
    colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
    colwm = np.min(colwm) if colwm else 0
    crosspage = len(set([b["page_number"] for b in boxes])) > 1
    if crosspage:
        # Column detections ("C") are unreliable across pages; sort by X.
        boxes = self.sort_X_firstly(boxes, colwm / 2, False)
    else:
        boxes = self.sort_C_firstly(boxes, colwm / 2)
    boxes[0]["cn"] = 0
    cols = [[boxes[0]]]
    right = boxes[0]["x1"]
    for b in boxes[1:]:
        b["cn"] = len(cols) - 1
        lst_c = cols[-1]
        # New column on a +1 column-label step within the same page, or
        # when the box starts right of the running edge without a label tie.
        if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
            "page_number"]) \
                or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")):  # new col
            right = b["x1"]
            b["cn"] += 1
            cols.append([b])
            continue
        right = (right + b["x1"]) / 2.
        cols[-1].append(b)

    # Fill the grid: tbl[row][col] is a list of boxes.
    tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
    for b in boxes:
        tbl[b["rn"]][b["cn"]].append(b)

    if len(rows) >= 4:
        # Remove columns occupied by a single value: merge that value into
        # the horizontally nearest neighbouring column.
        j = 0
        while j < len(tbl[0]):
            # e = occupied cell count in column j; ii = its row index.
            e, ii = 0, 0
            for i in range(len(tbl)):
                if tbl[i][j]:
                    e += 1
                    ii = i
                if e > 1:
                    break
            if e > 1:
                j += 1
                continue
            # f/ff: left/right neighbour cell in the same row is non-empty
            # (or there is no such neighbour).
            f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
                 [j - 1][0].get("text")) or j == 0
            ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
                  [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
            if f and ff:
                j += 1
                continue
            bx = tbl[ii][j][0]
            logging.debug("Relocate column single: " + bx["text"])
            # Column j only has one value; find its closest empty neighbour.
            left, right = 100000, 100000
            if j > 0 and not f:
                for i in range(len(tbl)):
                    if tbl[i][j - 1]:
                        left = min(left, np.min(
                            [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
            if j + 1 < len(tbl[0]) and not ff:
                for i in range(len(tbl)):
                    if tbl[i][j + 1]:
                        right = min(right, np.min(
                            [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
            assert left < 100000 or right < 100000
            if left < right:
                # Merge leftward; shift col indices of everything at/after j.
                for jj in range(j, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j - 1]:
                    tbl[ii][j - 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j - 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)
            else:
                # Merge rightward.
                for jj in range(j + 1, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j + 1]:
                    tbl[ii][j + 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j + 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)
            cols.pop(j)
    assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (
        len(cols), len(tbl[0]))

    if len(cols) >= 4:
        # Remove rows occupied by a single value: merge that value into the
        # vertically nearest neighbouring row (mirror of the column pass).
        i = 0
        while i < len(tbl):
            e, jj = 0, 0
            for j in range(len(tbl[i])):
                if tbl[i][j]:
                    e += 1
                    jj = j
                if e > 1:
                    break
            if e > 1:
                i += 1
                continue
            f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
                 [jj][0].get("text")) or i == 0
            ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
                  [jj][0].get("text")) or i + 1 >= len(tbl)
            if f and ff:
                i += 1
                continue
            bx = tbl[i][jj][0]
            logging.debug("Relocate row single: " + bx["text"])
            # Row i only has one value; pick the closer vertical neighbour.
            up, down = 100000, 100000
            if i > 0 and not f:
                for j in range(len(tbl[i - 1])):
                    if tbl[i - 1][j]:
                        up = min(up, np.min(
                            [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
            if i + 1 < len(tbl) and not ff:
                for j in range(len(tbl[i + 1])):
                    if tbl[i + 1][j]:
                        down = min(down, np.min(
                            [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
            assert up < 100000 or down < 100000
            if up < down:
                for ii in range(i, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i - 1][jj]:
                    tbl[i - 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i - 1][jj] = tbl[i][jj]
                tbl.pop(i)
            else:
                for ii in range(i + 1, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i + 1][jj]:
                    tbl[i + 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i + 1][jj] = tbl[i][jj]
                tbl.pop(i)
            rows.pop(i)

    # Decide which rows are headers: a row is a header when more than half
    # of its occupied cells carry an "H" flag or deviate from the table's
    # dominant numeric type.
    hdset = set([])
    for i in range(len(tbl)):
        cnt, h = 0, 0
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            cnt += 1
            if max_type == "Nu" and arr[0]["btype"] == "Nu":
                continue
            if any([a.get("H") for a in arr]) \
                    or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                h += 1
        # NOTE(review): cnt can be 0 for a fully empty row, which would
        # raise ZeroDivisionError here — confirm empty rows cannot occur.
        if h / cnt > 0.5:
            hdset.add(i)
    if html:
        return [self.__html_table(cap, hdset,
                                  self.__cal_spans(boxes, rows,
                                                   cols, tbl, True)
                                  )]

    return self.__desc_table(cap, hdset,
                             self.__cal_spans(boxes, rows, cols, tbl, False))
def __html_table(self, cap, hdset, tbl):
    """Render the processed grid ``tbl`` as an HTML ``<table>`` string.

    Args:
        cap: caption text ("" for none).
        hdset: set of header row indices (mutated below — see note).
        tbl: grid from __cal_spans; a cell is None (swallowed by a span),
            an empty list, or a list of boxes.
    """
    # Construct HTML.
    html = "<table>"
    if cap:
        html += f"<caption>{cap}</caption>"
    for i in range(len(tbl)):
        row = "<tr>"
        txts = []
        for j, arr in enumerate(tbl[i]):
            if arr is None:
                # Cell consumed by a row/col span: emit nothing.
                continue
            if not arr:
                row += "<td></td>" if i not in hdset else "<th></th>"
                continue
            txt = ""
            if arr:
                # Threshold for Y-sorting the boxes inside the cell.
                h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2,
                        self.mean_height[arr[0]["page_number"] - 1] / 2)
                txt = "".join([c["text"]
                               for c in self.sort_Y_firstly(arr, h)])
            txts.append(txt)
            sp = ""
            if arr[0].get("colspan"):
                sp = "colspan={}".format(arr[0]["colspan"])
            if arr[0].get("rowspan"):
                sp += " rowspan={}".format(arr[0]["rowspan"])
            if i in hdset:
                row += f"<th {sp} >" + txt + "</th>"
            else:
                row += f"<td {sp} >" + txt + "</td>"

        if i in hdset:
            # NOTE(review): hdset starts as row indices (ints) but header
            # cell TEXTS (strings) are added below; a later header row whose
            # texts were all seen before is skipped (not emitted), which
            # deduplicates repeated header rows. Confirm this mixing of row
            # indices and strings in one set is intentional.
            if all([t in hdset for t in txts]):
                continue
            for t in txts:
                hdset.add(t)

        if row != "<tr>":
            row += "</tr>"
        else:
            row = ""
        html += "\n" + row
    html += "\n</table>"
    return html
def __desc_table(self, cap, hdr_rowno, tbl):
    """Render the grid as natural-language row descriptions.

    Every data cell is prefixed with the text of its column header
    ("header:value"); consecutive header rows are concatenated with a
    connective ("的" / " for "). Returns a list of row strings, each
    suffixed with the caption attribution when ``cap`` is non-empty.
    """
    # Get the text of every column in each header row.
    clmno = len(tbl[0])
    rowno = len(tbl)
    headers = {}
    hdrset = set()
    lst_hdr = []
    de = "的" if not self.is_english else " for "
    for r in sorted(list(hdr_rowno)):
        headers[r] = ["" for _ in range(clmno)]
        for i in range(clmno):
            if not tbl[r][i]:
                continue
            txt = "".join([a["text"].strip() for a in tbl[r][i]])
            headers[r][i] = txt
            hdrset.add(txt)
        if all([not t for t in headers[r]]):
            # Entirely empty header row: demote it to a data row.
            del headers[r]
            hdr_rowno.remove(r)
            continue
        # Fill gaps with the previous header row (spanned header cells).
        for j in range(clmno):
            if headers[r][j]:
                continue
            if j >= len(lst_hdr):
                break
            headers[r][j] = lst_hdr[j]
        lst_hdr = headers[r]
    # Chain consecutive header rows: each row inherits/concatenates the
    # text of the row above unless it already contains it.
    for i in range(rowno):
        if i not in hdr_rowno:
            continue
        for j in range(i + 1, rowno):
            if j not in hdr_rowno:
                break
            for k in range(clmno):
                if not headers[j - 1][k]:
                    continue
                if headers[j][k].find(headers[j - 1][k]) >= 0:
                    continue
                # Longer text keeps the leading position.
                if len(headers[j][k]) > len(headers[j - 1][k]):
                    headers[j][k] += (de if headers[j][k]
                                      else "") + headers[j - 1][k]
                else:
                    headers[j][k] = headers[j - 1][k] \
                        + (de if headers[j - 1][k] else "") \
                        + headers[j][k]

    logging.debug(
        f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")

    row_txt = []
    for i in range(rowno):
        if i in hdr_rowno:
            continue
        rtxt = []

        def append(delimer):
            # Join the collected cell texts; glue short rows onto the
            # previous output entry to avoid many tiny fragments.
            nonlocal rtxt, row_txt
            rtxt = delimer.join(rtxt)
            if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
                row_txt[-1] += "\n" + rtxt
            else:
                row_txt.append(rtxt)

        # r = index of the nearest header row above row i (0 if none).
        r = 0
        if len(headers.items()):
            _arr = [(i - r, r) for r, _ in headers.items() if r < i]
            if _arr:
                _, r = min(_arr, key=lambda x: x[0])

        if r not in headers and clmno <= 2:
            # Headerless narrow table: emit "cell:cell" pairs.
            for j in range(clmno):
                if not tbl[i][j]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[i][j]])
                if txt:
                    rtxt.append(txt)
            if rtxt:
                append(":")
            continue

        for j in range(clmno):
            if not tbl[i][j]:
                continue
            txt = "".join([a["text"].strip() for a in tbl[i][j]])
            if not txt:
                continue
            ctt = headers[r][j] if r in headers else ""
            if ctt:
                ctt += ":"
            ctt += txt
            if ctt:
                rtxt.append(ctt)
        if rtxt:
            row_txt.append("; ".join(rtxt))
    if cap:
        # Attach the caption as an attribution suffix to every row.
        if self.is_english:
            from_ = " in "
        else:
            from_ = "来自"
        row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
    return row_txt
  1251. @staticmethod
  1252. def is_caption(bx):
  1253. patt = [
  1254. r"[图表]+[ 0-9::]{2,}"
  1255. ]
  1256. if any([re.match(p, bx["text"].strip()) for p in patt]) \
  1257. or bx["layout_type"].find("caption") >= 0:
  1258. return True
  1259. return False
def _extract_table_figure(self, need_image, ZM, return_html):
    """Pull table/figure boxes out of ``self.boxes`` and render them.

    Groups boxes by layout number, merges tables continued on the next
    page, attaches nearby caption boxes, crops the page images, and
    returns a list of (PIL image, content) pairs where content is either
    caption text (figures) or the constructed table output.

    Args:
        need_image: also extract figure regions (tables always are).
        ZM: zoom factor used when cropping page images.
        return_html: render tables (and figure captions) as HTML.
    """
    tables = {}
    figures = {}
    # Extract figure and table boxes keyed by "page-layoutno".
    i = 0
    lst_lout_no = ""
    nomerge_lout_no = []
    while i < len(self.boxes):
        if "layoutno" not in self.boxes[i]:
            i += 1
            continue
        lout_no = str(self.boxes[i]["page_number"]) + \
            "-" + str(self.boxes[i]["layoutno"])
        # A caption/title/reference after a layout marks that layout as
        # finished — never merge it with a later page.
        if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                                              "figure caption", "reference"]:
            nomerge_lout_no.append(lst_lout_no)
        if self.boxes[i]["layout_type"] == "table":
            # Drop "data source:" footnote lines.
            if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                self.boxes.pop(i)
                continue
            if lout_no not in tables:
                tables[lout_no] = []
            tables[lout_no].append(self.boxes[i])
            self.boxes.pop(i)
            lst_lout_no = lout_no
            continue
        if need_image and self.boxes[i]["layout_type"] == "figure":
            if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                self.boxes.pop(i)
                continue
            if lout_no not in figures:
                figures[lout_no] = []
            figures[lout_no].append(self.boxes[i])
            self.boxes.pop(i)
            lst_lout_no = lout_no
            continue
        i += 1

    # Merge tables continued on the following page.
    nomerge_lout_no = set(nomerge_lout_no)
    tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                  key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

    i = len(tbls) - 1
    while i - 1 >= 0:
        k0, bxs0 = tbls[i - 1]
        k, bxs = tbls[i]
        i -= 1
        if k0 in nomerge_lout_no:
            continue
        if bxs[0]["page_number"] == bxs0[0]["page_number"]:
            continue
        if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
            continue
        mh = self.mean_height[bxs[0]["page_number"] - 1]
        # Only merge when vertically close enough (23 mean line heights).
        if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
            continue
        tables[k0].extend(tables[k])
        del tables[k]

    def x_overlapped(a, b):
        # True when the two boxes overlap horizontally.
        return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

    # Find caption boxes and attach each to its nearest table or figure.
    i = 0
    while i < len(self.boxes):
        c = self.boxes[i]
        # mh = self.mean_height[c["page_number"]-1]
        if not self.is_caption(c):
            i += 1
            continue

        # Find the nearest layout by squared Euclidean distance.
        def nearest(tbls):
            nonlocal c
            mink = ""
            minv = 1000000000
            for k, bxs in tbls.items():
                for b in bxs[:10]:
                    if b.get("layout_type", "").find("caption") >= 0:
                        continue
                    y_dis = self._y_dis(c, b)
                    x_dis = self._x_dis(
                        c, b) if not x_overlapped(
                        c, b) else 0
                    dis = y_dis * y_dis + x_dis * x_dis
                    if dis < minv:
                        mink = k
                        minv = dis
            return mink, minv

        tk, tv = nearest(tables)
        fk, fv = nearest(figures)
        if min(tv, fv) > 2000:
            # Too far from everything: leave the caption in the text flow.
            i += 1
            continue
        if tv < fv:
            tables[tk].insert(0, c)
            logging.debug(
                "TABLE:" +
                self.boxes[i]["text"] +
                "; Cap: " +
                tk)
        else:
            figures[fk].insert(0, c)
            # NOTE(review): this logs `tk` (nearest table key) instead of
            # `fk` — debug output only, but likely unintended.
            logging.debug(
                "FIGURE:" +
                self.boxes[i]["text"] +
                "; Cap: " +
                tk)
        self.boxes.pop(i)

    res = []

    def cropout(bxs, ltype):
        # Crop the region covered by `bxs` out of the page image(s);
        # multi-page regions are cropped per page and stacked vertically.
        nonlocal ZM
        pn = set([b["page_number"] - 1 for b in bxs])
        if len(pn) < 2:
            pn = list(pn)[0]
            ht = self.page_cum_height[pn]
            b = {
                "x0": np.min([b["x0"] for b in bxs]),
                "top": np.min([b["top"] for b in bxs]) - ht,
                "x1": np.max([b["x1"] for b in bxs]),
                "bottom": np.max([b["bottom"] for b in bxs]) - ht
            }
            # Prefer the detected layout rectangle when one overlaps.
            louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
            ii = self.__find_overlapped(b, louts, naive=True)
            if ii is not None:
                b = louts[ii]
            else:
                logging.warn(
                    f"Missing layout match: {pn + 1},%s" %
                    (bxs[0].get(
                        "layoutno", "")))

            left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
            return self.page_images[pn] \
                .crop((left * ZM, top * ZM,
                       right * ZM, bott * ZM))
        # Multi-page: bucket boxes per page, crop each, then paste the
        # crops top-to-bottom onto one canvas.
        pn = {}
        for b in bxs:
            p = b["page_number"] - 1
            if p not in pn:
                pn[p] = []
            pn[p].append(b)
        pn = sorted(pn.items(), key=lambda x: x[0])
        imgs = [cropout(arr, ltype) for p, arr in pn]
        pic = Image.new("RGB",
                        (int(np.max([i.size[0] for i in imgs])),
                         int(np.sum([m.size[1] for m in imgs]))),
                        (245, 245, 245))
        height = 0
        for img in imgs:
            pic.paste(img, (0, int(height)))
            height += img.size[1]
        return pic

    # Crop figures out and pair each with its caption text.
    for k, bxs in figures.items():
        txt = "\n".join(
            [b["text"] for b in bxs
             if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
             and len(b["text"].strip()) >= 4
             ]
        )
        if not txt:
            continue

        res.append(
            (cropout(
                bxs,
                "figure"),
             [txt] if not return_html else [f"<p>{txt}</p>"]))

    for k, bxs in tables.items():
        if not bxs:
            continue
        res.append((cropout(bxs, "table"),
                    self.__construct_table(bxs, html=return_html)))
    return res
  1429. def proj_match(self, line):
  1430. if len(line) <= 2:
  1431. return
  1432. if re.match(r"[0-9 ().,%%+/-]+$", line):
  1433. return False
  1434. for p, j in [
  1435. (r"第[零一二三四五六七八九十百]+章", 1),
  1436. (r"第[零一二三四五六七八九十百]+[条节]", 2),
  1437. (r"[零一二三四五六七八九十百]+[、  ]", 3),
  1438. (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
  1439. (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
  1440. (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
  1441. (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
  1442. (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
  1443. (r".{,48}[::??]$", 9),
  1444. (r"[0-9]+)", 10),
  1445. (r"[\((][0-9]+[)\)]", 11),
  1446. (r"[零一二三四五六七八九十百]+是", 12),
  1447. (r"[⚫•➢✓]", 12)
  1448. ]:
  1449. if re.match(p, line):
  1450. return j
  1451. return
  1452. def _line_tag(self, bx, ZM):
  1453. pn = [bx["page_number"]]
  1454. top = bx["top"] - self.page_cum_height[pn[0] - 1]
  1455. bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
  1456. while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
  1457. bott -= self.page_images[pn[-1] - 1].size[1] / ZM
  1458. pn.append(pn[-1] + 1)
  1459. return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
  1460. .format("-".join([str(p) for p in pn]),
  1461. bx["x0"], bx["x1"], top, bott)
def __filterout_scraps(self, boxes, ZM):
    """Assemble final text from boxes, discarding scrap fragments.

    Repeatedly starts from the first remaining box, chains visually
    related boxes with ``dfs``, and keeps a chain only if it looks like
    real content (heading match, or wide enough relative to the page).
    Each kept line is suffixed with a position tag from ``_line_tag``.

    Returns the concatenated text, chains separated by blank lines.
    """

    def width(b):
        return b["x1"] - b["x0"]

    def height(b):
        return b["bottom"] - b["top"]

    def usefull(b):
        # A box is worth keeping when it has a layout label, spans more
        # than a third of the page width, or is taller than the page's
        # mean char height.
        if b.get("layout_type"):
            return True
        if width(
                b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
            return True
        if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
            return True
        return False

    res = []
    while boxes:
        lines = []
        widths = []
        pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
        mh = self.mean_height[boxes[0]["page_number"] - 1]
        # Seed looks like a heading/title?
        mj = self.proj_match(
            boxes[0]["text"]) or boxes[0].get(
            "layout_type",
            "") == "title"

        def dfs(line, st):
            # Grow the current chain from `line`, scanning up to 20
            # following boxes on the same page.
            nonlocal mh, pw, lines, widths
            lines.append(line)
            widths.append(width(line))
            width_mean = np.mean(widths)
            mmj = self.proj_match(
                line["text"]) or line.get(
                "layout_type",
                "") == "title"
            for i in range(st + 1, min(st + 20, len(boxes))):
                if (boxes[i]["page_number"] - line["page_number"]) > 0:
                    break
                # Non-heading lines stop chaining past a 3-line-height gap.
                if not mmj and self._y_dis(
                        line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                    break

                if not usefull(boxes[i]):
                    continue
                if mmj or \
                        (self._x_dis(boxes[i], line) < pw / 10): \
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                    # concat following
                    dfs(boxes[i], i)
                    boxes.pop(i)
                    break

        try:
            if usefull(boxes[0]):
                dfs(boxes[0], 0)
            else:
                logging.debug("WASTE: " + boxes[0]["text"])
        # NOTE(review): broad swallow — any error inside dfs (including
        # proj_match regex errors) silently drops the chain.
        except Exception as e:
            pass
        boxes.pop(0)
        mw = np.mean(widths)
        # Keep the chain when it started as a heading or is wide enough.
        if mj or mw / pw >= 0.35 or mw > 200:
            res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
        else:
            logging.debug("REMOVED: " +
                          "<<".join([c["text"] for c in lines]))

    return "\n\n".join(res)
  1525. @staticmethod
  1526. def total_page_number(fnm, binary=None):
  1527. try:
  1528. pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
  1529. return len(pdf.pages)
  1530. except Exception as e:
  1531. pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
  1532. return len(pdf)
  1533. def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
  1534. self.lefted_chars = []
  1535. self.mean_height = []
  1536. self.mean_width = []
  1537. self.boxes = []
  1538. self.garbages = {}
  1539. self.page_cum_height = [0]
  1540. self.page_layout = []
  1541. try:
  1542. self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
  1543. self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
  1544. enumerate(self.pdf.pages[page_from:page_to])]
  1545. self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
  1546. self.total_page = len(self.pdf.pages)
  1547. except Exception as e:
  1548. self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
  1549. self.page_images = []
  1550. self.page_chars = []
  1551. mat = fitz.Matrix(zoomin, zoomin)
  1552. self.total_page = len(self.pdf)
  1553. for i, page in enumerate(self.pdf):
  1554. if i < page_from:continue
  1555. if i >= page_to:break
  1556. pix = page.get_pixmap(matrix=mat)
  1557. img = Image.frombytes("RGB", [pix.width, pix.height],
  1558. pix.samples)
  1559. self.page_images.append(img)
  1560. self.page_chars.append([])
  1561. logging.info("Images converted.")
  1562. self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
  1563. if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
  1564. self.is_english = True
  1565. else:
  1566. self.is_english = False
  1567. for i, img in enumerate(self.page_images):
  1568. chars = self.page_chars[i] if not self.is_english else []
  1569. self.mean_height.append(
  1570. np.median(sorted([c["height"] for c in chars])) if chars else 0
  1571. )
  1572. self.mean_width.append(
  1573. np.median(sorted([c["width"] for c in chars])) if chars else 8
  1574. )
  1575. self.page_cum_height.append(img.size[1] / zoomin)
  1576. j = 0
  1577. while j + 1 < len(chars):
  1578. if chars[j]["text"] and chars[j + 1]["text"] \
  1579. and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
  1580. and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
  1581. chars[j]["width"]) / 2:
  1582. chars[j]["text"] += " "
  1583. j += 1
  1584. # if i > 0:
  1585. # if not chars:
  1586. # self.page_cum_height.append(img.size[1] / zoomin)
  1587. # else:
  1588. # self.page_cum_height.append(
  1589. # np.max([c["bottom"] for c in chars]))
  1590. self.__ocr_paddle(i + 1, img, chars, zoomin)
  1591. if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
  1592. bxes = [b for bxs in self.boxes for b in bxs]
  1593. self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
  1594. logging.info("Is it English:", self.is_english)
  1595. self.page_cum_height = np.cumsum(self.page_cum_height)
  1596. assert len(self.page_cum_height) == len(self.page_images) + 1
  1597. def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
  1598. self.__images__(fnm, zoomin)
  1599. self._layouts_paddle(zoomin)
  1600. self._table_transformer_job(zoomin)
  1601. self._text_merge()
  1602. self._concat_downward()
  1603. self._filter_forpages()
  1604. tbls = self._extract_table_figure(need_image, zoomin, return_html)
  1605. return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
  1606. def remove_tag(self, txt):
  1607. return re.sub(r"@@[\t0-9.-]+?##", "", txt)
def crop(self, text, ZM=3):
    """Crop the page-image regions referenced by ``@@...##`` tags in *text*.

    Each tag carries page number(s) plus x0/x1/top/bottom; regions spanning
    several pages produce one crop per page. All crops are pasted onto a
    single vertical canvas (grey background, 2px gaps).

    Returns the composed PIL image, or None when *text* has no tags.
    """
    imgs = []
    for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
        pn, left, right, top, bottom = tag.strip(
            "#").strip("@").split("\t")
        left, right, top, bottom = float(left), float(
            right), float(top), float(bottom)
        # Bottom is pre-zoomed; re-base it from the last page back onto
        # the first so per-page crops can subtract page heights.
        bottom *= ZM
        pns = [int(p) - 1 for p in pn.split("-")]
        for pn in pns[1:]:
            bottom += self.page_images[pn - 1].size[1]
        # Crop from the first page (clamped to its height).
        imgs.append(
            self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                           right *
                                           ZM, min(
                bottom, self.page_images[pns[0]].size[1])
            ))
        )
        bottom -= self.page_images[pns[0]].size[1]
        # Continuation crops start at the top of each following page.
        for pn in pns[1:]:
            imgs.append(
                self.page_images[pn].crop((left * ZM, 0,
                                           right * ZM,
                                           min(bottom,
                                               self.page_images[pn].size[1])
                                           ))
            )
            bottom -= self.page_images[pn].size[1]

    if not imgs:
        return
    GAP = 2
    # First pass: total canvas height including gaps.
    height = 0
    for img in imgs:
        height += img.size[1] + GAP
    height = int(height)
    pic = Image.new("RGB",
                    (int(np.max([i.size[0] for i in imgs])), height),
                    (245, 245, 245))
    # Second pass: paste the crops top-to-bottom.
    height = 0
    for img in imgs:
        pic.paste(img, (0, int(height)))
        height += img.size[1] + GAP
    return pic
if __name__ == "__main__":
    # No standalone CLI behavior; this module is meant to be imported.
    pass