Você não pode selecionar mais de 25 tópicos Os tópicos devem começar com uma letra ou um número, podem incluir traços ('-') e podem ter até 35 caracteres.

pdf_parser.py 68KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775
  1. # -*- coding: utf-8 -*-
  2. import random
  3. import fitz
  4. import xgboost as xgb
  5. from io import BytesIO
  6. import torch
  7. import re
  8. import pdfplumber
  9. import logging
  10. from PIL import Image
  11. import numpy as np
  12. from rag.nlp import huqie
  13. from collections import Counter
  14. from copy import deepcopy
  15. from rag.cv.table_recognize import TableTransformer
  16. from rag.cv.ppdetection import PPDet
  17. from huggingface_hub import hf_hub_download
  18. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  19. class HuParser:
    def __init__(self):
        """Load every model the parser needs: PaddleOCR for text recognition,
        two PPDet detectors (page layout and table structure), and an XGBoost
        booster that decides whether vertically adjacent text boxes should be
        concatenated.
        """
        from paddleocr import PaddleOCR
        logging.getLogger("ppocr").setLevel(logging.ERROR)
        # Chinese OCR model; angle classification disabled
        self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")
        # NOTE(review): detector weights come from hard-coded absolute paths —
        # they must exist on the host or construction fails.
        self.layouter = PPDet("/data/newpeak/medical-gpt/res/ppdet")
        self.tbl_det = PPDet("/data/newpeak/medical-gpt/res/ppdet.tbl")
        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        # fetched from HuggingFace Hub on first run, cached afterwards
        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                       filename="updown_concat_xgb.model"))
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
  39. def __char_width(self, c):
  40. return (c["x1"] - c["x0"]) // len(c["text"])
  41. def __height(self, c):
  42. return c["bottom"] - c["top"]
  43. def _x_dis(self, a, b):
  44. return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
  45. abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
  46. def _y_dis(
  47. self, a, b):
  48. return (
  49. b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
  50. def _match_proj(self, b):
  51. proj_patt = [
  52. r"第[零一二三四五六七八九十百]+章",
  53. r"第[零一二三四五六七八九十百]+[条节]",
  54. r"[零一二三四五六七八九十百]+[、是  ]",
  55. r"[\((][零一二三四五六七八九十百]+[)\)]",
  56. r"[\((][0-9]+[)\)]",
  57. r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
  58. r"[0-9]+\.[0-9.]+(、|\.[  ])",
  59. r"[⚫•➢①② ]",
  60. ]
  61. return any([re.match(p, b["text"]) for p in proj_patt])
    def _updown_concat_features(self, up, down):
        """Build the feature vector fed to the XGBoost model that decides
        whether box *up* should be concatenated with box *down* below it.

        The feature ORDER must match the trained model
        (``updown_concat_xgb.model``) — do not reorder or insert entries.
        """
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6  # how many chars of each side of the seam to tokenize
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        # tokenize across the seam: tail of `up` + head of `down`, separated
        # by a space when both boundary chars are alphanumeric so the
        # tokenizer does not fuse them
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),  # same table row tag
            y_dis / h,  # vertical gap measured in line heights
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            # `up` ends like a finished sentence
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            # `up` ends with a "continuation" character
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            # `down` starts with closing/continuation punctuation
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            # NOTE(review): the next two features are identical — presumably a
            # historical duplicate; kept because the model was trained on it.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # bracket opened in `up` and (possibly) closed in `down`
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),  # `down` looks like a new heading/bullet
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            # NOTE(review): compares the LAST two chars of both texts — looks
            # odd but the model was trained with it; do not "fix" silently.
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
                                                                        ) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],  # `down` lies entirely to the left
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],  # shared token at the seam
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            # single noun token on either side of the seam
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
  118. @staticmethod
  119. def sort_Y_firstly(arr, threashold):
  120. # sort using y1 first and then x1
  121. arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
  122. for i in range(len(arr) - 1):
  123. for j in range(i, -1, -1):
  124. # restore the order using th
  125. if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
  126. and arr[j + 1]["x0"] < arr[j]["x0"]:
  127. tmp = deepcopy(arr[j])
  128. arr[j] = deepcopy(arr[j + 1])
  129. arr[j + 1] = deepcopy(tmp)
  130. return arr
  131. @staticmethod
  132. def sort_X_by_page(arr, threashold):
  133. # sort using y1 first and then x1
  134. arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
  135. for i in range(len(arr) - 1):
  136. for j in range(i, -1, -1):
  137. # restore the order using th
  138. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  139. and arr[j + 1]["top"] < arr[j]["top"]\
  140. and arr[j + 1]["page_number"] == arr[j]["page_number"]:
  141. tmp = arr[j]
  142. arr[j] = arr[j + 1]
  143. arr[j + 1] = tmp
  144. return arr
  145. @staticmethod
  146. def sort_R_firstly(arr, thr=0):
  147. # sort using y1 first and then x1
  148. # sorted(arr, key=lambda r: (r["top"], r["x0"]))
  149. arr = HuParser.sort_Y_firstly(arr, thr)
  150. for i in range(len(arr) - 1):
  151. for j in range(i, -1, -1):
  152. if "R" not in arr[j] or "R" not in arr[j + 1]:
  153. continue
  154. if arr[j + 1]["R"] < arr[j]["R"] \
  155. or (
  156. arr[j + 1]["R"] == arr[j]["R"]
  157. and arr[j + 1]["x0"] < arr[j]["x0"]
  158. ):
  159. tmp = arr[j]
  160. arr[j] = arr[j + 1]
  161. arr[j + 1] = tmp
  162. return arr
  163. @staticmethod
  164. def sort_X_firstly(arr, threashold, copy=True):
  165. # sort using y1 first and then x1
  166. arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
  167. for i in range(len(arr) - 1):
  168. for j in range(i, -1, -1):
  169. # restore the order using th
  170. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  171. and arr[j + 1]["top"] < arr[j]["top"]:
  172. tmp = deepcopy(arr[j]) if copy else arr[j]
  173. arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
  174. arr[j + 1] = deepcopy(tmp) if copy else tmp
  175. return arr
  176. @staticmethod
  177. def sort_C_firstly(arr, thr=0):
  178. # sort using y1 first and then x1
  179. # sorted(arr, key=lambda r: (r["x0"], r["top"]))
  180. arr = HuParser.sort_X_firstly(arr, thr)
  181. for i in range(len(arr) - 1):
  182. for j in range(i, -1, -1):
  183. # restore the order using th
  184. if "C" not in arr[j] or "C" not in arr[j + 1]:
  185. continue
  186. if arr[j + 1]["C"] < arr[j]["C"] \
  187. or (
  188. arr[j + 1]["C"] == arr[j]["C"]
  189. and arr[j + 1]["top"] < arr[j]["top"]
  190. ):
  191. tmp = arr[j]
  192. arr[j] = arr[j + 1]
  193. arr[j + 1] = tmp
  194. return arr
  195. return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
  196. def _has_color(self, o):
  197. if o.get("ncs", "") == "DeviceGray":
  198. if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
  199. o["non_stroking_color"][0] == 1:
  200. if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
  201. return False
  202. return True
  203. def __overlapped_area(self, a, b, ratio=True):
  204. tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
  205. if b["x0"] > x1 or b["x1"] < x0:
  206. return 0
  207. if b["bottom"] < tp or b["top"] > btm:
  208. return 0
  209. x0_ = max(b["x0"], x0)
  210. x1_ = min(b["x1"], x1)
  211. assert x0_ <= x1_, "Fuckedup! T:{},B:{},X0:{},X1:{} ==> {}".format(
  212. tp, btm, x0, x1, b)
  213. tp_ = max(b["top"], tp)
  214. btm_ = min(b["bottom"], btm)
  215. assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
  216. tp, btm, x0, x1, b)
  217. ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
  218. x0 != 0 and btm - tp != 0 else 0
  219. if ov > 0 and ratio:
  220. ov /= (x1 - x0) * (btm - tp)
  221. return ov
  222. def __find_overlapped_with_threashold(self, box, boxes, thr=0.3):
  223. if not boxes:
  224. return
  225. max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
  226. s, e = 0, len(boxes)
  227. for i in range(s, e):
  228. ov = self.__overlapped_area(box, boxes[i])
  229. _ov = self.__overlapped_area(boxes[i], box)
  230. if (ov, _ov) < (max_overlaped, _max_overlaped):
  231. continue
  232. max_overlaped_i = i
  233. max_overlaped = ov
  234. _max_overlaped = _ov
  235. return max_overlaped_i
    def __find_overlapped(self, box, boxes_sorted_by_y, naive=False):
        """Return the index of the entry in *boxes_sorted_by_y* with the
        largest overlap against *box*, or None when nothing overlaps.

        *boxes_sorted_by_y* must be sorted vertically; a binary search narrows
        the candidate window unless *naive* is True, in which case the whole
        list is scanned.
        """
        if not boxes_sorted_by_y:
            return
        bxs = boxes_sorted_by_y
        s, e, ii = 0, len(bxs), 0
        # binary search for some box that vertically intersects `box`
        while s < e and not naive:
            ii = (e + s) // 2
            pv = bxs[ii]
            if box["bottom"] < pv["top"]:
                e = ii
                continue
            if box["top"] > pv["bottom"]:
                s = ii + 1
                continue
            break
        # NOTE(review): both loops below break unconditionally after one pass,
        # so each trims the window by at most one box — presumably intended to
        # drop a single non-overlapping neighbour; confirm before changing.
        while s < ii:
            if box["top"] > bxs[s]["bottom"]:
                s += 1
            break
        while e - 1 > ii:
            if box["bottom"] < bxs[e - 1]["top"]:
                e -= 1
            break
        # linear scan of the remaining window for the maximum overlap
        max_overlaped_i, max_overlaped = None, 0
        for i in range(s, e):
            ov = self.__overlapped_area(bxs[i], box)
            if ov <= max_overlaped:
                continue
            max_overlaped_i = i
            max_overlaped = ov
        return max_overlaped_i
  267. def _is_garbage(self, b):
  268. patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
  269. r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
  270. "(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
  271. "\\(cid *: *[0-9]+ *\\)"
  272. ]
  273. return any([re.search(p, b["text"]) for p in patt])
    def __layouts_cleanup(self, boxes, layouts, far=2, thr=0.7):
        """De-duplicate overlapping layout regions of the same type.

        For each pair of same-type layouts within *far* positions of each
        other whose mutual overlap exceeds *thr*, drop the weaker one: by
        detector score when both carry one, otherwise by how much text-box
        area (*boxes*) each layout covers.  Mutates and returns *layouts*.
        """
        def notOverlapped(a, b):
            return any([a["x1"] < b["x0"],
                        a["x0"] > b["x1"],
                        a["bottom"] < b["top"],
                        a["top"] > b["bottom"]])

        i = 0
        while i + 1 < len(layouts):
            j = i + 1
            # advance j to the next nearby layout of the same type overlapping i
            while j < min(i + far, len(layouts)) \
                    and (layouts[i].get("type", "") != layouts[j].get("type", "")
                         or notOverlapped(layouts[i], layouts[j])):
                j += 1
            if j >= min(i + far, len(layouts)):
                i += 1
                continue
            # require substantial overlap in at least one direction
            if self.__overlapped_area(layouts[i], layouts[j]) < thr \
                    and self.__overlapped_area(layouts[j], layouts[i]) < thr:
                i += 1
                continue
            if layouts[i].get("score") and layouts[j].get("score"):
                # both scored: keep the higher-confidence layout
                if layouts[i]["score"] > layouts[j]["score"]:
                    layouts.pop(j)
                else:
                    layouts.pop(i)
                continue
            # unscored: keep whichever layout covers more text-box area
            area_i, area_i_1 = 0, 0
            for b in boxes:
                if not notOverlapped(b, layouts[i]):
                    area_i += self.__overlapped_area(b, layouts[i], False)
                if not notOverlapped(b, layouts[j]):
                    area_i_1 += self.__overlapped_area(b, layouts[j], False)
            if area_i > area_i_1:
                layouts.pop(j)
            else:
                layouts.pop(i)
        return layouts
    def __table_paddle(self, images):
        """Run the table-structure detector on cropped table *images* and
        normalize the detected components: rows/headers are stretched to a
        common left/right edge, columns to a common top/bottom edge.

        Returns one list of component dicts per surviving input image.
        """
        tbls = self.tbl_det([np.array(img) for img in images], thr=0.5)
        res = []
        # align left&right for rows, align top&bottom for columns
        for tbl in tbls:
            lts = [{"label": b["type"],
                    "score": b["score"],
                    "x0": b["bbox"][0], "x1": b["bbox"][2],
                    "top": b["bbox"][1], "bottom": b["bbox"][-1]
                    } for b in tbl]
            if not lts:
                continue
            left = [b["x0"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            right = [b["x1"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            if not left:
                continue
            # median is robust with enough samples; otherwise take the extreme
            left = np.median(left) if len(left) > 4 else np.min(left)
            right = np.median(right) if len(right) > 4 else np.max(right)
            for b in lts:
                if b["label"].find("row") > 0 or b["label"].find("header") > 0:
                    if b["x0"] > left:
                        b["x0"] = left
                    if b["x1"] < right:
                        b["x1"] = right
            top = [b["top"] for b in lts if b["label"] == "table column"]
            bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
            if not top:
                # no columns detected: keep row alignment only
                res.append(lts)
                continue
            top = np.median(top) if len(top) > 4 else np.min(top)
            bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
            for b in lts:
                if b["label"] == "table column":
                    if b["top"] > top:
                        b["top"] = top
                    if b["bottom"] < bottom:
                        b["bottom"] = bottom
            res.append(lts)
        return res
    def _table_transformer_job(self, ZM):
        """Detect table internals (rows, headers, columns, spanning cells) on
        every "table" layout found earlier, store them in ``self.tb_cpns``,
        then tag each text box inside a table with R/H/C/SP indices.

        *ZM* is the zoom factor the page images were rendered at.
        """
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]  # per-page table counts, cumsum'd below for slicing
        MARGIN = 10  # extra border (PDF units) around each table crop
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                # PDF units -> image pixels
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))
        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.__table_paddle(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    # crop-local pixels -> page pixels
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    # page pixels -> PDF units
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    # page-local Y -> document-global Y
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            # collect components whose label matches `kwd`, de-duplicated
            # against the text boxes, re-sorted top-to-bottom
            eles = self.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
            return self.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]
            ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii
            ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]
            ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                # spanning cells reuse the H_* fields on purpose, it seems —
                # NOTE(review): they overwrite header bounds set just above.
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr_paddle(self, pagenum, img, chars, ZM=3):
        """OCR one rendered page image and fold pdfplumber *chars* into the
        resulting boxes.

        OCR coordinates come from an image zoomed by *ZM*, so they are divided
        by *ZM* to return to PDF space.  The merged box list is appended to
        ``self.boxes``; chars that fit no box go to ``self.lefted_chars``.
        """
        bxs = self.ocr.ocr(np.array(img), cls=True)[0]
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        # keep only geometrically sane quads, convert to box dicts, sort by Y
        bxs = self.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )
        # merge chars in the same rect
        for c in self.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = self.__find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            # a char far taller/shorter than its box is probably misassigned
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                # keep a space only after latin/numeric/punctuation chars
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]
        for b in bxs:
            # boxes that collected no PDF chars fall back to the OCR text
            if not b["text"]:
                b["text"] = b["txt"]
            del b["txt"]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
    def _layouts_paddle(self, ZM):
        """Run the layout detector on every page, tag each OCR box with its
        layout type, drop garbage (headers/footers/references repeated across
        pages), then flatten ``self.boxes`` into one document-wide list with
        cumulative Y coordinates.
        """
        assert len(self.page_images) == len(self.boxes)
        # Tag layout type
        boxes = []
        layouts = self.layouter([np.array(img) for img in self.page_images])
        assert len(self.page_images) == len(layouts)
        for pn, lts in enumerate(layouts):
            bxs = self.boxes[pn]
            lts = [{"type": b["type"],
                    "score": float(b["score"]),
                    "x0": b["bbox"][0] / ZM, "x1": b["bbox"][2] / ZM,
                    "top": b["bbox"][1] / ZM, "bottom": b["bbox"][-1] / ZM,
                    "page_number": pn,
                    } for b in lts]
            lts = self.sort_Y_firstly(lts, self.mean_height[pn] / 2)
            lts = self.__layouts_cleanup(bxs, lts)
            self.page_layout.append(lts)

            # Tag layout type, layouts are ready
            def findLayout(ty):
                nonlocal bxs, lts
                lts_ = [lt for lt in lts if lt["type"] == ty]
                i = 0
                while i < len(bxs):
                    if bxs[i].get("layout_type"):
                        # already claimed by an earlier (higher-priority) type
                        i += 1
                        continue
                    if self._is_garbage(bxs[i]):
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    ii = self.__find_overlapped_with_threashold(bxs[i], lts_,
                                                                thr=0.4)
                    if ii is None:  # belong to nothing
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue
                    lts_[ii]["visited"] = True
                    # footer/header/reference texts are collected as garbage
                    # candidates; texts repeated on >1 page are removed below
                    if lts_[ii]["type"] in ["footer", "header", "reference"]:
                        if lts_[ii]["type"] not in self.garbages:
                            self.garbages[lts_[ii]["type"]] = []
                        self.garbages[lts_[ii]["type"]].append(bxs[i]["text"])
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    bxs[i]["layoutno"] = f"{ty}-{ii}"
                    bxs[i]["layout_type"] = lts_[ii]["type"]
                    i += 1

            # earlier entries take precedence when a box overlaps several types
            for lt in ["footer", "header", "reference", "figure caption",
                       "table caption", "title", "text", "table", "figure"]:
                findLayout(lt)

            # add box to figure layouts which has not text box
            for i, lt in enumerate(
                    [lt for lt in lts if lt["type"] == "figure"]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                del lt["type"]
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)
            boxes.extend(bxs)
        self.boxes = boxes

        # texts seen more than once in header/footer/reference zones are
        # boilerplate — drop them everywhere in the document
        garbage = set()
        for k in self.garbages.keys():
            self.garbages[k] = Counter(self.garbages[k])
            for g, c in self.garbages[k].items():
                if c > 1:
                    garbage.add(g)
        logging.debug("GARBAGE:" + ",".join(garbage))
        self.boxes = [b for b in self.boxes if b["text"].strip() not in garbage]
        # cumlative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
    def _text_merge(self):
        """Horizontally merge adjacent boxes that belong to the same layout
        region and sit on (roughly) the same line."""
        # merge adjusted boxes
        bxs = self.boxes

        def end_with(b, txt):
            # does box text end with `txt` (whitespace-insensitive)?
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            # NOTE(review): the caller passes the string "(," here, which
            # iterates as the characters '(' and ',' — apparently intentional.
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent box with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
                i += 1
                continue
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                # non-text regions merge only across a dangling comma/paren,
                # and may even overlap (negative threshold)
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue
            # same line band, close enough, and b_ extends to the right
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
  586. def _naive_vertical_merge(self):
  587. bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
  588. i = 0
  589. while i + 1 < len(bxs):
  590. b = bxs[i]
  591. b_ = bxs[i + 1]
  592. if b["page_number"] < b_["page_number"] and re.match(r"[0-9 •一—-]+$", b["text"]):
  593. bxs.pop(i)
  594. continue
  595. concatting_feats = [
  596. b["text"].strip()[-1] in ",;:'\",、‘“;:-",
  597. len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
  598. b["text"].strip()[0] in "。;?!?”)),,、:",
  599. ]
  600. # features for not concating
  601. feats = [
  602. b.get("layoutno", 0) != b.get("layoutno", 0),
  603. b["text"].strip()[-1] in "。?!?",
  604. self.is_english and b["text"].strip()[-1] in ".!?",
  605. b["page_number"] == b_["page_number"] and b_["top"] - \
  606. b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
  607. b["page_number"] < b_["page_number"] and abs(
  608. b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
  609. ]
  610. if any(feats) and not any(concatting_feats):
  611. i += 1
  612. continue
  613. # merge up and down
  614. b["bottom"] = b_["bottom"]
  615. b["text"] += b_["text"]
  616. b["x0"] = min(b["x0"], b_["x0"])
  617. b["x1"] = max(b["x1"], b_["x1"])
  618. bxs.pop(i + 1)
  619. self.boxes = bxs
    def _concat_downward(self, concat_between_pages=True):
        """Merge boxes vertically into blocks, letting the XGBoost model
        arbitrate ambiguous joins.

        When *concat_between_pages* is False, merging never crosses a page
        boundary.  Rewrites ``self.boxes``.
        """
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    # boxes are Y-sorted: nothing further down is in-row
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                # greedily extend the chain starting at `up` with a box below
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    # give up once candidates fall too far below `up`
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    # different table rows only join after a trailing comma
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    # skip page-number-like "12/345" fragments
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    # horizontally too far apart to be the same paragraph
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    # nearby text in the SAME layout region joins without
                    # consulting the model
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    # otherwise the trained classifier decides (>0.5 = join)
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # insert a space between joined latin/numeric fragments
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)
        self.boxes = self.sort_Y_firstly(boxes, 0)
def _filter_forpages(self):
    """Remove table-of-contents / acknowledgement pages from self.boxes.

    Two strategies, applied in order:
    1. Find a box whose text is exactly a TOC heading ("contents", "目录",
       "目次", "致谢", ...). Drop it and the following entry boxes that share
       the same leading prefix.
    2. If no TOC heading was found, count boxes containing dot-leader runs
       ("··"-style) per page and drop every box on pages with more than 3
       such boxes (dot leaders are typical of TOC layouts).

    Mutates self.boxes in place; returns None.
    """
    if not self.boxes:
        return
    findit = False
    i = 0
    while i < len(self.boxes):
        # Normalize whitespace away, then test for an exact TOC-style heading.
        if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
            i += 1
            continue
        findit = True
        # Heuristic: an ASCII-looking heading implies English-style TOC entries.
        eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
        self.boxes.pop(i)
        if i >= len(self.boxes):
            break
        # Prefix of the first TOC entry: first 3 chars (CJK) or first 2 words (English).
        prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
        while not prefix:
            # Skip empty boxes until a usable prefix is found.
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
        self.boxes.pop(i)
        if i >= len(self.boxes) or not prefix:
            break
        # Drop everything up to the next box that re-starts with the same prefix
        # (assumed to be the body text the TOC pointed at).
        for j in range(i, min(i + 128, len(self.boxes))):
            if not re.match(prefix, self.boxes[j]["text"]):
                continue
            for k in range(i, j):
                self.boxes.pop(i)
            break
    if findit:
        return

    # Fallback: detect TOC-like pages by dot-leader density.
    page_dirty = [0] * len(self.page_images)
    for b in self.boxes:
        if re.search(r"(··|··|··)", b["text"]):
            page_dirty[b["page_number"] - 1] += 1
    page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
    if not page_dirty:
        return
    i = 0
    while i < len(self.boxes):
        if self.boxes[i]["page_number"] in page_dirty:
            self.boxes.pop(i)
            continue
        i += 1
def _merge_with_same_bullet(self):
    """Merge vertically adjacent boxes that start with the same bullet character.

    Consecutive boxes whose first (stripped) character matches — and is not a
    Latin letter or a Chinese character — are treated as one bulleted item
    split across lines, and are joined with a newline. Empty boxes are
    dropped along the way. Mutates self.boxes in place.
    """
    i = 0
    while i + 1 < len(self.boxes):
        b = self.boxes[i]
        b_ = self.boxes[i + 1]
        if not b["text"].strip():
            self.boxes.pop(i)
            continue
        if not b_["text"].strip():
            self.boxes.pop(i + 1)
            continue
        # Skip when leading chars differ, or the "bullet" is a plain letter /
        # Chinese char, or the boxes are not vertically adjacent.
        # NOTE(review): the letter set omits 'i' — possibly intentional so
        # roman-numeral bullets like "i." still merge; confirm.
        if b["text"].strip()[0] != b_["text"].strip()[0] \
                or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                or huqie.is_chinese(b["text"].strip()[0]) \
                or b["top"] > b_["bottom"]:
            i += 1
            continue
        # Merge b into b_ (keep b_'s bottom, extend bbox upward/outward).
        b_["text"] = b["text"] + "\n" + b_["text"]
        b_["x0"] = min(b["x0"], b_["x0"])
        b_["x1"] = max(b["x1"], b_["x1"])
        b_["top"] = b["top"]
        self.boxes.pop(i)
  776. def _blockType(self, b):
  777. patt = [
  778. ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  779. (r"^(20|19)[0-9]{2}年$", "Dt"),
  780. (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
  781. ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
  782. (r"^第*[一二三四1-4]季度$", "Dt"),
  783. (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
  784. (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
  785. ("^[0-9.,+%/ -]+$", "Nu"),
  786. (r"^[0-9A-Z/\._~-]+$", "Ca"),
  787. (r"^[A-Z]*[a-z' -]+$", "En"),
  788. (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
  789. (r"^.{1}$", "Sg")
  790. ]
  791. for p, n in patt:
  792. if re.search(p, b["text"].strip()):
  793. return n
  794. tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
  795. if len(tks) > 3:
  796. if len(tks) < 12:
  797. return "Tx"
  798. else:
  799. return "Lx"
  800. if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
  801. return "Nr"
  802. return "Ot"
def __cal_spans(self, boxes, rows, cols, tbl, html=True):
    """Compute row/column spans for cells flagged "SP" and collapse the
    spanned area of *tbl* into its top-left cell.

    boxes/rows/cols: boxes with "rn"/"cn" indices and the row/column clusters.
    tbl: 2-D grid (list of lists of box-lists) indexed [row][col]; mutated.
    html: when True, cells swallowed by a span become None (so the HTML
    renderer can skip them); otherwise they alias the merged list.
    Returns the mutated tbl.
    """
    # caculate span
    # Mean geometric extents of each column / row cluster.
    clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
            for cln in cols]
    crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
            for cln in cols]
    rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
            for row in rows]
    rbtm = [np.mean([c.get("R_btm", c["bottom"])
                     for c in row]) for row in rows]
    for b in boxes:
        if "SP" not in b:
            continue
        b["colspan"] = [b["cn"]]
        b["rowspan"] = [b["rn"]]
        # col span: a column belongs to the span when its midpoint lies
        # within the cell's horizontal extent (H_left..H_right).
        for j in range(0, len(clft)):
            if j == b["cn"]:
                continue
            if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
                continue
            if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
                continue
            b["colspan"].append(j)
        # row span: same midpoint test vertically (H_top..H_bott).
        for j in range(0, len(rtop)):
            if j == b["rn"]:
                continue
            if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
                continue
            if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
                continue
            b["rowspan"].append(j)

    def join(arr):
        # Concatenate the text of a cell's boxes ("" for empty cells).
        if not arr:
            return ""
        return "".join([t["text"] for t in arr])

    # rm the spaning cells: merge every spanned rectangle into one cell.
    for i in range(len(tbl)):
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            if all(["rowspan" not in a and "colspan" not in a for a in arr]):
                continue
            # Union of all span index lists carried by boxes in this cell.
            rowspan, colspan = [], []
            for a in arr:
                if isinstance(a.get("rowspan", 0), list):
                    rowspan.extend(a["rowspan"])
                if isinstance(a.get("colspan", 0), list):
                    colspan.extend(a["colspan"])
            rowspan, colspan = set(rowspan), set(colspan)
            if len(rowspan) < 2 and len(colspan) < 2:
                # Degenerate span: strip the markers and leave the cell alone.
                for a in arr:
                    if "rowspan" in a:
                        del a["rowspan"]
                    if "colspan" in a:
                        del a["colspan"]
                continue
            # Make the span a contiguous index range covering its extremes.
            rowspan, colspan = sorted(rowspan), sorted(colspan)
            rowspan = list(range(rowspan[0], rowspan[-1] + 1))
            colspan = list(range(colspan[0], colspan[-1] + 1))
            assert i in rowspan, rowspan
            assert j in colspan, colspan
            # Collect content of every covered cell (skipping duplicates of
            # the accumulated text) and blank the covered cells.
            arr = []
            for r in rowspan:
                for c in colspan:
                    arr_txt = join(arr)
                    if tbl[r][c] and join(tbl[r][c]) != arr_txt:
                        arr.extend(tbl[r][c])
                    tbl[r][c] = None if html else arr
            # Rewrite span markers as plain integer counts for rendering.
            for a in arr:
                if len(rowspan) > 1:
                    a["rowspan"] = len(rowspan)
                elif "rowspan" in a:
                    del a["rowspan"]
                if len(colspan) > 1:
                    a["colspan"] = len(colspan)
                elif "colspan" in a:
                    del a["colspan"]
            # The merged content lives in the top-left cell of the span.
            tbl[rowspan[0]][colspan[0]] = arr
    return tbl
def __construct_table(self, boxes, html=False):
    """Build a structured table from a group of layout boxes.

    Steps: pull out caption boxes, classify cell types, cluster boxes into
    rows (by "R" labels / vertical position) and columns (by "C" labels /
    horizontal position), relocate stray single-cell rows/columns, detect
    header rows, then render via __html_table or __desc_table.

    boxes: mutated (caption boxes popped, "rn"/"cn"/"btype" keys added).
    Returns a one-element list with an HTML string when html=True, else the
    list of row description strings from __desc_table.
    """
    cap = ""
    i = 0
    while i < len(boxes):
        # Accumulate caption text and remove caption boxes from the table body.
        if self.is_caption(boxes[i]):
            cap += boxes[i]["text"]
            boxes.pop(i)
            i -= 1
        i += 1
    if not boxes:
        return []
    for b in boxes:
        b["btype"] = self._blockType(b)
    # Dominant cell type across the table (e.g. "Nu" for numeric tables).
    max_type = Counter([b["btype"] for b in boxes]).items()
    max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
    logging.debug("MAXTYPE: " + max_type)

    # ---- row clustering ----
    rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
    rowh = np.min(rowh) if rowh else 0
    # boxes = self.sort_Y_firstly(boxes, rowh/5)
    boxes = self.sort_R_firstly(boxes, rowh / 2)
    boxes[0]["rn"] = 0
    rows = [[boxes[0]]]
    btm = boxes[0]["bottom"]
    for b in boxes[1:]:
        b["rn"] = len(rows) - 1
        lst_r = rows[-1]
        # New row when the "R" label changes, or the box starts below the
        # running bottom and neither side carries an "R" label.
        if lst_r[-1].get("R", "") != b.get("R", "") \
                or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
                    ):  # new row
            btm = b["bottom"]
            b["rn"] += 1
            rows.append([b])
            continue
        btm = (btm + b["bottom"]) / 2.
        rows[-1].append(b)

    # ---- column clustering ----
    colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
    colwm = np.min(colwm) if colwm else 0
    crosspage = len(set([b["page_number"] for b in boxes])) > 1
    if crosspage:
        # "C" positions are not comparable across pages; fall back to x order.
        boxes = self.sort_X_firstly(boxes, colwm / 2, False)
    else:
        boxes = self.sort_C_firstly(boxes, colwm / 2)
    boxes[0]["cn"] = 0
    cols = [[boxes[0]]]
    right = boxes[0]["x1"]
    for b in boxes[1:]:
        b["cn"] = len(cols) - 1
        lst_c = cols[-1]
        # New column when the "C" label increments on the same page, or the
        # box starts right of the running edge without matching "C" labels.
        if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
                "page_number"]) \
                or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")):  # new col
            right = b["x1"]
            b["cn"] += 1
            cols.append([b])
            continue
        right = (right + b["x1"]) / 2.
        cols[-1].append(b)

    # Fill the grid: tbl[row][col] -> list of boxes in that cell.
    tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
    for b in boxes:
        tbl[b["rn"]][b["cn"]].append(b)

    if len(rows) >= 4:
        # remove single in column: a column occupied by exactly one cell is
        # probably a mis-split; merge that cell into the nearer neighbor column.
        j = 0
        while j < len(tbl[0]):
            e, ii = 0, 0
            for i in range(len(tbl)):
                if tbl[i][j]:
                    e += 1
                    ii = i  # row index of the lone occupied cell
                if e > 1:
                    break
            if e > 1:
                j += 1
                continue
            # f/ff: left / right neighbor of the lone cell is occupied (or edge).
            f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
                 [j - 1][0].get("text")) or j == 0
            ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
                  [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
            if f and ff:
                j += 1
                continue
            bx = tbl[ii][j][0]
            logging.debug("Relocate column single: " + bx["text"])
            # j column only has one value
            left, right = 100000, 100000
            if j > 0 and not f:
                for i in range(len(tbl)):
                    if tbl[i][j - 1]:
                        left = min(left, np.min(
                            [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
            if j + 1 < len(tbl[0]) and not ff:
                for i in range(len(tbl)):
                    if tbl[i][j + 1]:
                        right = min(right, np.min(
                            [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
            assert left < 100000 or right < 100000
            if left < right:
                # Merge leftwards; shift "cn" of all following columns down by 1.
                for jj in range(j, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j - 1]:
                    tbl[ii][j - 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j - 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)
            else:
                # Merge rightwards.
                for jj in range(j + 1, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j + 1]:
                    tbl[ii][j + 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j + 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)
            cols.pop(j)
    assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (
        len(cols), len(tbl[0]))

    if len(cols) >= 4:
        # remove single in row: same relocation logic, transposed.
        i = 0
        while i < len(tbl):
            e, jj = 0, 0
            for j in range(len(tbl[i])):
                if tbl[i][j]:
                    e += 1
                    jj = j  # column index of the lone occupied cell
                if e > 1:
                    break
            if e > 1:
                i += 1
                continue
            f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
                 [jj][0].get("text")) or i == 0
            ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
                  [jj][0].get("text")) or i + 1 >= len(tbl)
            if f and ff:
                i += 1
                continue
            bx = tbl[i][jj][0]
            logging.debug("Relocate row single: " + bx["text"])
            # i row only has one value
            up, down = 100000, 100000
            if i > 0 and not f:
                for j in range(len(tbl[i - 1])):
                    if tbl[i - 1][j]:
                        up = min(up, np.min(
                            [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
            if i + 1 < len(tbl) and not ff:
                for j in range(len(tbl[i + 1])):
                    if tbl[i + 1][j]:
                        down = min(down, np.min(
                            [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
            assert up < 100000 or down < 100000
            if up < down:
                # Merge upwards; shift "rn" of all following rows down by 1.
                for ii in range(i, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i - 1][jj]:
                    tbl[i - 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i - 1][jj] = tbl[i][jj]
                tbl.pop(i)
            else:
                # Merge downwards.
                for ii in range(i + 1, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i + 1][jj]:
                    tbl[i + 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i + 1][jj] = tbl[i][jj]
                tbl.pop(i)
            rows.pop(i)

    # which rows are headers: a row is a header when >50% of its occupied
    # cells carry an "H" flag (or are non-numeric in a numeric table).
    hdset = set([])
    for i in range(len(tbl)):
        cnt, h = 0, 0
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            cnt += 1
            if max_type == "Nu" and arr[0]["btype"] == "Nu":
                continue
            if any([a.get("H") for a in arr]) \
                    or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                h += 1
        if h / cnt > 0.5:
            hdset.add(i)
    if html:
        return [self.__html_table(cap, hdset,
                                  self.__cal_spans(boxes, rows,
                                                   cols, tbl, True)
                                  )]
    return self.__desc_table(cap, hdset,
                             self.__cal_spans(boxes, rows, cols, tbl, False))
def __html_table(self, cap, hdset, tbl):
    """Render the cell grid *tbl* as an HTML <table> string.

    cap: caption text (emitted as <caption> when non-empty).
    hdset: set of header-row indices; cells in those rows become <th>.
    tbl: grid from __cal_spans — None cells were swallowed by a span and are
    skipped; empty lists render as empty <td>/<th>.
    """
    # constrcut HTML
    html = "<table>"
    if cap:
        html += f"<caption>{cap}</caption>"
    for i in range(len(tbl)):
        row = "<tr>"
        txts = []
        for j, arr in enumerate(tbl[i]):
            if arr is None:
                # Swallowed by a rowspan/colspan — emit nothing.
                continue
            if not arr:
                row += "<td></td>" if i not in hdset else "<th></th>"
                continue
            txt = ""
            if arr:
                # Sort the cell's boxes top-to-bottom with a tolerance of
                # half the smallest box height (capped by page mean height).
                h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2,
                        self.mean_height[arr[0]["page_number"] - 1] / 2)
                txt = "".join([c["text"]
                               for c in self.sort_Y_firstly(arr, h)])
            txts.append(txt)
            sp = ""
            if arr[0].get("colspan"):
                sp = "colspan={}".format(arr[0]["colspan"])
            if arr[0].get("rowspan"):
                sp += " rowspan={}".format(arr[0]["rowspan"])
            if i in hdset:
                row += f"<th {sp} >" + txt + "</th>"
            else:
                row += f"<td {sp} >" + txt + "</td>"

        if i in hdset:
            # NOTE(review): hdset holds row indices, but header *texts* are
            # added to the same set below — apparently to skip later header
            # rows whose every cell text was already seen (deduplicating
            # repeated headers on page breaks). Confirm this mixing is intended.
            if all([t in hdset for t in txts]):
                continue
            for t in txts:
                hdset.add(t)

        if row != "<tr>":
            row += "</tr>"
        else:
            row = ""
        html += "\n" + row
    html += "\n</table>"
    return html
def __desc_table(self, cap, hdr_rowno, tbl):
    """Flatten the cell grid into natural-language row descriptions.

    Each data row becomes a "header:value; header:value; ..." string; nested
    header rows are combined ("的" / " for " joiner depending on language).
    cap: caption appended as a provenance suffix to every row.
    hdr_rowno: set of header-row indices (mutated: empty headers removed).
    Returns the list of row strings.
    """
    # get text of every colomn in header row to become header text
    clmno = len(tbl[0])
    rowno = len(tbl)
    headers = {}
    hdrset = set()
    lst_hdr = []
    de = "的" if not self.is_english else " for "
    for r in sorted(list(hdr_rowno)):
        headers[r] = ["" for _ in range(clmno)]
        for i in range(clmno):
            if not tbl[r][i]:
                continue
            txt = "".join([a["text"].strip() for a in tbl[r][i]])
            headers[r][i] = txt
            hdrset.add(txt)
        if all([not t for t in headers[r]]):
            # Entirely empty header row — drop it.
            del headers[r]
            hdr_rowno.remove(r)
            continue
        # Inherit missing header cells from the previous header row
        # (covers colspan-style headers).
        for j in range(clmno):
            if headers[r][j]:
                continue
            if j >= len(lst_hdr):
                break
            headers[r][j] = lst_hdr[j]
        lst_hdr = headers[r]
    # Merge consecutive header rows: the lower row's text is qualified by the
    # upper row's text unless one already contains the other.
    for i in range(rowno):
        if i not in hdr_rowno:
            continue
        for j in range(i + 1, rowno):
            if j not in hdr_rowno:
                break
            for k in range(clmno):
                if not headers[j - 1][k]:
                    continue
                if headers[j][k].find(headers[j - 1][k]) >= 0:
                    continue
                if len(headers[j][k]) > len(headers[j - 1][k]):
                    headers[j][k] += (de if headers[j][k]
                                      else "") + headers[j - 1][k]
                else:
                    headers[j][k] = headers[j - 1][k] \
                        + (de if headers[j - 1][k] else "") \
                        + headers[j][k]

    logging.debug(
        f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")

    row_txt = []
    for i in range(rowno):
        if i in hdr_rowno:
            continue
        rtxt = []

        def append(delimer):
            # Join the current row's fragments; glue short rows onto the
            # previous entry to avoid many tiny chunks (<64 chars combined).
            nonlocal rtxt, row_txt
            rtxt = delimer.join(rtxt)
            if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
                row_txt[-1] += "\n" + rtxt
            else:
                row_txt.append(rtxt)

        # r: index of the nearest header row above row i.
        r = 0
        if len(headers.items()):
            _arr = [(i - r, r) for r, _ in headers.items() if r < i]
            if _arr:
                _, r = min(_arr, key=lambda x: x[0])

        if r not in headers and clmno <= 2:
            # Headerless narrow table: emit "col0:col1"-style rows.
            for j in range(clmno):
                if not tbl[i][j]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[i][j]])
                if txt:
                    rtxt.append(txt)
            if rtxt:
                append(":")
            continue

        for j in range(clmno):
            if not tbl[i][j]:
                continue
            txt = "".join([a["text"].strip() for a in tbl[i][j]])
            if not txt:
                continue
            ctt = headers[r][j] if r in headers else ""
            if ctt:
                ctt += ":"
            ctt += txt
            if ctt:
                rtxt.append(ctt)
        if rtxt:
            row_txt.append("; ".join(rtxt))
    if cap:
        # Append provenance ("—— from/来自 <caption>") to every row.
        if self.is_english:
            from_ = " in "
        else:
            from_ = "来自"
        row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
    return row_txt
  1221. @staticmethod
  1222. def is_caption(bx):
  1223. patt = [
  1224. r"[图表]+[ 0-9::]{2,}"
  1225. ]
  1226. if any([re.match(p, bx["text"].strip()) for p in patt]) \
  1227. or bx["layout_type"].find("caption") >= 0:
  1228. return True
  1229. return False
  1230. def _extract_table_figure(self, need_image, ZM, return_html):
  1231. tables = {}
  1232. figures = {}
  1233. # extract figure and table boxes
  1234. i = 0
  1235. lst_lout_no = ""
  1236. nomerge_lout_no = []
  1237. while i < len(self.boxes):
  1238. if "layoutno" not in self.boxes[i]:
  1239. i += 1
  1240. continue
  1241. lout_no = str(self.boxes[i]["page_number"]) + \
  1242. "-" + str(self.boxes[i]["layoutno"])
  1243. if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
  1244. "figure caption", "reference"]:
  1245. nomerge_lout_no.append(lst_lout_no)
  1246. if self.boxes[i]["layout_type"] == "table":
  1247. if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
  1248. self.boxes.pop(i)
  1249. continue
  1250. if lout_no not in tables:
  1251. tables[lout_no] = []
  1252. tables[lout_no].append(self.boxes[i])
  1253. self.boxes.pop(i)
  1254. lst_lout_no = lout_no
  1255. continue
  1256. if need_image and self.boxes[i]["layout_type"] == "figure":
  1257. if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
  1258. self.boxes.pop(i)
  1259. continue
  1260. if lout_no not in figures:
  1261. figures[lout_no] = []
  1262. figures[lout_no].append(self.boxes[i])
  1263. self.boxes.pop(i)
  1264. lst_lout_no = lout_no
  1265. continue
  1266. i += 1
  1267. # merge table on different pages
  1268. nomerge_lout_no = set(nomerge_lout_no)
  1269. tbls = sorted([(k, bxs) for k, bxs in tables.items()],
  1270. key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
  1271. i = len(tbls) - 1
  1272. while i - 1 >= 0:
  1273. k0, bxs0 = tbls[i - 1]
  1274. k, bxs = tbls[i]
  1275. i -= 1
  1276. if k0 in nomerge_lout_no:
  1277. continue
  1278. if bxs[0]["page_number"] == bxs0[0]["page_number"]:
  1279. continue
  1280. if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
  1281. continue
  1282. mh = self.mean_height[bxs[0]["page_number"] - 1]
  1283. if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
  1284. continue
  1285. tables[k0].extend(tables[k])
  1286. del tables[k]
  1287. def x_overlapped(a, b):
  1288. return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])
  1289. # find captions and pop out
  1290. i = 0
  1291. while i < len(self.boxes):
  1292. c = self.boxes[i]
  1293. # mh = self.mean_height[c["page_number"]-1]
  1294. if not self.is_caption(c):
  1295. i += 1
  1296. continue
  1297. # find the nearest layouts
  1298. def nearest(tbls):
  1299. nonlocal c
  1300. mink = ""
  1301. minv = 1000000000
  1302. for k, bxs in tbls.items():
  1303. for b in bxs[:10]:
  1304. if b.get("layout_type", "").find("caption") >= 0:
  1305. continue
  1306. y_dis = self._y_dis(c, b)
  1307. x_dis = self._x_dis(
  1308. c, b) if not x_overlapped(
  1309. c, b) else 0
  1310. dis = y_dis * y_dis + x_dis * x_dis
  1311. if dis < minv:
  1312. mink = k
  1313. minv = dis
  1314. return mink, minv
  1315. tk, tv = nearest(tables)
  1316. fk, fv = nearest(figures)
  1317. if min(tv, fv) > 2000:
  1318. i += 1
  1319. continue
  1320. if tv < fv:
  1321. tables[tk].insert(0, c)
  1322. logging.debug(
  1323. "TABLE:" +
  1324. self.boxes[i]["text"] +
  1325. "; Cap: " +
  1326. tk)
  1327. else:
  1328. figures[fk].insert(0, c)
  1329. logging.debug(
  1330. "FIGURE:" +
  1331. self.boxes[i]["text"] +
  1332. "; Cap: " +
  1333. tk)
  1334. self.boxes.pop(i)
  1335. res = []
  1336. def cropout(bxs, ltype):
  1337. nonlocal ZM
  1338. pn = set([b["page_number"] - 1 for b in bxs])
  1339. if len(pn) < 2:
  1340. pn = list(pn)[0]
  1341. ht = self.page_cum_height[pn]
  1342. b = {
  1343. "x0": np.min([b["x0"] for b in bxs]),
  1344. "top": np.min([b["top"] for b in bxs]) - ht,
  1345. "x1": np.max([b["x1"] for b in bxs]),
  1346. "bottom": np.max([b["bottom"] for b in bxs]) - ht
  1347. }
  1348. louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
  1349. ii = self.__find_overlapped(b, louts, naive=True)
  1350. if ii is not None:
  1351. b = louts[ii]
  1352. else:
  1353. logging.warn(
  1354. f"Missing layout match: {pn + 1},%s" %
  1355. (bxs[0].get(
  1356. "layoutno", "")))
  1357. left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
  1358. return self.page_images[pn] \
  1359. .crop((left * ZM, top * ZM,
  1360. right * ZM, bott * ZM))
  1361. pn = {}
  1362. for b in bxs:
  1363. p = b["page_number"] - 1
  1364. if p not in pn:
  1365. pn[p] = []
  1366. pn[p].append(b)
  1367. pn = sorted(pn.items(), key=lambda x: x[0])
  1368. imgs = [cropout(arr, ltype) for p, arr in pn]
  1369. pic = Image.new("RGB",
  1370. (int(np.max([i.size[0] for i in imgs])),
  1371. int(np.sum([m.size[1] for m in imgs]))),
  1372. (245, 245, 245))
  1373. height = 0
  1374. for img in imgs:
  1375. pic.paste(img, (0, int(height)))
  1376. height += img.size[1]
  1377. return pic
  1378. # crop figure out and add caption
  1379. for k, bxs in figures.items():
  1380. txt = "\n".join(
  1381. [b["text"] for b in bxs
  1382. if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
  1383. and len(b["text"].strip()) >= 4
  1384. ]
  1385. )
  1386. if not txt:
  1387. continue
  1388. res.append(
  1389. (cropout(
  1390. bxs,
  1391. "figure"),
  1392. [txt] if not return_html else [f"<p>{txt}</p>"]))
  1393. for k, bxs in tables.items():
  1394. if not bxs:
  1395. continue
  1396. res.append((cropout(bxs, "table"),
  1397. self.__construct_table(bxs, html=return_html)))
  1398. return res
  1399. def proj_match(self, line):
  1400. if len(line) <= 2:
  1401. return
  1402. if re.match(r"[0-9 ().,%%+/-]+$", line):
  1403. return False
  1404. for p, j in [
  1405. (r"第[零一二三四五六七八九十百]+章", 1),
  1406. (r"第[零一二三四五六七八九十百]+[条节]", 2),
  1407. (r"[零一二三四五六七八九十百]+[、  ]", 3),
  1408. (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
  1409. (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
  1410. (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
  1411. (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
  1412. (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
  1413. (r".{,48}[::??]$", 9),
  1414. (r"[0-9]+)", 10),
  1415. (r"[\((][0-9]+[)\)]", 11),
  1416. (r"[零一二三四五六七八九十百]+是", 12),
  1417. (r"[⚫•➢✓]", 12)
  1418. ]:
  1419. if re.match(p, line):
  1420. return j
  1421. return
  1422. def _line_tag(self, bx, ZM):
  1423. pn = [bx["page_number"]]
  1424. top = bx["top"] - self.page_cum_height[pn[0] - 1]
  1425. bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
  1426. while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
  1427. bott -= self.page_images[pn[-1] - 1].size[1] / ZM
  1428. pn.append(pn[-1] + 1)
  1429. return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
  1430. .format("-".join([str(p) for p in pn]),
  1431. bx["x0"], bx["x1"], top, bott)
def __filterout_scraps(self, boxes, ZM):
    """Assemble the remaining boxes into text blocks and drop scraps.

    Repeatedly takes the first box, greedily chains vertically/horizontally
    compatible boxes after it (dfs), and keeps the chain only when it looks
    like real content (heading match, or wide enough on the page). Each kept
    line gets a positional tag via _line_tag. Returns the blocks joined by
    blank lines. *boxes* is consumed (popped) in the process.
    """
    def width(b):
        return b["x1"] - b["x0"]

    def height(b):
        return b["bottom"] - b["top"]

    def usefull(b):
        # A box is worth keeping if it has a layout type, is wider than a
        # third of the page, or is taller than the page's mean line height.
        if b.get("layout_type"):
            return True
        if width(
                b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
            return True
        if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
            return True
        return False

    res = []
    while boxes:
        lines = []
        widths = []
        pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
        mh = self.mean_height[boxes[0]["page_number"] - 1]
        mj = self.proj_match(
            boxes[0]["text"]) or boxes[0].get(
            "layout_type",
            "") == "title"

        def dfs(line, st):
            # Recursively chain the next compatible box after index st.
            nonlocal mh, pw, lines, widths
            lines.append(line)
            widths.append(width(line))
            width_mean = np.mean(widths)
            mmj = self.proj_match(
                line["text"]) or line.get(
                "layout_type",
                "") == "title"
            for i in range(st + 1, min(st + 20, len(boxes))):
                if (boxes[i]["page_number"] - line["page_number"]) > 0:
                    break
                # Stop on a large vertical gap after a short (non-heading) line.
                if not mmj and self._y_dis(
                        line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                    break
                if not usefull(boxes[i]):
                    continue
                # Chain headings unconditionally; otherwise require horizontal
                # proximity (within a tenth of the page width).
                if mmj or \
                        (self._x_dis(boxes[i], line) < pw / 10): \
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                    # concat following
                    dfs(boxes[i], i)
                    boxes.pop(i)
                    break

        try:
            if usefull(boxes[0]):
                dfs(boxes[0], 0)
            else:
                logging.debug("WASTE: " + boxes[0]["text"])
        except Exception as e:
            # NOTE(review): silently swallows any error from the recursive
            # chain (e.g. recursion depth) and just drops the box — confirm
            # this best-effort behavior is intended.
            pass
        boxes.pop(0)
        mw = np.mean(widths)
        # Keep the block if it is a heading or reasonably wide on the page.
        if mj or mw / pw >= 0.35 or mw > 200:
            res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
        else:
            logging.debug("REMOVED: " +
                          "<<".join([c["text"] for c in lines]))

    return "\n\n".join(res)
  1495. @staticmethod
  1496. def total_page_number(fnm, binary=None):
  1497. try:
  1498. pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
  1499. return len(pdf.pages)
  1500. except Exception as e:
  1501. pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
  1502. return len(pdf)
  1503. def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
  1504. self.lefted_chars = []
  1505. self.mean_height = []
  1506. self.mean_width = []
  1507. self.boxes = []
  1508. self.garbages = {}
  1509. self.page_cum_height = [0]
  1510. self.page_layout = []
  1511. try:
  1512. self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
  1513. self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
  1514. enumerate(self.pdf.pages[page_from:page_to])]
  1515. self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
  1516. self.total_page = len(self.pdf.pages)
  1517. except Exception as e:
  1518. self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
  1519. self.page_images = []
  1520. self.page_chars = []
  1521. mat = fitz.Matrix(zoomin, zoomin)
  1522. self.total_page = len(self.pdf)
  1523. for i, page in enumerate(self.pdf):
  1524. if i < page_from:continue
  1525. if i >= page_to:break
  1526. pix = page.get_pixmap(matrix=mat)
  1527. img = Image.frombytes("RGB", [pix.width, pix.height],
  1528. pix.samples)
  1529. self.page_images.append(img)
  1530. self.page_chars.append([])
  1531. logging.info("Images converted.")
  1532. self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
  1533. if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
  1534. self.is_english = True
  1535. else:
  1536. self.is_english = False
  1537. for i, img in enumerate(self.page_images):
  1538. chars = self.page_chars[i] if not self.is_english else []
  1539. self.mean_height.append(
  1540. np.median(sorted([c["height"] for c in chars])) if chars else 0
  1541. )
  1542. self.mean_width.append(
  1543. np.median(sorted([c["width"] for c in chars])) if chars else 8
  1544. )
  1545. self.page_cum_height.append(img.size[1] / zoomin)
  1546. j = 0
  1547. while j + 1 < len(chars):
  1548. if chars[j]["text"] and chars[j + 1]["text"] \
  1549. and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
  1550. and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
  1551. chars[j]["width"]) / 2:
  1552. chars[j]["text"] += " "
  1553. j += 1
  1554. # if i > 0:
  1555. # if not chars:
  1556. # self.page_cum_height.append(img.size[1] / zoomin)
  1557. # else:
  1558. # self.page_cum_height.append(
  1559. # np.max([c["bottom"] for c in chars]))
  1560. self.__ocr_paddle(i + 1, img, chars, zoomin)
  1561. if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
  1562. self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices([b for bxs in self.boxes for b in bxs], k=30)]))
  1563. logging.info("Is it English:", self.is_english)
  1564. self.page_cum_height = np.cumsum(self.page_cum_height)
  1565. assert len(self.page_cum_height) == len(self.page_images) + 1
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
    """Run the full parsing pipeline on a PDF.

    fnm: path or raw bytes; need_image: also extract figure crops;
    zoomin: rasterization zoom; return_html: tables as HTML strings.
    Returns (text, tbls): the filtered/tagged text and the list of
    (image, payload) pairs from _extract_table_figure.
    Each stage below mutates self.boxes and must run in this exact order.
    """
    self.__images__(fnm, zoomin)          # render pages, gather chars, OCR
    self._layouts_paddle(zoomin)          # layout detection
    self._table_transformer_job(zoomin)   # table structure recognition
    self._text_merge()                    # merge fragments into lines
    self._concat_downward()               # chain lines into paragraphs
    self._filter_forpages()               # drop TOC/acknowledgement pages
    tbls = self._extract_table_figure(need_image, zoomin, return_html)
    return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
  1575. def remove_tag(self, txt):
  1576. return re.sub(r"@@[\t0-9.-]+?##", "", txt)
def crop(self, text, ZM=3):
    """Crop the page-image regions referenced by the position tags in *text*.

    Parses every "@@pages\tx0\tx1\ttop\tbottom##" tag produced by _line_tag,
    crops the region from each referenced page (a tag may span several
    pages), and stacks all crops vertically onto one image.
    Returns the composite PIL image, or None when no tag is found.
    """
    imgs = []
    for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
        pn, left, right, top, bottom = tag.strip(
            "#").strip("@").split("\t")
        left, right, top, bottom = float(left), float(
            right), float(top), float(bottom)
        # bottom is pre-scaled to pixels; the other coords are scaled at crop time.
        bottom *= ZM
        pns = [int(p) - 1 for p in pn.split("-")]
        # Accumulate the heights of intermediate pages the region spans.
        for pn in pns[1:]:
            bottom += self.page_images[pn - 1].size[1]
        imgs.append(
            self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                           right *
                                           ZM, min(
                                               bottom, self.page_images[pns[0]].size[1])
                                           ))
        )
        bottom -= self.page_images[pns[0]].size[1]
        # Remaining pages: crop from the top edge down to the leftover bottom.
        for pn in pns[1:]:
            imgs.append(
                self.page_images[pn].crop((left * ZM, 0,
                                           right * ZM,
                                           min(bottom,
                                               self.page_images[pn].size[1])
                                           ))
            )
            bottom -= self.page_images[pn].size[1]

    if not imgs:
        return
    GAP = 2  # vertical spacing (px) between stacked crops
    height = 0
    for img in imgs:
        height += img.size[1] + GAP
    height = int(height)
    # Light-gray canvas wide enough for the widest crop.
    pic = Image.new("RGB",
                    (int(np.max([i.size[0] for i in imgs])), height),
                    (245, 245, 245))
    height = 0
    for img in imgs:
        pic.paste(img, (0, int(height)))
        height += img.size[1] + GAP
    return pic
if __name__ == "__main__":
    # Library module — no standalone CLI behavior.
    pass