Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import random
  4. import fitz
  5. import requests
  6. import xgboost as xgb
  7. from io import BytesIO
  8. import torch
  9. import re
  10. import pdfplumber
  11. import logging
  12. from PIL import Image
  13. import numpy as np
  14. from api.db import ParserType
  15. from deepdoc.visual import OCR, Recognizer
  16. from rag.nlp import huqie
  17. from collections import Counter
  18. from copy import deepcopy
  19. from huggingface_hub import hf_hub_download
  20. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  21. class HuParser:
    def __init__(self):
        """Set up all models used by the parser.

        Side effects are heavy: instantiates the OCR engine, two layout
        Recognizer models loaded from hard-coded local paths, and an XGBoost
        booster whose weights are downloaded from HuggingFace Hub.
        """
        self.ocr = OCR()
        # Subclasses may set model_speciess before calling super().__init__();
        # only default it when absent.
        if not hasattr(self, "model_speciess"):
            self.model_speciess = ParserType.GENERAL.value
        # Class labels emitted by the page-layout detector.
        self.layout_labels = [
            "_background_",
            "Text",
            "Title",
            "Figure",
            "Figure caption",
            "Table",
            "Table caption",
            "Header",
            "Footer",
            "Reference",
            "Equation",
        ]
        # Class labels emitted by the table-structure-recognition detector.
        self.tsr_labels = [
            "table",
            "table column",
            "table row",
            "table column header",
            "table projected row header",
            "table spanning cell",
        ]
        # NOTE(review): model paths are hard-coded to a specific machine —
        # consider making them configurable.
        self.layouter = Recognizer(self.layout_labels, "layout", "/data/newpeak/medical-gpt/res/ppdet/")
        self.tbl_det = Recognizer(self.tsr_labels, "tsr", "/data/newpeak/medical-gpt/res/ppdet.tbl/")
        # Binary classifier deciding whether two vertically adjacent text
        # boxes should be concatenated (see _updown_concat_features).
        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                       filename="updown_concat_xgb.model"))
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
    def __remote_call(self, species, images, thr=0.7):
        """Send page images to the remote layout-detection service.

        species: model flavor, appended to the endpoint path.
        images: list of PIL images; converted to in-memory PNG bytes.
        thr: score threshold forwarded to the service (the wire key
             "threashold" is the server's spelling — do not "fix" it).
        Returns the per-image detection lists from the service, or a list of
        empty lists when the server/token environment variables are missing.
        """
        url = os.environ.get("INFINIFLOW_SERVER")
        token = os.environ.get("INFINIFLOW_TOKEN")
        if not url or not token:
            logging.warning("INFINIFLOW_SERVER is not specified. To maximize the effectiveness, please visit https://github.com/infiniflow/ragflow, and sign in the our demo web site to get token. It's FREE! Using 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
            return [[] for _ in range(len(images))]

        def convert_image_to_bytes(PILimage):
            # Serialize a PIL image into PNG bytes without touching disk.
            image = BytesIO()
            PILimage.save(image, format='png')
            image.seek(0)
            return image.getvalue()

        images = [convert_image_to_bytes(img) for img in images]

        def remote_call():
            nonlocal images, thr
            res = requests.post(url + "/v1/layout/detect/" + species, files=[("image", img) for img in images], data={"threashold": thr},
                                headers={"Authorization": token}, timeout=len(images) * 10)
            res = res.json()
            # retcode != 0 is a server-side failure reported in-band.
            if res["retcode"] != 0: raise RuntimeError(res["retmsg"])
            return res["data"]

        # Retry transient failures up to 3 times; RuntimeError (a definite
        # server-side rejection) is not retried. After the retries, one
        # final attempt propagates whatever exception occurs.
        for _ in range(3):
            try:
                return remote_call()
            except RuntimeError as e:
                raise e
            except Exception as e:
                logging.error("layout_predict:" + str(e))
        return remote_call()
  89. def __char_width(self, c):
  90. return (c["x1"] - c["x0"]) // len(c["text"])
  91. def __height(self, c):
  92. return c["bottom"] - c["top"]
  93. def _x_dis(self, a, b):
  94. return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
  95. abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
  96. def _y_dis(
  97. self, a, b):
  98. return (
  99. b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
  100. def _match_proj(self, b):
  101. proj_patt = [
  102. r"第[零一二三四五六七八九十百]+章",
  103. r"第[零一二三四五六七八九十百]+[条节]",
  104. r"[零一二三四五六七八九十百]+[、是  ]",
  105. r"[\((][零一二三四五六七八九十百]+[)\)]",
  106. r"[\((][0-9]+[)\)]",
  107. r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
  108. r"[0-9]+\.[0-9.]+(、|\.[  ])",
  109. r"[⚫•➢①② ]",
  110. ]
  111. return any([re.match(p, b["text"]) for p in proj_patt])
    def _updown_concat_features(self, up, down):
        """Build the feature vector fed to updown_cnt_mdl to decide whether
        box `up` and the following box `down` belong to one paragraph.

        WARNING: the trained XGBoost model expects this exact feature order
        and length — do not add, remove or reorder entries (including the
        duplicated comma-feature below).
        """
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        # Only the junction matters: last/first LEN characters of each box.
        LEN = 6
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        # Tokenize the joined junction; a space is inserted when both sides
        # are alphanumeric so words are not fused.
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            # Same table row tag (default -1 matches when both untagged).
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            # up ends a sentence.
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            # up ends mid-clause (comma, quote, digit, opening bracket...).
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            # down starts with continuation punctuation.
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            # up is entirely parenthesized.
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # NOTE(review): duplicate of the previous feature — kept on
            # purpose because the trained model was fitted on this vector.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # Bracket opened in up and closed in down.
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            # Identical trailing bigrams hint at repeated furniture.
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
                                                                        ) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            # Relative height difference.
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            # Tokens lost/gained by joining at the junction.
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            # Junction words tagged as nouns by the tokenizer.
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
  168. @staticmethod
  169. def sort_Y_firstly(arr, threashold):
  170. # sort using y1 first and then x1
  171. arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
  172. for i in range(len(arr) - 1):
  173. for j in range(i, -1, -1):
  174. # restore the order using th
  175. if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
  176. and arr[j + 1]["x0"] < arr[j]["x0"]:
  177. tmp = deepcopy(arr[j])
  178. arr[j] = deepcopy(arr[j + 1])
  179. arr[j + 1] = deepcopy(tmp)
  180. return arr
  181. @staticmethod
  182. def sort_X_by_page(arr, threashold):
  183. # sort using y1 first and then x1
  184. arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
  185. for i in range(len(arr) - 1):
  186. for j in range(i, -1, -1):
  187. # restore the order using th
  188. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  189. and arr[j + 1]["top"] < arr[j]["top"]\
  190. and arr[j + 1]["page_number"] == arr[j]["page_number"]:
  191. tmp = arr[j]
  192. arr[j] = arr[j + 1]
  193. arr[j + 1] = tmp
  194. return arr
  195. @staticmethod
  196. def sort_R_firstly(arr, thr=0):
  197. # sort using y1 first and then x1
  198. # sorted(arr, key=lambda r: (r["top"], r["x0"]))
  199. arr = HuParser.sort_Y_firstly(arr, thr)
  200. for i in range(len(arr) - 1):
  201. for j in range(i, -1, -1):
  202. if "R" not in arr[j] or "R" not in arr[j + 1]:
  203. continue
  204. if arr[j + 1]["R"] < arr[j]["R"] \
  205. or (
  206. arr[j + 1]["R"] == arr[j]["R"]
  207. and arr[j + 1]["x0"] < arr[j]["x0"]
  208. ):
  209. tmp = arr[j]
  210. arr[j] = arr[j + 1]
  211. arr[j + 1] = tmp
  212. return arr
  213. @staticmethod
  214. def sort_X_firstly(arr, threashold, copy=True):
  215. # sort using y1 first and then x1
  216. arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
  217. for i in range(len(arr) - 1):
  218. for j in range(i, -1, -1):
  219. # restore the order using th
  220. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  221. and arr[j + 1]["top"] < arr[j]["top"]:
  222. tmp = deepcopy(arr[j]) if copy else arr[j]
  223. arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
  224. arr[j + 1] = deepcopy(tmp) if copy else tmp
  225. return arr
  226. @staticmethod
  227. def sort_C_firstly(arr, thr=0):
  228. # sort using y1 first and then x1
  229. # sorted(arr, key=lambda r: (r["x0"], r["top"]))
  230. arr = HuParser.sort_X_firstly(arr, thr)
  231. for i in range(len(arr) - 1):
  232. for j in range(i, -1, -1):
  233. # restore the order using th
  234. if "C" not in arr[j] or "C" not in arr[j + 1]:
  235. continue
  236. if arr[j + 1]["C"] < arr[j]["C"] \
  237. or (
  238. arr[j + 1]["C"] == arr[j]["C"]
  239. and arr[j + 1]["top"] < arr[j]["top"]
  240. ):
  241. tmp = arr[j]
  242. arr[j] = arr[j + 1]
  243. arr[j + 1] = tmp
  244. return arr
  245. return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
  246. def _has_color(self, o):
  247. if o.get("ncs", "") == "DeviceGray":
  248. if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
  249. o["non_stroking_color"][0] == 1:
  250. if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
  251. return False
  252. return True
  253. def __overlapped_area(self, a, b, ratio=True):
  254. tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
  255. if b["x0"] > x1 or b["x1"] < x0:
  256. return 0
  257. if b["bottom"] < tp or b["top"] > btm:
  258. return 0
  259. x0_ = max(b["x0"], x0)
  260. x1_ = min(b["x1"], x1)
  261. assert x0_ <= x1_, "Fuckedup! T:{},B:{},X0:{},X1:{} ==> {}".format(
  262. tp, btm, x0, x1, b)
  263. tp_ = max(b["top"], tp)
  264. btm_ = min(b["bottom"], btm)
  265. assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
  266. tp, btm, x0, x1, b)
  267. ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
  268. x0 != 0 and btm - tp != 0 else 0
  269. if ov > 0 and ratio:
  270. ov /= (x1 - x0) * (btm - tp)
  271. return ov
  272. def __find_overlapped_with_threashold(self, box, boxes, thr=0.3):
  273. if not boxes:
  274. return
  275. max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
  276. s, e = 0, len(boxes)
  277. for i in range(s, e):
  278. ov = self.__overlapped_area(box, boxes[i])
  279. _ov = self.__overlapped_area(boxes[i], box)
  280. if (ov, _ov) < (max_overlaped, _max_overlaped):
  281. continue
  282. max_overlaped_i = i
  283. max_overlaped = ov
  284. _max_overlaped = _ov
  285. return max_overlaped_i
    def __find_overlapped(self, box, boxes_sorted_by_y, naive=False):
        """Index of the box in `boxes_sorted_by_y` with the largest overlap
        with `box`, or None when the list is empty or nothing overlaps.

        With naive=False a binary search narrows the candidate window
        [s, e) using the Y-sorted order before the linear scan; naive=True
        scans the whole list.
        """
        if not boxes_sorted_by_y:
            return
        bxs = boxes_sorted_by_y
        s, e, ii = 0, len(bxs), 0
        # Binary search for any candidate vertically intersecting `box`.
        while s < e and not naive:
            ii = (e + s) // 2
            pv = bxs[ii]
            if box["bottom"] < pv["top"]:
                e = ii
                continue
            if box["top"] > pv["bottom"]:
                s = ii + 1
                continue
            break
        # Each window edge is tightened by at most one step: the `break`
        # is unconditional, so these loops run once (original behavior).
        while s < ii:
            if box["top"] > bxs[s]["bottom"]:
                s += 1
            break
        while e - 1 > ii:
            if box["bottom"] < bxs[e - 1]["top"]:
                e -= 1
            break
        # Linear scan over the (possibly wide) remaining window.
        max_overlaped_i, max_overlaped = None, 0
        for i in range(s, e):
            ov = self.__overlapped_area(bxs[i], box)
            if ov <= max_overlaped:
                continue
            max_overlaped_i = i
            max_overlaped = ov
        return max_overlaped_i
  317. def _is_garbage(self, b):
  318. patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
  319. r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
  320. "(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
  321. "\\(cid *: *[0-9]+ *\\)"
  322. ]
  323. return any([re.search(p, b["text"]) for p in patt])
    def __layouts_cleanup(self, boxes, layouts, far=2, thr=0.7):
        """Remove duplicate layout regions that overlap a nearby region of
        the same type, keeping the better one.

        boxes: text boxes used as a tie-breaker (the region covering more
               text area wins when detector scores are unavailable).
        far: how many following regions to compare against.
        thr: minimum mutual overlap ratio to treat two regions as dupes.
        Mutates and returns `layouts`.
        """
        def notOverlapped(a, b):
            # True when the two rectangles are disjoint on either axis.
            return any([a["x1"] < b["x0"],
                        a["x0"] > b["x1"],
                        a["bottom"] < b["top"],
                        a["top"] > b["bottom"]])
        i = 0
        while i + 1 < len(layouts):
            # Find the next nearby region of the same type that overlaps.
            j = i + 1
            while j < min(i + far, len(layouts)) \
                    and (layouts[i].get("type", "") != layouts[j].get("type", "")
                         or notOverlapped(layouts[i], layouts[j])):
                j += 1
            if j >= min(i + far, len(layouts)):
                i += 1
                continue
            # Require substantial overlap in at least one direction.
            if self.__overlapped_area(layouts[i], layouts[j]) < thr \
                    and self.__overlapped_area(layouts[j], layouts[i]) < thr:
                i += 1
                continue
            # Prefer detector score when both regions carry one.
            if layouts[i].get("score") and layouts[j].get("score"):
                if layouts[i]["score"] > layouts[j]["score"]:
                    layouts.pop(j)
                else:
                    layouts.pop(i)
                continue
            # Otherwise keep the region covering more raw text-box area.
            area_i, area_i_1 = 0, 0
            for b in boxes:
                if not notOverlapped(b, layouts[i]):
                    area_i += self.__overlapped_area(b, layouts[i], False)
                if not notOverlapped(b, layouts[j]):
                    area_i_1 += self.__overlapped_area(b, layouts[j], False)
            if area_i > area_i_1:
                layouts.pop(j)
            else:
                layouts.pop(i)
        return layouts
    def __table_tsr(self, images):
        """Run table-structure recognition on cropped table images.

        Normalizes the detected components: rows/headers are stretched to a
        common left/right edge, columns to a common top/bottom edge, so the
        grid lines up. Returns one list of component dicts per table image.
        """
        tbls = self.tbl_det(images, thr=0.5)
        res = []
        # align left&right for rows, align top&bottom for columns
        for tbl in tbls:
            lts = [{"label": b["type"],
                    "score": b["score"],
                    "x0": b["bbox"][0], "x1": b["bbox"][2],
                    "top": b["bbox"][1], "bottom": b["bbox"][-1]
                    } for b in tbl]
            if not lts:
                continue
            # find(...) > 0 requires the keyword at position >= 1, so the
            # bare "table" label is excluded while "table row" /
            # "table column header" match.
            left = [b["x0"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            right = [b["x1"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            if not left:
                continue
            # Median is robust with enough samples; min/max otherwise.
            left = np.median(left) if len(left) > 4 else np.min(left)
            right = np.median(right) if len(right) > 4 else np.max(right)
            # Widen every row/header to the consensus edges.
            for b in lts:
                if b["label"].find("row") > 0 or b["label"].find("header") > 0:
                    if b["x0"] > left:
                        b["x0"] = left
                    if b["x1"] < right:
                        b["x1"] = right
            top = [b["top"] for b in lts if b["label"] == "table column"]
            bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
            if not top:
                res.append(lts)
                continue
            top = np.median(top) if len(top) > 4 else np.min(top)
            bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
            # Stretch every column to the consensus top/bottom.
            for b in lts:
                if b["label"] == "table column":
                    if b["top"] > top:
                        b["top"] = top
                    if b["bottom"] < bottom:
                        b["bottom"] = bottom
            res.append(lts)
        return res
    def _table_transformer_job(self, ZM):
        """Detect table structure on every table layout region and tag the
        text boxes inside tables with row/header/column/span indices.

        ZM: zoom factor between page image pixels and PDF coordinates.
        Reads self.page_layout / self.page_images / self.boxes /
        self.page_cum_height; fills self.tb_cpns and annotates self.boxes
        with "R", "H", "C", "SP" tags plus their bounding edges.
        """
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]  # running count of tables per page (for cumsum below)
        MARGIN = 10  # extra PDF-units cropped around each table
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                # Crop the table region (with margin) in image pixels.
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))
        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.__table_tsr(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    # Shift from crop-local pixels back to page pixels,
                    # then divide by ZM into PDF coordinates, then add the
                    # cumulative page height for absolute Y.
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            # Collect components whose label matches kwd, de-duplicate and
            # return them in strict Y order.
            eles = self.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
            return self.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            # Row membership.
            ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]
            # Header membership.
            ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii
            # Column membership.
            ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]
            # Spanning-cell membership (reuses the H_* edge keys).
            ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr(self, pagenum, img, chars, ZM=3):
        """OCR one page image and merge the PDF's own characters into the
        recognized line boxes.

        pagenum: 1-based page number stored on each box.
        img: page image (PIL); ZM: zoom factor between image and PDF units.
        chars: pdfplumber char dicts for the page.
        Appends the page's box list to self.boxes; chars that fit no OCR
        box (or mismatch its height) go to self.lefted_chars.
        """
        bxs = self.ocr(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        # Keep (quad, text) per OCR line; drop degenerate quads.
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = self.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )
        # merge chars in the same rect
        for c in self.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = self.__find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            # Reject chars whose height differs wildly from the OCR box
            # (likely a different line), except plain spaces.
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            # Only keep a space after latin/digit/punctuation text.
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]
        # Boxes with no PDF chars fall back to the OCR transcription.
        for b in bxs:
            if not b["text"]:
                b["text"] = b["txt"]
            del b["txt"]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
    def _layouts_rec(self, ZM):
        """Run the layout detector over all pages and tag every text box
        with its layout type.

        ZM: zoom factor between page image pixels and PDF coordinates.
        Fills self.page_layout, rewrites self.boxes as one flat tagged
        list, drops garbage (repeated header/footer/reference text), and
        converts tops/bottoms to cumulative document-Y coordinates.
        """
        assert len(self.page_images) == len(self.boxes)
        # Tag layout type
        boxes = []
        layouts = self.layouter(self.page_images)
        #save_results(self.page_images, layouts, self.layout_labels, output_dir='output/', threshold=0.7)
        assert len(self.page_images) == len(layouts)
        for pn, lts in enumerate(layouts):
            bxs = self.boxes[pn]
            # NOTE(review): "page_number": pn here is 0-based while boxes
            # built by __ocr use a 1-based page number — confirm intent.
            lts = [{"type": b["type"],
                    "score": float(b["score"]),
                    "x0": b["bbox"][0] / ZM, "x1": b["bbox"][2] / ZM,
                    "top": b["bbox"][1] / ZM, "bottom": b["bbox"][-1] / ZM,
                    "page_number": pn,
                    } for b in lts]
            lts = self.sort_Y_firstly(lts, self.mean_height[pn] / 2)
            lts = self.__layouts_cleanup(bxs, lts)
            self.page_layout.append(lts)

            # Tag layout type, layouts are ready
            def findLayout(ty):
                # Assign layout type `ty` to every untagged box overlapping
                # one of the `ty` regions; header/footer/reference text is
                # collected as garbage candidates instead.
                nonlocal bxs, lts
                lts_ = [lt for lt in lts if lt["type"] == ty]
                i = 0
                while i < len(bxs):
                    if bxs[i].get("layout_type"):
                        i += 1
                        continue
                    if self._is_garbage(bxs[i]):
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    ii = self.__find_overlapped_with_threashold(bxs[i], lts_,
                                                                thr=0.4)
                    if ii is None:  # belong to nothing
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue
                    lts_[ii]["visited"] = True
                    if lts_[ii]["type"] in ["footer", "header", "reference"]:
                        if lts_[ii]["type"] not in self.garbages:
                            self.garbages[lts_[ii]["type"]] = []
                        self.garbages[lts_[ii]["type"]].append(bxs[i]["text"])
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    bxs[i]["layoutno"] = f"{ty}-{ii}"
                    bxs[i]["layout_type"] = lts_[ii]["type"]
                    i += 1

            # Priority order: furniture first, content types last.
            for lt in ["footer", "header", "reference", "figure caption",
                       "table caption", "title", "text", "table", "figure"]:
                findLayout(lt)

            # add box to figure layouts which has not text box
            for i, lt in enumerate(
                    [lt for lt in lts if lt["type"] == "figure"]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                del lt["type"]
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)
            boxes.extend(bxs)
        self.boxes = boxes

        # Text seen more than once in header/footer/reference regions is
        # treated as repeated page furniture and removed everywhere.
        garbage = set()
        for k in self.garbages.keys():
            self.garbages[k] = Counter(self.garbages[k])
            for g, c in self.garbages[k].items():
                if c > 1:
                    garbage.add(g)
        logging.debug("GARBAGE:" + ",".join(garbage))
        self.boxes = [b for b in self.boxes if b["text"].strip() not in garbage]
        # cumlative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
    def _text_merge(self):
        """Horizontally merge adjacent boxes belonging to the same layout
        element. Mutates self.boxes in place."""
        # merge adjusted boxes
        bxs = self.boxes

        def end_with(b, txt):
            # True if the box text ends with txt (whitespace-insensitive).
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            # True if the box text starts with any candidate in txts
            # (a string argument is iterated character by character).
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent box with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            # Different defaults ("0" vs "1") guarantee boxes with no
            # layoutno never compare equal.
            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
                i += 1
                continue
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                # Non-text fragments only merge when punctuation shows the
                # content continues; allow up to 8 units of overlap then.
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue
            # NOTE(review): the merge requires dis >= dis_thr where
            # dis = b.x1 - b_.x0 (overlap amount) — confirm the intended
            # direction of this comparison against upstream history.
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
    def _naive_vertical_merge(self):
        """Greedily merge vertically adjacent boxes into paragraph-like
        units using punctuation and layout heuristics. Replaces self.boxes
        with the merged, Y-sorted list."""
        bxs = self.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            # Drop page-trailing boxes that are only page numbers/bullets.
            if b["page_number"] < b_["page_number"] and re.match(r"[0-9  •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            # Evidence FOR concatenation: punctuation implying continuation.
            # NOTE(review): the third test reads b["text"] but semantically
            # looks like it should inspect b_["text"] (the next box) —
            # confirm before changing.
            # NOTE(review): the [-1]/[0] indexing raises IndexError when a
            # box text is empty/whitespace-only — guard upstream if needed.
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                b["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concating
            feats = [
                # NOTE(review): b is compared against itself here (always
                # False); the second operand was presumably meant to be b_.
                b.get("layoutno", 0) != b.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                # Large vertical gap on the same page.
                b["page_number"] == b_["page_number"] and b_["top"] - \
                    b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                # Page break combined with a big horizontal shift.
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
            ]
            if any(feats) and not any(concatting_feats):
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
    def _concat_downward(self, concat_between_pages=True):
        """Chain boxes downward into logical text blocks.

        First annotates every box with ``in_row`` (how many neighbours share
        its row).  Then greedily chains boxes top-down via a recursive dfs:
        cheap rules handle same-layout neighbours, otherwise an xgboost
        model (``self.updown_cnt_mdl``) decides whether two boxes continue
        each other.  Finally each chain is flattened into one box and
        ``self.boxes`` is rewritten sorted top-to-bottom.

        Args:
            concat_between_pages: allow a chain to cross a page boundary.
        """
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            # Only look at a window of 12 boxes on either side.
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                # Vertical distance normalized by the page's mean char height.
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    # Boxes are Y-sorted: once clearly below, stop scanning.
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                # Collect `up` into the current chain, then try to find its
                # continuation among the next (up to 12) boxes.
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    # Too far below (same page: 4 line-heights; other page: 16).
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    # Different "R" row tag and no trailing comma: not a continuation.
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    # Skip page-number-like "NN/NNN" fragments.
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    # Horizontally disjoint by more than 10 mean char widths.
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    # Near neighbour in the same text layout: concat without
                    # consulting the model.
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    # Otherwise let the xgboost up/down model decide.
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # Insert a space when joining two alphanumeric fragments.
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = self.sort_Y_firstly(boxes, 0)
    def _filter_forpages(self):
        """Drop table-of-contents / acknowledgement content from self.boxes.

        Looks for a box whose text is a ToC-style heading, removes it, then
        removes the following boxes that share the ToC-entry prefix.  If no
        such heading is found, falls back to dropping whole pages dominated
        by dot-leader lines ("····"), which are typically ToC pages.
        """
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            # Heading match is done with all space variants stripped out.
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            # English ToC entries are keyed by their first two words,
            # Chinese ones by their first three characters.
            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes): break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
            # Skip over empty boxes until a usable prefix is found.
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes): break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix: break
            # Remove everything up to the next box starting with the prefix
            # (scanning at most 128 boxes ahead).
            # NOTE(review): `prefix` is used as a regex pattern; a prefix
            # containing metacharacters (e.g. "(") could raise or mis-match
            # — confirm.
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j): self.boxes.pop(i)
                break
        if findit:return

        # Fallback: count dot-leader runs per page; pages with more than 3
        # such lines are treated as ToC pages and removed wholesale.
        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"]-1] += 1
        page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty: return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1
    def _merge_with_same_bullet(self):
        """Merge consecutive boxes that start with the same bullet symbol.

        Two consecutive boxes opening with an identical leading character
        that is neither an ASCII letter nor a Chinese character (i.e. a
        bullet-like symbol) are joined with a newline into the lower box.
        Blank boxes are discarded along the way.  Mutates self.boxes.
        """
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            # Discard blank boxes.
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i+1)
                continue
            # Skip unless both start with the same bullet-like character and
            # the boxes are in top-to-bottom order.
            # NOTE(review): the letter set omits "i" — confirm whether that
            # is deliberate.
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or huqie.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            # Fold the upper box into the lower one; the merged box stays at
            # position i+1 and becomes the next iteration's b_.
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
  827. def _blockType(self, b):
  828. patt = [
  829. ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  830. (r"^(20|19)[0-9]{2}年$", "Dt"),
  831. (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
  832. ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
  833. (r"^第*[一二三四1-4]季度$", "Dt"),
  834. (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
  835. (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
  836. ("^[0-9.,+%/ -]+$", "Nu"),
  837. (r"^[0-9A-Z/\._~-]+$", "Ca"),
  838. (r"^[A-Z]*[a-z' -]+$", "En"),
  839. (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
  840. (r"^.{1}$", "Sg")
  841. ]
  842. for p, n in patt:
  843. if re.search(p, b["text"].strip()):
  844. return n
  845. tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
  846. if len(tks) > 3:
  847. if len(tks) < 12:
  848. return "Tx"
  849. else:
  850. return "Lx"
  851. if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
  852. return "Nr"
  853. return "Ot"
    def __cal_spans(self, boxes, rows, cols, tbl, html=True):
        """Resolve row/column spans for cells flagged as spanning ("SP").

        Uses the mean extents of each row/column cluster to decide which
        rows/cols a spanning cell covers, then rewrites ``tbl`` so the
        covered cells are merged into the top-left cell of the span (the
        swallowed cells become None when html=True).  Returns ``tbl``.
        """
        # calculate span
        # Mean horizontal extent of each column cluster and vertical extent
        # of each row cluster.
        clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
                for cln in cols]
        crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
                for cln in cols]
        rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
                for row in rows]
        rbtm = [np.mean([c.get("R_btm", c["bottom"])
                         for c in row]) for row in rows]
        for b in boxes:
            if "SP" not in b:
                continue
            b["colspan"] = [b["cn"]]
            b["rowspan"] = [b["rn"]]
            # col span
            # Column j is covered when its midpoint lies within the cell's
            # header extent [H_left, H_right].
            for j in range(0, len(clft)):
                if j == b["cn"]:
                    continue
                if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
                    continue
                if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
                    continue
                b["colspan"].append(j)
            # row span
            for j in range(0, len(rtop)):
                if j == b["rn"]:
                    continue
                if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
                    continue
                if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
                    continue
                b["rowspan"].append(j)

        def join(arr):
            # Concatenate the texts of a cell's boxes ("" for empty cells).
            if not arr:
                return ""
            return "".join([t["text"] for t in arr])

        # rm the spaning cells
        for i in range(len(tbl)):
            for j, arr in enumerate(tbl[i]):
                if not arr:
                    continue
                if all(["rowspan" not in a and "colspan" not in a for a in arr]):
                    continue
                # Union of all span index lists carried by this cell's boxes.
                rowspan, colspan = [], []
                for a in arr:
                    if isinstance(a.get("rowspan", 0), list):
                        rowspan.extend(a["rowspan"])
                    if isinstance(a.get("colspan", 0), list):
                        colspan.extend(a["colspan"])
                rowspan, colspan = set(rowspan), set(colspan)
                # Degenerate span (covers < 2 rows and < 2 cols): drop marks.
                if len(rowspan) < 2 and len(colspan) < 2:
                    for a in arr:
                        if "rowspan" in a:
                            del a["rowspan"]
                        if "colspan" in a:
                            del a["colspan"]
                    continue
                # Make the span a contiguous rectangle that must contain the
                # current cell (i, j).
                rowspan, colspan = sorted(rowspan), sorted(colspan)
                rowspan = list(range(rowspan[0], rowspan[-1] + 1))
                colspan = list(range(colspan[0], colspan[-1] + 1))
                assert i in rowspan, rowspan
                assert j in colspan, colspan
                # Collect the distinct content of every covered cell, then
                # blank the covered cells out.
                arr = []
                for r in rowspan:
                    for c in colspan:
                        arr_txt = join(arr)
                        if tbl[r][c] and join(tbl[r][c]) != arr_txt:
                            arr.extend(tbl[r][c])
                        tbl[r][c] = None if html else arr
                # Replace the list-valued span markers with plain counts.
                for a in arr:
                    if len(rowspan) > 1:
                        a["rowspan"] = len(rowspan)
                    elif "rowspan" in a:
                        del a["rowspan"]
                    if len(colspan) > 1:
                        a["colspan"] = len(colspan)
                    elif "colspan" in a:
                        del a["colspan"]
                # The merged content lives in the span's top-left cell.
                tbl[rowspan[0]][colspan[0]] = arr
        return tbl
    def __construct_table(self, boxes, html=False):
        """Assemble table boxes into a grid and render the table.

        Caption boxes are peeled off first; remaining boxes are clustered
        into rows (by the "R" row tag / vertical position) and columns (by
        the "C" column tag / horizontal position), lone cells are relocated
        into a neighbouring row/column, header rows are detected, and the
        result is rendered via __html_table (html=True) or __desc_table.
        """
        cap = ""
        i = 0
        while i < len(boxes):
            # Accumulate caption text and remove the caption boxes.
            if self.is_caption(boxes[i]):
                cap += boxes[i]["text"]
                boxes.pop(i)
                i -= 1
            i += 1
        if not boxes:
            return []
        for b in boxes:
            b["btype"] = self._blockType(b)
        # Dominant content type over all cells (drives header detection).
        max_type = Counter([b["btype"] for b in boxes]).items()
        max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
        logging.debug("MAXTYPE: " + max_type)

        # ---- cluster boxes into rows ----
        rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
        rowh = np.min(rowh) if rowh else 0
        # boxes = self.sort_Y_firstly(boxes, rowh/5)
        boxes = self.sort_R_firstly(boxes, rowh / 2)
        boxes[0]["rn"] = 0
        rows = [[boxes[0]]]
        btm = boxes[0]["bottom"]
        for b in boxes[1:]:
            b["rn"] = len(rows) - 1
            lst_r = rows[-1]
            # Start a new row on an "R" tag change, or when the box sits
            # below the running bottom and neither box carries an "R" tag.
            if lst_r[-1].get("R", "") != b.get("R", "") \
                    or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
                        ):  # new row
                btm = b["bottom"]
                b["rn"] += 1
                rows.append([b])
                continue
            btm = (btm + b["bottom"]) / 2.
            rows[-1].append(b)

        # ---- cluster boxes into columns ----
        colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
        colwm = np.min(colwm) if colwm else 0
        crosspage = len(set([b["page_number"] for b in boxes])) > 1
        if crosspage:
            boxes = self.sort_X_firstly(boxes, colwm / 2, False)
        else:
            boxes = self.sort_C_firstly(boxes, colwm / 2)
        boxes[0]["cn"] = 0
        cols = [[boxes[0]]]
        right = boxes[0]["x1"]
        for b in boxes[1:]:
            b["cn"] = len(cols) - 1
            lst_c = cols[-1]
            # New column when the "C" tag increments on the same page, or
            # when the box starts right of the running right edge.
            if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
                    "page_number"]) \
                    or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")):  # new col
                right = b["x1"]
                b["cn"] += 1
                cols.append([b])
                continue
            right = (right + b["x1"]) / 2.
            cols[-1].append(b)

        # Fill the grid: tbl[row][col] is a list of boxes per cell.
        tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
        for b in boxes:
            tbl[b["rn"]][b["cn"]].append(b)

        if len(rows) >= 4:
            # remove single in column
            # A column holding exactly one cell whose horizontal neighbours
            # are empty gets merged into the nearer neighbouring column.
            j = 0
            while j < len(tbl[0]):
                e, ii = 0, 0
                for i in range(len(tbl)):
                    if tbl[i][j]:
                        e += 1
                        ii = i
                    if e > 1:
                        break
                if e > 1:
                    j += 1
                    continue
                f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
                     [j - 1][0].get("text")) or j == 0
                ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
                      [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
                if f and ff:
                    j += 1
                    continue
                bx = tbl[ii][j][0]
                logging.debug("Relocate column single: " + bx["text"])
                # j column only has one value
                left, right = 100000, 100000
                # Horizontal gap to the nearest content on either side.
                if j > 0 and not f:
                    for i in range(len(tbl)):
                        if tbl[i][j - 1]:
                            left = min(left, np.min(
                                [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
                if j + 1 < len(tbl[0]) and not ff:
                    for i in range(len(tbl)):
                        if tbl[i][j + 1]:
                            right = min(right, np.min(
                                [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
                assert left < 100000 or right < 100000
                if left < right:
                    # Merge into the left neighbour, shift cn of later cols.
                    for jj in range(j, len(tbl[0])):
                        for i in range(len(tbl)):
                            for a in tbl[i][jj]:
                                a["cn"] -= 1
                    if tbl[ii][j - 1]:
                        tbl[ii][j - 1].extend(tbl[ii][j])
                    else:
                        tbl[ii][j - 1] = tbl[ii][j]
                    for i in range(len(tbl)):
                        tbl[i].pop(j)
                else:
                    # Merge into the right neighbour.
                    for jj in range(j + 1, len(tbl[0])):
                        for i in range(len(tbl)):
                            for a in tbl[i][jj]:
                                a["cn"] -= 1
                    if tbl[ii][j + 1]:
                        tbl[ii][j + 1].extend(tbl[ii][j])
                    else:
                        tbl[ii][j + 1] = tbl[ii][j]
                    for i in range(len(tbl)):
                        tbl[i].pop(j)
                cols.pop(j)
        assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (
            len(cols), len(tbl[0]))

        if len(cols) >= 4:
            # remove single in row
            # Same idea as above, applied to rows with a single cell.
            i = 0
            while i < len(tbl):
                e, jj = 0, 0
                for j in range(len(tbl[i])):
                    if tbl[i][j]:
                        e += 1
                        jj = j
                    if e > 1:
                        break
                if e > 1:
                    i += 1
                    continue
                f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
                     [jj][0].get("text")) or i == 0
                ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
                      [jj][0].get("text")) or i + 1 >= len(tbl)
                if f and ff:
                    i += 1
                    continue
                bx = tbl[i][jj][0]
                logging.debug("Relocate row single: " + bx["text"])
                # i row only has one value
                up, down = 100000, 100000
                if i > 0 and not f:
                    for j in range(len(tbl[i - 1])):
                        if tbl[i - 1][j]:
                            up = min(up, np.min(
                                [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
                if i + 1 < len(tbl) and not ff:
                    for j in range(len(tbl[i + 1])):
                        if tbl[i + 1][j]:
                            down = min(down, np.min(
                                [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
                assert up < 100000 or down < 100000
                if up < down:
                    for ii in range(i, len(tbl)):
                        for j in range(len(tbl[ii])):
                            for a in tbl[ii][j]:
                                a["rn"] -= 1
                    if tbl[i - 1][jj]:
                        tbl[i - 1][jj].extend(tbl[i][jj])
                    else:
                        tbl[i - 1][jj] = tbl[i][jj]
                    tbl.pop(i)
                else:
                    for ii in range(i + 1, len(tbl)):
                        for j in range(len(tbl[ii])):
                            for a in tbl[ii][j]:
                                a["rn"] -= 1
                    if tbl[i + 1][jj]:
                        tbl[i + 1][jj].extend(tbl[i][jj])
                    else:
                        tbl[i + 1][jj] = tbl[i][jj]
                    tbl.pop(i)
                rows.pop(i)

        # which rows are headers
        # A row is a header when more than half of its non-empty cells carry
        # an "H" mark or deviate from the dominant ("Nu") content type.
        # NOTE(review): assumes every row has at least one non-empty cell
        # (cnt > 0) — confirm.
        hdset = set([])
        for i in range(len(tbl)):
            cnt, h = 0, 0
            for j, arr in enumerate(tbl[i]):
                if not arr:
                    continue
                cnt += 1
                if max_type == "Nu" and arr[0]["btype"] == "Nu":
                    continue
                if any([a.get("H") for a in arr]) \
                        or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                    h += 1
            if h / cnt > 0.5:
                hdset.add(i)

        if html:
            return [self.__html_table(cap, hdset,
                                      self.__cal_spans(boxes, rows,
                                                       cols, tbl, True)
                                      )]

        return self.__desc_table(cap, hdset,
                                 self.__cal_spans(boxes, rows, cols, tbl, False))
    def __html_table(self, cap, hdset, tbl):
        """Render the span-resolved table grid as an HTML string.

        Args:
            cap: caption text ("" for none).
            hdset: set of row indices treated as header rows.
            tbl: grid from __cal_spans(html=True); cells swallowed by a
                span are None.
        """
        # construct HTML
        html = "<table>"
        if cap:
            html += f"<caption>{cap}</caption>"
        for i in range(len(tbl)):
            row = "<tr>"
            txts = []
            for j, arr in enumerate(tbl[i]):
                if arr is None:
                    # Cell swallowed by a row/col span: emit nothing.
                    continue
                if not arr:
                    row += "<td></td>" if i not in hdset else "<th></th>"
                    continue
                txt = ""
                if arr:
                    # Sort the cell's boxes top-to-bottom before joining.
                    h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2,
                            self.mean_height[arr[0]["page_number"] - 1] / 2)
                    txt = "".join([c["text"]
                                   for c in self.sort_Y_firstly(arr, h)])
                txts.append(txt)
                sp = ""
                # NOTE(review): span attribute values are emitted unquoted
                # (e.g. colspan=2) — confirm downstream consumers accept it.
                if arr[0].get("colspan"):
                    sp = "colspan={}".format(arr[0]["colspan"])
                if arr[0].get("rowspan"):
                    sp += " rowspan={}".format(arr[0]["rowspan"])
                if i in hdset:
                    row += f"<th {sp} >" + txt + "</th>"
                else:
                    row += f"<td {sp} >" + txt + "</td>"

            if i in hdset:
                # NOTE(review): cell *texts* are added to hdset, which
                # otherwise holds row indices; the membership test above
                # mixes the two kinds — confirm this is intended.
                if all([t in hdset for t in txts]):
                    continue
                for t in txts:
                    hdset.add(t)

            if row != "<tr>":
                row += "</tr>"
            else:
                row = ""
            html += "\n" + row
        html += "\n</table>"
        return html
    def __desc_table(self, cap, hdr_rowno, tbl):
        """Render the table as natural-language row descriptions.

        Each data cell becomes "<header>:<value>"; the cells of one row are
        joined with "; " and the caption (if any) is appended to every
        output line.  Returns the list of row-description strings.
        """
        # get text of every colomn in header row to become header text
        clmno = len(tbl[0])
        rowno = len(tbl)
        headers = {}
        hdrset = set()
        lst_hdr = []
        # Connector used when chaining stacked header rows.
        de = "的" if not self.is_english else " for "
        for r in sorted(list(hdr_rowno)):
            headers[r] = ["" for _ in range(clmno)]
            for i in range(clmno):
                if not tbl[r][i]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[r][i]])
                headers[r][i] = txt
                hdrset.add(txt)
            # A header row with no text at all is demoted to a data row.
            if all([not t for t in headers[r]]):
                del headers[r]
                hdr_rowno.remove(r)
                continue
            # Fill empty header cells from the previous header row.
            for j in range(clmno):
                if headers[r][j]:
                    continue
                if j >= len(lst_hdr):
                    break
                headers[r][j] = lst_hdr[j]
            lst_hdr = headers[r]
        # Chain consecutive header rows: child headers absorb their parent
        # header text ("parent<de>child").
        for i in range(rowno):
            if i not in hdr_rowno:
                continue
            for j in range(i + 1, rowno):
                if j not in hdr_rowno:
                    break
                for k in range(clmno):
                    if not headers[j - 1][k]:
                        continue
                    if headers[j][k].find(headers[j - 1][k]) >= 0:
                        continue
                    if len(headers[j][k]) > len(headers[j - 1][k]):
                        headers[j][k] += (de if headers[j][k]
                                          else "") + headers[j - 1][k]
                    else:
                        headers[j][k] = headers[j - 1][k] \
                            + (de if headers[j - 1][k] else "") \
                            + headers[j][k]

        logging.debug(
            f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")

        row_txt = []
        for i in range(rowno):
            if i in hdr_rowno:
                continue
            rtxt = []

            def append(delimer):
                # Join the current row's cells; glue a short row onto the
                # previous output line (< 64 chars) to keep chunks compact.
                nonlocal rtxt, row_txt
                rtxt = delimer.join(rtxt)
                if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
                    row_txt[-1] += "\n" + rtxt
                else:
                    row_txt.append(rtxt)

            # Pick the nearest header row above row i.
            r = 0
            if len(headers.items()):
                _arr = [(i - r, r) for r, _ in headers.items() if r < i]
                if _arr:
                    _, r = min(_arr, key=lambda x: x[0])

            if r not in headers and clmno <= 2:
                # No usable header and a narrow table: emit values only.
                for j in range(clmno):
                    if not tbl[i][j]:
                        continue
                    txt = "".join([a["text"].strip() for a in tbl[i][j]])
                    if txt:
                        rtxt.append(txt)
                if rtxt:
                    append(":")
                continue

            for j in range(clmno):
                if not tbl[i][j]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[i][j]])
                if not txt:
                    continue
                ctt = headers[r][j] if r in headers else ""
                if ctt:
                    ctt += ":"
                ctt += txt
                if ctt:
                    rtxt.append(ctt)

            if rtxt:
                row_txt.append("; ".join(rtxt))

        if cap:
            if self.is_english:
                from_ = " in "
            else:
                from_ = "来自"
            row_txt = [t + f"\t——{from_}“{cap}”" for t in row_txt]
        return row_txt
  1272. @staticmethod
  1273. def is_caption(bx):
  1274. patt = [
  1275. r"[图表]+[ 0-9::]{2,}"
  1276. ]
  1277. if any([re.match(p, bx["text"].strip()) for p in patt]) \
  1278. or bx["layout_type"].find("caption") >= 0:
  1279. return True
  1280. return False
  1281. def _extract_table_figure(self, need_image, ZM, return_html):
  1282. tables = {}
  1283. figures = {}
  1284. # extract figure and table boxes
  1285. i = 0
  1286. lst_lout_no = ""
  1287. nomerge_lout_no = []
  1288. while i < len(self.boxes):
  1289. if "layoutno" not in self.boxes[i]:
  1290. i += 1
  1291. continue
  1292. lout_no = str(self.boxes[i]["page_number"]) + \
  1293. "-" + str(self.boxes[i]["layoutno"])
  1294. if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
  1295. "figure caption", "reference"]:
  1296. nomerge_lout_no.append(lst_lout_no)
  1297. if self.boxes[i]["layout_type"] == "table":
  1298. if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
  1299. self.boxes.pop(i)
  1300. continue
  1301. if lout_no not in tables:
  1302. tables[lout_no] = []
  1303. tables[lout_no].append(self.boxes[i])
  1304. self.boxes.pop(i)
  1305. lst_lout_no = lout_no
  1306. continue
  1307. if need_image and self.boxes[i]["layout_type"] == "figure":
  1308. if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
  1309. self.boxes.pop(i)
  1310. continue
  1311. if lout_no not in figures:
  1312. figures[lout_no] = []
  1313. figures[lout_no].append(self.boxes[i])
  1314. self.boxes.pop(i)
  1315. lst_lout_no = lout_no
  1316. continue
  1317. i += 1
  1318. # merge table on different pages
  1319. nomerge_lout_no = set(nomerge_lout_no)
  1320. tbls = sorted([(k, bxs) for k, bxs in tables.items()],
  1321. key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
  1322. i = len(tbls) - 1
  1323. while i - 1 >= 0:
  1324. k0, bxs0 = tbls[i - 1]
  1325. k, bxs = tbls[i]
  1326. i -= 1
  1327. if k0 in nomerge_lout_no:
  1328. continue
  1329. if bxs[0]["page_number"] == bxs0[0]["page_number"]:
  1330. continue
  1331. if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
  1332. continue
  1333. mh = self.mean_height[bxs[0]["page_number"] - 1]
  1334. if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
  1335. continue
  1336. tables[k0].extend(tables[k])
  1337. del tables[k]
  1338. def x_overlapped(a, b):
  1339. return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])
  1340. # find captions and pop out
  1341. i = 0
  1342. while i < len(self.boxes):
  1343. c = self.boxes[i]
  1344. # mh = self.mean_height[c["page_number"]-1]
  1345. if not self.is_caption(c):
  1346. i += 1
  1347. continue
  1348. # find the nearest layouts
  1349. def nearest(tbls):
  1350. nonlocal c
  1351. mink = ""
  1352. minv = 1000000000
  1353. for k, bxs in tbls.items():
  1354. for b in bxs[:10]:
  1355. if b.get("layout_type", "").find("caption") >= 0:
  1356. continue
  1357. y_dis = self._y_dis(c, b)
  1358. x_dis = self._x_dis(
  1359. c, b) if not x_overlapped(
  1360. c, b) else 0
  1361. dis = y_dis * y_dis + x_dis * x_dis
  1362. if dis < minv:
  1363. mink = k
  1364. minv = dis
  1365. return mink, minv
  1366. tk, tv = nearest(tables)
  1367. fk, fv = nearest(figures)
  1368. if min(tv, fv) > 2000:
  1369. i += 1
  1370. continue
  1371. if tv < fv:
  1372. tables[tk].insert(0, c)
  1373. logging.debug(
  1374. "TABLE:" +
  1375. self.boxes[i]["text"] +
  1376. "; Cap: " +
  1377. tk)
  1378. else:
  1379. figures[fk].insert(0, c)
  1380. logging.debug(
  1381. "FIGURE:" +
  1382. self.boxes[i]["text"] +
  1383. "; Cap: " +
  1384. tk)
  1385. self.boxes.pop(i)
  1386. res = []
  1387. def cropout(bxs, ltype):
  1388. nonlocal ZM
  1389. pn = set([b["page_number"] - 1 for b in bxs])
  1390. if len(pn) < 2:
  1391. pn = list(pn)[0]
  1392. ht = self.page_cum_height[pn]
  1393. b = {
  1394. "x0": np.min([b["x0"] for b in bxs]),
  1395. "top": np.min([b["top"] for b in bxs]) - ht,
  1396. "x1": np.max([b["x1"] for b in bxs]),
  1397. "bottom": np.max([b["bottom"] for b in bxs]) - ht
  1398. }
  1399. louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
  1400. ii = self.__find_overlapped(b, louts, naive=True)
  1401. if ii is not None:
  1402. b = louts[ii]
  1403. else:
  1404. logging.warn(
  1405. f"Missing layout match: {pn + 1},%s" %
  1406. (bxs[0].get(
  1407. "layoutno", "")))
  1408. left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
  1409. return self.page_images[pn] \
  1410. .crop((left * ZM, top * ZM,
  1411. right * ZM, bott * ZM))
  1412. pn = {}
  1413. for b in bxs:
  1414. p = b["page_number"] - 1
  1415. if p not in pn:
  1416. pn[p] = []
  1417. pn[p].append(b)
  1418. pn = sorted(pn.items(), key=lambda x: x[0])
  1419. imgs = [cropout(arr, ltype) for p, arr in pn]
  1420. pic = Image.new("RGB",
  1421. (int(np.max([i.size[0] for i in imgs])),
  1422. int(np.sum([m.size[1] for m in imgs]))),
  1423. (245, 245, 245))
  1424. height = 0
  1425. for img in imgs:
  1426. pic.paste(img, (0, int(height)))
  1427. height += img.size[1]
  1428. return pic
  1429. # crop figure out and add caption
  1430. for k, bxs in figures.items():
  1431. txt = "\n".join(
  1432. [b["text"] for b in bxs
  1433. if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
  1434. and len(b["text"].strip()) >= 4
  1435. ]
  1436. )
  1437. if not txt:
  1438. continue
  1439. res.append(
  1440. (cropout(
  1441. bxs,
  1442. "figure"),
  1443. [txt] if not return_html else [f"<p>{txt}</p>"]))
  1444. for k, bxs in tables.items():
  1445. if not bxs:
  1446. continue
  1447. res.append((cropout(bxs, "table"),
  1448. self.__construct_table(bxs, html=return_html)))
  1449. return res
  1450. def proj_match(self, line):
  1451. if len(line) <= 2:
  1452. return
  1453. if re.match(r"[0-9 ().,%%+/-]+$", line):
  1454. return False
  1455. for p, j in [
  1456. (r"第[零一二三四五六七八九十百]+章", 1),
  1457. (r"第[零一二三四五六七八九十百]+[条节]", 2),
  1458. (r"[零一二三四五六七八九十百]+[、  ]", 3),
  1459. (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
  1460. (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
  1461. (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
  1462. (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
  1463. (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
  1464. (r".{,48}[::??]$", 9),
  1465. (r"[0-9]+)", 10),
  1466. (r"[\((][0-9]+[)\)]", 11),
  1467. (r"[零一二三四五六七八九十百]+是", 12),
  1468. (r"[⚫•➢✓]", 12)
  1469. ]:
  1470. if re.match(p, line):
  1471. return j
  1472. return
  1473. def _line_tag(self, bx, ZM):
  1474. pn = [bx["page_number"]]
  1475. top = bx["top"] - self.page_cum_height[pn[0] - 1]
  1476. bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
  1477. while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
  1478. bott -= self.page_images[pn[-1] - 1].size[1] / ZM
  1479. pn.append(pn[-1] + 1)
  1480. return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
  1481. .format("-".join([str(p) for p in pn]),
  1482. bx["x0"], bx["x1"], top, bott)
    def __filterout_scraps(self, boxes, ZM):
        """Drop leftover fragment boxes and join the rest into tagged text.

        Boxes are greedily chained (dfs) into candidate paragraphs; a chain
        is kept when it starts with a heading-like line (proj_match or
        layout_type "title") or is wide enough on average, otherwise it is
        discarded as scrap.  Kept lines get positional tags via _line_tag.
        Returns the "\\n\\n"-joined paragraphs.
        """
        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            # A box is worth keeping when it belongs to a known layout, is
            # wider than 1/3 of the page, or taller than the page's mean
            # char height.
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            # Is the chain's first line heading-like?
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    # Never chain across a page boundary.
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    # Non-heading short line followed by a big vertical gap
                    # terminates the chain.
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break

                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10): \
                            # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception as e:
                # NOTE(review): any error raised inside dfs is silently
                # swallowed; the current box is dropped below either way.
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            # Keep the chain when it is heading-led or wide enough.
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)
  1546. @staticmethod
  1547. def total_page_number(fnm, binary=None):
  1548. try:
  1549. pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
  1550. return len(pdf.pages)
  1551. except Exception as e:
  1552. pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
  1553. return len(pdf)
  1554. def __images__(self, fnm, zoomin=3, page_from=0, page_to=299):
  1555. self.lefted_chars = []
  1556. self.mean_height = []
  1557. self.mean_width = []
  1558. self.boxes = []
  1559. self.garbages = {}
  1560. self.page_cum_height = [0]
  1561. self.page_layout = []
  1562. try:
  1563. self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
  1564. self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
  1565. enumerate(self.pdf.pages[page_from:page_to])]
  1566. self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
  1567. self.total_page = len(self.pdf.pages)
  1568. except Exception as e:
  1569. self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
  1570. self.page_images = []
  1571. self.page_chars = []
  1572. mat = fitz.Matrix(zoomin, zoomin)
  1573. self.total_page = len(self.pdf)
  1574. for i, page in enumerate(self.pdf):
  1575. if i < page_from:continue
  1576. if i >= page_to:break
  1577. pix = page.get_pixmap(matrix=mat)
  1578. img = Image.frombytes("RGB", [pix.width, pix.height],
  1579. pix.samples)
  1580. self.page_images.append(img)
  1581. self.page_chars.append([])
  1582. logging.info("Images converted.")
  1583. self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
  1584. if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
  1585. self.is_english = True
  1586. else:
  1587. self.is_english = False
  1588. for i, img in enumerate(self.page_images):
  1589. chars = self.page_chars[i] if not self.is_english else []
  1590. self.mean_height.append(
  1591. np.median(sorted([c["height"] for c in chars])) if chars else 0
  1592. )
  1593. self.mean_width.append(
  1594. np.median(sorted([c["width"] for c in chars])) if chars else 8
  1595. )
  1596. self.page_cum_height.append(img.size[1] / zoomin)
  1597. j = 0
  1598. while j + 1 < len(chars):
  1599. if chars[j]["text"] and chars[j + 1]["text"] \
  1600. and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
  1601. and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
  1602. chars[j]["width"]) / 2:
  1603. chars[j]["text"] += " "
  1604. j += 1
  1605. # if i > 0:
  1606. # if not chars:
  1607. # self.page_cum_height.append(img.size[1] / zoomin)
  1608. # else:
  1609. # self.page_cum_height.append(
  1610. # np.max([c["bottom"] for c in chars]))
  1611. self.__ocr(i + 1, img, chars, zoomin)
  1612. if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
  1613. bxes = [b for bxs in self.boxes for b in bxs]
  1614. self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
  1615. logging.info("Is it English:", self.is_english)
  1616. self.page_cum_height = np.cumsum(self.page_cum_height)
  1617. assert len(self.page_cum_height) == len(self.page_images) + 1
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        """Run the full parsing pipeline on a PDF.

        Args:
            fnm: file path or raw PDF bytes.
            need_image: extract figure images as well as tables.
            zoomin: rasterization zoom factor.
            return_html: render tables as HTML.

        Returns:
            (text, tables_and_figures): the filtered, position-tagged text
            and the list produced by _extract_table_figure.
        """
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(need_image, zoomin, return_html)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
  1627. def remove_tag(self, txt):
  1628. return re.sub(r"@@[\t0-9.-]+?##", "", txt)
    def crop(self, text, ZM=3):
        """Crop page-image snippets for every positional tag found in text.

        Parses "@@page(-page...)\\tx0\\tx1\\ttop\\tbottom##" tags (as written
        by _line_tag), crops the referenced region from each page it spans,
        and stacks all crops vertically into a single image with 2px gaps on
        a grey background.  Returns the PIL image, or None when the text
        contains no tags.
        """
        imgs = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            bottom *= ZM
            pns = [int(p) - 1 for p in pn.split("-")]
            # `bottom` is made cumulative over the pages before the first
            # crop, then reduced page by page while cropping.
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            # Crop the region's part on the first page.
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right *
                                               ZM, min(
                                                   bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            bottom -= self.page_images[pns[0]].size[1]
            # Crop the continuation on each following page (from its top).
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            return
        # Stack all crops vertically with a small gap between them.
        GAP = 2
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        pic = Image.new("RGB",
                        (int(np.max([i.size[0] for i in imgs])), height),
                        (245, 245, 245))
        height = 0
        for img in imgs:
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP
        return pic
if __name__ == "__main__":
    # Placeholder entry point: this module is meant to be used as a library.
    pass