You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638
  1. # -*- coding: utf-8 -*-
  2. import xgboost as xgb
  3. from io import BytesIO
  4. import torch
  5. import re
  6. import pdfplumber
  7. import logging
  8. from PIL import Image
  9. import numpy as np
  10. from rag.nlp import huqie
  11. from collections import Counter
  12. from copy import deepcopy
  13. from rag.cv.table_recognize import TableTransformer
  14. from rag.cv.ppdetection import PPDet
  15. from huggingface_hub import hf_hub_download
  16. logging.getLogger("pdfminer").setLevel(logging.WARNING)
  17. class HuParser:
    def __init__(self):
        """Load every model the parser depends on.

        Heavy side effects: instantiates PaddleOCR (Chinese), the layout
        detector and the table-structure recognizer, and downloads the
        XGBoost text-concatenation model from HuggingFace Hub.
        """
        from paddleocr import PaddleOCR  # local import: paddleocr is slow/heavy to load
        logging.getLogger("ppocr").setLevel(logging.ERROR)
        self.ocr = PaddleOCR(use_angle_cls=False, lang="ch")  # no angle classifier
        self.layouter = PPDet()  # page-layout detector
        self.tbl_det = TableTransformer()  # table-structure recognizer
        # binary classifier deciding whether two vertically adjacent text
        # boxes should be concatenated (features from _updown_concat_features)
        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                       filename="updown_concat_xgb.model"))
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
  37. def __char_width(self, c):
  38. return (c["x1"] - c["x0"]) // len(c["text"])
  39. def __height(self, c):
  40. return c["bottom"] - c["top"]
  41. def _x_dis(self, a, b):
  42. return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
  43. abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
  44. def _y_dis(
  45. self, a, b):
  46. return (
  47. b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
  48. def _match_proj(self, b):
  49. proj_patt = [
  50. r"第[零一二三四五六七八九十百]+章",
  51. r"第[零一二三四五六七八九十百]+[条节]",
  52. r"[零一二三四五六七八九十百]+[、是  ]",
  53. r"[\((][零一二三四五六七八九十百]+[)\)]",
  54. r"[\((][0-9]+[)\)]",
  55. r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
  56. r"[0-9]+\.[0-9.]+(、|\.[  ])",
  57. r"[⚫•➢①② ]",
  58. ]
  59. return any([re.match(p, b["text"]) for p in proj_patt])
    def _updown_concat_features(self, up, down):
        """Build the feature vector fed to the XGBoost model that decides
        whether box *up* should be concatenated with box *down* below it.

        Feature order is a contract with the trained model
        (updown_concat_xgb.model) — do not reorder or remove entries.
        Assumes both boxes carry "text", "layout_type", "page_number" and
        "in_row" keys, and non-empty text.
        """
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6  # chars taken from the junction of the two texts
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        # tokenize the junction; insert a space when both sides are
        # alphanumeric so latin words are not glued together
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),  # same table row
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            # up ends like a complete sentence
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            # up ends with a continuation character
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            # down starts with a continuation character
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # NOTE(review): duplicate of the previous feature — kept because
            # the trained model expects this exact vector length/order.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # up opens a bracket that down closes
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),  # down looks like a new list item
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
                                                                        ) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],  # down starts left of up's end (wrap)
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            # token-count deltas around the junction
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            # single-token part-of-speech checks (noun-ish tag)
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
  116. @staticmethod
  117. def sort_Y_firstly(arr, threashold):
  118. # sort using y1 first and then x1
  119. arr = sorted(arr, key=lambda r: (r["top"], r["x0"]))
  120. for i in range(len(arr) - 1):
  121. for j in range(i, -1, -1):
  122. # restore the order using th
  123. if abs(arr[j + 1]["top"] - arr[j]["top"]) < threashold \
  124. and arr[j + 1]["x0"] < arr[j]["x0"]:
  125. tmp = deepcopy(arr[j])
  126. arr[j] = deepcopy(arr[j + 1])
  127. arr[j + 1] = deepcopy(tmp)
  128. return arr
  129. @staticmethod
  130. def sort_R_firstly(arr, thr=0):
  131. # sort using y1 first and then x1
  132. # sorted(arr, key=lambda r: (r["top"], r["x0"]))
  133. arr = HuParser.sort_Y_firstly(arr, thr)
  134. for i in range(len(arr) - 1):
  135. for j in range(i, -1, -1):
  136. if "R" not in arr[j] or "R" not in arr[j + 1]:
  137. continue
  138. if arr[j + 1]["R"] < arr[j]["R"] \
  139. or (
  140. arr[j + 1]["R"] == arr[j]["R"]
  141. and arr[j + 1]["x0"] < arr[j]["x0"]
  142. ):
  143. tmp = arr[j]
  144. arr[j] = arr[j + 1]
  145. arr[j + 1] = tmp
  146. return arr
  147. @staticmethod
  148. def sort_X_firstly(arr, threashold, copy=True):
  149. # sort using y1 first and then x1
  150. arr = sorted(arr, key=lambda r: (r["x0"], r["top"]))
  151. for i in range(len(arr) - 1):
  152. for j in range(i, -1, -1):
  153. # restore the order using th
  154. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
  155. and arr[j + 1]["top"] < arr[j]["top"]:
  156. tmp = deepcopy(arr[j]) if copy else arr[j]
  157. arr[j] = deepcopy(arr[j + 1]) if copy else arr[j + 1]
  158. arr[j + 1] = deepcopy(tmp) if copy else tmp
  159. return arr
  160. @staticmethod
  161. def sort_C_firstly(arr, thr=0):
  162. # sort using y1 first and then x1
  163. # sorted(arr, key=lambda r: (r["x0"], r["top"]))
  164. arr = HuParser.sort_X_firstly(arr, thr)
  165. for i in range(len(arr) - 1):
  166. for j in range(i, -1, -1):
  167. # restore the order using th
  168. if "C" not in arr[j] or "C" not in arr[j + 1]:
  169. continue
  170. if arr[j + 1]["C"] < arr[j]["C"] \
  171. or (
  172. arr[j + 1]["C"] == arr[j]["C"]
  173. and arr[j + 1]["top"] < arr[j]["top"]
  174. ):
  175. tmp = arr[j]
  176. arr[j] = arr[j + 1]
  177. arr[j + 1] = tmp
  178. return arr
  179. return sorted(arr, key=lambda r: (r.get("C", r["x0"]), r["top"]))
  180. def _has_color(self, o):
  181. if o.get("ncs", "") == "DeviceGray":
  182. if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
  183. o["non_stroking_color"][0] == 1:
  184. if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
  185. return False
  186. return True
  187. def __overlapped_area(self, a, b, ratio=True):
  188. tp, btm, x0, x1 = a["top"], a["bottom"], a["x0"], a["x1"]
  189. if b["x0"] > x1 or b["x1"] < x0:
  190. return 0
  191. if b["bottom"] < tp or b["top"] > btm:
  192. return 0
  193. x0_ = max(b["x0"], x0)
  194. x1_ = min(b["x1"], x1)
  195. assert x0_ <= x1_, "Fuckedup! T:{},B:{},X0:{},X1:{} ==> {}".format(
  196. tp, btm, x0, x1, b)
  197. tp_ = max(b["top"], tp)
  198. btm_ = min(b["bottom"], btm)
  199. assert tp_ <= btm_, "Fuckedup! T:{},B:{},X0:{},X1:{} => {}".format(
  200. tp, btm, x0, x1, b)
  201. ov = (btm_ - tp_) * (x1_ - x0_) if x1 - \
  202. x0 != 0 and btm - tp != 0 else 0
  203. if ov > 0 and ratio:
  204. ov /= (x1 - x0) * (btm - tp)
  205. return ov
  206. def __find_overlapped_with_threashold(self, box, boxes, thr=0.3):
  207. if not boxes:
  208. return
  209. max_overlaped_i, max_overlaped, _max_overlaped = None, thr, 0
  210. s, e = 0, len(boxes)
  211. for i in range(s, e):
  212. ov = self.__overlapped_area(box, boxes[i])
  213. _ov = self.__overlapped_area(boxes[i], box)
  214. if (ov, _ov) < (max_overlaped, _max_overlaped):
  215. continue
  216. max_overlaped_i = i
  217. max_overlaped = ov
  218. _max_overlaped = _ov
  219. return max_overlaped_i
    def __find_overlapped(self, box, boxes_sorted_by_y, naive=False):
        """Return the index of the entry in *boxes_sorted_by_y* that *box*
        overlaps most, or None when the list is empty or nothing overlaps.

        A binary search over the y-sorted boxes narrows the scan window
        first; pass naive=True to scan the whole list instead.
        """
        if not boxes_sorted_by_y:
            return
        bxs = boxes_sorted_by_y
        s, e, ii = 0, len(bxs), 0
        # binary search for any box that vertically intersects `box`
        while s < e and not naive:
            ii = (e + s) // 2
            pv = bxs[ii]
            if box["bottom"] < pv["top"]:
                e = ii
                continue
            if box["top"] > pv["bottom"]:
                s = ii + 1
                continue
            break
        # NOTE(review): each of the two loops below runs at most one
        # iteration (unconditional break) — it trims a single non-
        # overlapping box from each side of the window. Presumably
        # intentional slack around the pivot; confirm before changing.
        while s < ii:
            if box["top"] > bxs[s]["bottom"]:
                s += 1
            break
        while e - 1 > ii:
            if box["bottom"] < bxs[e - 1]["top"]:
                e -= 1
            break
        # linear scan of the remaining window for the best overlap
        max_overlaped_i, max_overlaped = None, 0
        for i in range(s, e):
            ov = self.__overlapped_area(bxs[i], box)
            if ov <= max_overlaped:
                continue
            max_overlaped_i = i
            max_overlaped = ov
        return max_overlaped_i
  251. def _is_garbage(self, b):
  252. patt = [r"^•+$", r"(版权归©|免责条款|地址[::])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
  253. r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
  254. "(资料|数据)来源[::]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
  255. "\\(cid *: *[0-9]+ *\\)"
  256. ]
  257. return any([re.search(p, b["text"]) for p in patt])
    def __layouts_cleanup(self, boxes, layouts, far=2, thr=0.7):
        """Deduplicate overlapping layout regions of the same type.

        For each pair of same-type layouts within *far* positions of each
        other whose mutual overlap reaches *thr*, keep the better one:
        the higher detection score when both carry one, otherwise the one
        covering the larger total area of text *boxes*.  Mutates and
        returns *layouts*.
        """
        def notOverlapped(a, b):
            # true when the two rectangles are disjoint
            return any([a["x1"] < b["x0"],
                        a["x0"] > b["x1"],
                        a["bottom"] < b["top"],
                        a["top"] > b["bottom"]])

        i = 0
        while i + 1 < len(layouts):
            j = i + 1
            # find the next same-type overlapping layout within `far` slots
            while j < min(i + far, len(layouts)) \
                    and (layouts[i].get("type", "") != layouts[j].get("type", "")
                         or notOverlapped(layouts[i], layouts[j])):
                j += 1
            if j >= min(i + far, len(layouts)):
                i += 1
                continue
            # overlap must be substantial in at least one direction
            if self.__overlapped_area(layouts[i], layouts[j]) < thr \
                    and self.__overlapped_area(layouts[j], layouts[i]) < thr:
                i += 1
                continue
            if layouts[i].get("score") and layouts[j].get("score"):
                if layouts[i]["score"] > layouts[j]["score"]:
                    layouts.pop(j)
                else:
                    layouts.pop(i)
                continue
            # no scores: keep the layout covering more text-box area
            area_i, area_i_1 = 0, 0
            for b in boxes:
                if not notOverlapped(b, layouts[i]):
                    area_i += self.__overlapped_area(b, layouts[i], False)
                if not notOverlapped(b, layouts[j]):
                    area_i_1 += self.__overlapped_area(b, layouts[j], False)
            if area_i > area_i_1:
                layouts.pop(j)
            else:
                layouts.pop(i)
        return layouts
    def __table_paddle(self, images):
        """Run the table-structure model on cropped table *images* and
        normalize the detected components.

        Rows/headers are stretched to a common left/right edge, and
        columns to a common top/bottom edge (median when more than 4
        samples, min/max otherwise), so the grid lines up.  Returns one
        component list per table image.
        """
        tbls = self.tbl_det([img for img in images], threshold=0.5)
        res = []
        # align left&right for rows, align top&bottom for columns
        for tbl in tbls:
            lts = [{"label": b["type"],
                    "score": b["score"],
                    "x0": b["bbox"][0], "x1": b["bbox"][2],
                    "top": b["bbox"][1], "bottom": b["bbox"][-1]
                    } for b in tbl]
            if not lts:
                continue
            # common horizontal extent of all row/header components
            left = [b["x0"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            right = [b["x1"] for b in lts if b["label"].find(
                "row") > 0 or b["label"].find("header") > 0]
            if not left:
                continue
            left = np.median(left) if len(left) > 4 else np.min(left)
            right = np.median(right) if len(right) > 4 else np.max(right)
            for b in lts:
                if b["label"].find("row") > 0 or b["label"].find("header") > 0:
                    if b["x0"] > left:
                        b["x0"] = left
                    if b["x1"] < right:
                        b["x1"] = right
            # common vertical extent of all column components
            top = [b["top"] for b in lts if b["label"] == "table column"]
            bottom = [b["bottom"] for b in lts if b["label"] == "table column"]
            if not top:
                res.append(lts)
                continue
            top = np.median(top) if len(top) > 4 else np.min(top)
            bottom = np.median(bottom) if len(bottom) > 4 else np.max(bottom)
            for b in lts:
                if b["label"] == "table column":
                    if b["top"] > top:
                        b["top"] = top
                    if b["bottom"] < bottom:
                        b["bottom"] = bottom
            res.append(lts)
        return res
    def __table_transformer_job(self, ZM):
        """Detect table structure for every table layout on every page.

        Crops each table region (with a margin) from the page images,
        which are rendered at zoom factor *ZM*, runs the table-structure
        model on the crops, then maps the component coordinates back to
        unzoomed, cumulative-page space and stores them in self.tb_cpns.
        """
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]  # number of tables per page; later cumsum'd for slicing
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                # layout coords are unzoomed; scale up to image pixels
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))
        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.__table_paddle(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    # shift from crop-local back to page pixel coordinates
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM  # back to unzoomed units
                    # stack pages vertically into one global y-axis
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)
    def __ocr_paddle(self, pagenum, img, chars, ZM=3):
        """OCR one page image and merge pdf-extracted *chars* into the
        resulting OCR boxes.

        OCR box coordinates are divided by zoom *ZM* and y-sorted; each
        pdf char is appended to the text of the OCR box it overlaps (when
        their heights are similar), otherwise collected in
        self.lefted_chars.  Boxes that received no chars fall back to the
        raw OCR transcription.  Appends the page's box list to self.boxes.
        """
        bxs = self.ocr.ocr(np.array(img), cls=True)[0]
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]  # (quad points, text)
        bxs = self.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )
        # merge chars in the same rect
        for c in self.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = self.__find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            # char height differing >=70% from the box suggests a mismatch
            if abs(ch - bh) / max(ch, bh) >= 0.7:
                self.lefted_chars.append(c)
                continue
            bxs[ii]["text"] += c["text"]
        for b in bxs:
            # no pdf chars merged: fall back to the OCR transcription
            if not b["text"]:
                b["text"] = b["txt"]
            del b["txt"]
        # bootstrap the page's mean line height on first use
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
    def __layouts_paddle(self, ZM):
        """Detect page layouts and tag every OCR box with a layout type.

        Runs the layout detector over all page images, cleans duplicated
        regions, then assigns each box a layout_type/layoutno by overlap.
        Boxes inside footer/header/reference layouts (or matching garbage
        patterns) are removed and recorded in self.garbages.  Figure
        layouts containing no text get a synthetic empty box.  Finally
        flattens self.boxes from per-page lists into one flat list.
        """
        assert len(self.page_images) == len(self.boxes)
        # Tag layout type
        boxes = []
        layouts = self.layouter([np.array(img) for img in self.page_images])
        assert len(self.page_images) == len(layouts)
        for pn, lts in enumerate(layouts):
            bxs = self.boxes[pn]
            lts = [{"type": b["type"],
                    "score": float(b["score"]),
                    "x0": b["bbox"][0] / ZM, "x1": b["bbox"][2] / ZM,
                    "top": b["bbox"][1] / ZM, "bottom": b["bbox"][-1] / ZM,
                    # NOTE(review): 0-based here, while OCR boxes carry a
                    # 1-based page_number — confirm this is intended.
                    "page_number": pn,
                    } for b in lts]
            lts = self.sort_Y_firstly(lts, self.mean_height[pn] / 2)
            lts = self.__layouts_cleanup(bxs, lts)
            self.page_layout.append(lts)

            # Tag layout type, layouts are ready
            def findLayout(ty):
                """Assign layout type *ty* to every untagged box overlapping
                a layout of that type; drop garbage boxes on the way."""
                nonlocal bxs, lts
                lts_ = [lt for lt in lts if lt["type"] == ty]
                i = 0
                while i < len(bxs):
                    if bxs[i].get("layout_type"):
                        i += 1
                        continue
                    if self._is_garbage(bxs[i]):
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    ii = self.__find_overlapped_with_threashold(bxs[i], lts_,
                                                                thr=0.4)
                    if ii is None:  # belong to nothing
                        bxs[i]["layout_type"] = ""
                        i += 1
                        continue
                    lts_[ii]["visited"] = True
                    # boxes inside page furniture are removed, not tagged
                    if lts_[ii]["type"] in ["footer", "header", "reference"]:
                        if lts_[ii]["type"] not in self.garbages:
                            self.garbages[lts_[ii]["type"]] = []
                        self.garbages[lts_[ii]["type"]].append(bxs[i]["text"])
                        logging.debug("GARBAGE: " + bxs[i]["text"])
                        bxs.pop(i)
                        continue
                    bxs[i]["layoutno"] = f"{ty}-{ii}"
                    bxs[i]["layout_type"] = lts_[ii]["type"]
                    i += 1

            # priority order: garbage types claim their boxes first
            for lt in ["footer", "header", "reference", "figure caption",
                       "table caption", "title", "text", "table", "figure"]:
                findLayout(lt)
            # add box to figure layouts which has not text box
            for i, lt in enumerate(
                    [lt for lt in lts if lt["type"] == "figure"]):
                if lt.get("visited"):
                    continue
                lt = deepcopy(lt)
                del lt["type"]
                lt["text"] = ""
                lt["layout_type"] = "figure"
                lt["layoutno"] = f"figure-{i}"
                bxs.append(lt)
            boxes.extend(bxs)
        self.boxes = boxes
    def __text_merge(self, garbage):
        """Merge OCR boxes into text chunks.

        Stages:
          1. merge horizontally adjacent boxes within the same layout;
          2. count same-row neighbors ("in_row") and tag every table box
             with row/header/column/spanning info from self.tb_cpns;
          3. chain boxes downward (by layout number for nearby text,
             otherwise by the XGBoost concat model) and join each chain
             into a single box; the result replaces self.boxes, y-sorted.

        *garbage* is a collection of texts to drop while chaining.
        """
        # merge adjusted boxes
        bxs = self.boxes

        def end_with(b, txt):
            # True when b's stripped text ends with txt
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            # True when b's stripped text starts with any of txts
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
                i += 1
                continue
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                # non-text boxes only merge across an explicit continuation
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8  # allow some horizontal overlap
                else:
                    i += 1
                    continue
            # same line, acceptable gap, and b_ extends further right
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs

        # count boxes in the same row
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)  # only inspect the 12 neighbors on each side
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:  # j is already below i: stop scanning
                    break
                j += 1

        def gather(kwd, fzy=10, ption=0.6):
            # collect deduplicated table components whose label matches kwd
            eles = self.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = self.__layouts_cleanup(self.boxes, eles, 5, ption)
            return self.sort_Y_firstly(eles, 0)

        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = self.__layouts_cleanup(self.boxes, clmns, 5, 0.5)
        # attach row/header/column/span indices to every table box
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = self.__find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]
            ii = self.__find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii
            ii = self.__find_overlapped_with_threashold(b, clmns, thr=0.3)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]
            ii = self.__find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                # spanning cells overwrite the header extent on purpose
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                # follow `up` downward, collecting continuation boxes
                if not up["text"].strip() or up["text"].strip() in garbage:
                    return
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    # too far below: no continuation possible
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    # different table rows only continue after a comma
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    # "NN/NNN" page-number-like text never joins
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip() \
                            or down["text"].strip() in garbage:
                        i += 1
                        continue
                    # no horizontal proximity at all
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    # nearby text in the same layout chains unconditionally
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    # otherwise ask the XGBoost concatenation model
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # keep a space between joined latin/number words
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)
        self.boxes = self.sort_Y_firstly(boxes, 0)
  647. def __filter_forpages(self):
  648. if not self.boxes:
  649. return
  650. to = min(7, len(self.page_images) // 5)
  651. pg_hits = [0 for _ in range(to)]
  652. def possible(c):
  653. if c.get("layout_type", "") == "reference":
  654. return True
  655. if c["bottom"] - c["top"] >= 2 * \
  656. self.mean_height[c["page_number"] - 1]:
  657. return False
  658. if c["text"].find("....") >= 0 \
  659. or (c["x1"] - c["x0"] > 250 and re.search(r"[0-9]+$",
  660. c["text"].strip())):
  661. return True
  662. return self.is_caption(c) and re.search(
  663. r"[0-9]+$", c["text"].strip())
  664. for c in self.boxes:
  665. if c["page_number"] >= to:
  666. break
  667. if possible(c):
  668. pg_hits[c["page_number"] - 1] += 1
  669. st, ed = -1, -1
  670. for i in range(len(self.boxes)):
  671. c = self.boxes[i]
  672. if c["page_number"] >= to:
  673. break
  674. if pg_hits[c["page_number"] - 1] >= 3 and possible(c):
  675. if st < 0:
  676. st = i
  677. else:
  678. ed = i
  679. for _ in range(st, ed + 1):
  680. self.boxes.pop(st)
  681. def _blockType(self, b):
  682. patt = [
  683. ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
  684. (r"^(20|19)[0-9]{2}年$", "Dt"),
  685. (r"^(20|19)[0-9]{2}[年-][0-9]{1,2}月*$", "Dt"),
  686. ("^[0-9]{1,2}[月-][0-9]{1,2}日*$", "Dt"),
  687. (r"^第*[一二三四1-4]季度$", "Dt"),
  688. (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
  689. (r"^(20|19)[0-9]{2}[ABCDE]$", "Dt"),
  690. ("^[0-9.,+%/ -]+$", "Nu"),
  691. (r"^[0-9A-Z/\._~-]+$", "Ca"),
  692. (r"^[A-Z]*[a-z' -]+$", "En"),
  693. (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),
  694. (r"^.{1}$", "Sg")
  695. ]
  696. for p, n in patt:
  697. if re.search(p, b["text"].strip()):
  698. return n
  699. tks = [t for t in huqie.qie(b["text"]).split(" ") if len(t) > 1]
  700. if len(tks) > 3:
  701. if len(tks) < 12:
  702. return "Tx"
  703. else:
  704. return "Lx"
  705. if len(tks) == 1 and huqie.tag(tks[0]) == "nr":
  706. return "Nr"
  707. return "Ot"
    def __cal_spans(self, boxes, rows, cols, tbl, html=True):
        """Compute row/col spans for cells marked as spanning ("SP").

        Mean row/column boundaries decide which rows/cols a spanning
        header (H_* extent) covers; covered cells are then collapsed:
        their content is gathered into the span's top-left cell with
        rowspan/colspan counts, and the other covered cells are blanked
        (set to None when *html*).  Mutates and returns *tbl*.
        """
        # caculate span
        clft = [np.mean([c.get("C_left", c["x0"]) for c in cln])
                for cln in cols]
        crgt = [np.mean([c.get("C_right", c["x1"]) for c in cln])
                for cln in cols]
        rtop = [np.mean([c.get("R_top", c["top"]) for c in row])
                for row in rows]
        rbtm = [np.mean([c.get("R_btm", c["bottom"])
                         for c in row]) for row in rows]
        for b in boxes:
            if "SP" not in b:
                continue
            b["colspan"] = [b["cn"]]
            b["rowspan"] = [b["rn"]]
            # col span
            for j in range(0, len(clft)):
                if j == b["cn"]:
                    continue
                # column j is covered when its center lies inside the header
                if clft[j] + (crgt[j] - clft[j]) / 2 < b["H_left"]:
                    continue
                if crgt[j] - (crgt[j] - clft[j]) / 2 > b["H_right"]:
                    continue
                b["colspan"].append(j)
            # row span
            for j in range(0, len(rtop)):
                if j == b["rn"]:
                    continue
                if rtop[j] + (rbtm[j] - rtop[j]) / 2 < b["H_top"]:
                    continue
                if rbtm[j] - (rbtm[j] - rtop[j]) / 2 > b["H_bott"]:
                    continue
                b["rowspan"].append(j)

        def join(arr):
            # concatenated text of a cell's boxes ("" for empty cells)
            if not arr:
                return ""
            return "".join([t["text"] for t in arr])

        # rm the spaning cells
        for i in range(len(tbl)):
            for j, arr in enumerate(tbl[i]):
                if not arr:
                    continue
                if all(["rowspan" not in a and "colspan" not in a for a in arr]):
                    continue
                # union of all span lists carried by this cell's boxes
                rowspan, colspan = [], []
                for a in arr:
                    if isinstance(a.get("rowspan", 0), list):
                        rowspan.extend(a["rowspan"])
                    if isinstance(a.get("colspan", 0), list):
                        colspan.extend(a["colspan"])
                rowspan, colspan = set(rowspan), set(colspan)
                if len(rowspan) < 2 and len(colspan) < 2:
                    # span collapses to a single cell: drop the markers
                    for a in arr:
                        if "rowspan" in a:
                            del a["rowspan"]
                        if "colspan" in a:
                            del a["colspan"]
                    continue
                # expand to the full contiguous index range
                rowspan, colspan = sorted(rowspan), sorted(colspan)
                rowspan = list(range(rowspan[0], rowspan[-1] + 1))
                colspan = list(range(colspan[0], colspan[-1] + 1))
                assert i in rowspan, rowspan
                assert j in colspan, colspan
                arr = []
                # gather content of all covered cells (dedup by joined text)
                for r in rowspan:
                    for c in colspan:
                        arr_txt = join(arr)
                        if tbl[r][c] and join(tbl[r][c]) != arr_txt:
                            arr.extend(tbl[r][c])
                        tbl[r][c] = None if html else arr
                # replace list-valued markers with final integer counts
                for a in arr:
                    if len(rowspan) > 1:
                        a["rowspan"] = len(rowspan)
                    elif "rowspan" in a:
                        del a["rowspan"]
                    if len(colspan) > 1:
                        a["colspan"] = len(colspan)
                    elif "colspan" in a:
                        del a["colspan"]
                # the merged content lives in the span's top-left cell
                tbl[rowspan[0]][colspan[0]] = arr
        return tbl
def __construct_table(self, boxes, html=False):
    """Build a table from OCR boxes: strip captions, cluster boxes into
    rows and columns, relocate stray single-cell rows/columns, detect
    header rows, and render via ``__html_table`` or ``__desc_table``.

    Returns a one-element list with an HTML string when ``html`` is True,
    otherwise a list of per-row description strings.
    """
    # Pull caption boxes out first; their text is kept separately.
    cap = ""
    i = 0
    while i < len(boxes):
        if self.is_caption(boxes[i]):
            cap += boxes[i]["text"]
            boxes.pop(i)
            i -= 1
        i += 1

    if not boxes:
        return []
    for b in boxes:
        b["btype"] = self._blockType(b)
    # Dominant content type across all cells (e.g. "Nu" for numeric tables).
    max_type = Counter([b["btype"] for b in boxes]).items()
    max_type = max(max_type, key=lambda x: x[1])[0] if max_type else ""
    logging.debug("MAXTYPE: " + max_type)

    # ---- row clustering: sort by row-header position, then greedily
    # start a new row when the row label changes or the box starts below
    # the running bottom of the current row.
    rowh = [b["R_bott"] - b["R_top"] for b in boxes if "R" in b]
    rowh = np.min(rowh) if rowh else 0
    # boxes = self.sort_Y_firstly(boxes, rowh/5)
    boxes = self.sort_R_firstly(boxes, rowh / 2)
    boxes[0]["rn"] = 0
    rows = [[boxes[0]]]
    btm = boxes[0]["bottom"]
    for b in boxes[1:]:
        b["rn"] = len(rows) - 1
        lst_r = rows[-1]
        if lst_r[-1].get("R", "") != b.get("R", "") \
                or (b["top"] >= btm - 3 and lst_r[-1].get("R", "-1") != b.get("R", "-2")
                    ):  # new row
            btm = b["bottom"]
            b["rn"] += 1
            rows.append([b])
            continue
        # Same row: keep a running average bottom edge.
        btm = (btm + b["bottom"]) / 2.
        rows[-1].append(b)

    # ---- column clustering: analogous, keyed on the column label "C"
    # and the running right edge.  Cross-page tables can't rely on the
    # column-header coordinates, so fall back to plain X ordering.
    colwm = [b["C_right"] - b["C_left"] for b in boxes if "C" in b]
    colwm = np.min(colwm) if colwm else 0
    crosspage = len(set([b["page_number"] for b in boxes])) > 1
    if crosspage:
        boxes = self.sort_X_firstly(boxes, colwm / 2, False)
    else:
        boxes = self.sort_C_firstly(boxes, colwm / 2)
    boxes[0]["cn"] = 0
    cols = [[boxes[0]]]
    right = boxes[0]["x1"]
    for b in boxes[1:]:
        b["cn"] = len(cols) - 1
        lst_c = cols[-1]
        if (int(b.get("C", "1")) - int(lst_c[-1].get("C", "1")) == 1 and b["page_number"] == lst_c[-1][
                "page_number"]) \
                or (b["x0"] >= right and lst_c[-1].get("C", "-1") != b.get("C", "-2")):  # new col
            right = b["x1"]
            b["cn"] += 1
            cols.append([b])
            continue
        right = (right + b["x1"]) / 2.
        cols[-1].append(b)

    # Grid of cells: tbl[row][col] is a list of boxes.
    tbl = [[[] for _ in range(len(cols))] for _ in range(len(rows))]
    for b in boxes:
        tbl[b["rn"]][b["cn"]].append(b)

    if len(rows) >= 4:
        # Relocate columns that contain a single occupied cell whose
        # neighbours are empty — likely a mis-split cell.  Merge it into
        # the horizontally nearest neighbour column and drop the column.
        j = 0
        while j < len(tbl[0]):
            # e = occupied-cell count in column j; ii = its row index.
            e, ii = 0, 0
            for i in range(len(tbl)):
                if tbl[i][j]:
                    e += 1
                    ii = i
                if e > 1:
                    break
            if e > 1:
                j += 1
                continue
            # f/ff: the left/right neighbour cell has text (or is at edge).
            f = (j > 0 and tbl[ii][j - 1] and tbl[ii]
                 [j - 1][0].get("text")) or j == 0
            ff = (j + 1 < len(tbl[ii]) and tbl[ii][j + 1] and tbl[ii]
                  [j + 1][0].get("text")) or j + 1 >= len(tbl[ii])
            if f and ff:
                j += 1
                continue
            bx = tbl[ii][j][0]
            logging.debug("Relocate column single: " + bx["text"])
            # Column j has exactly one value; measure the horizontal gap
            # to the nearest occupied cell on each side.
            left, right = 100000, 100000
            if j > 0 and not f:
                for i in range(len(tbl)):
                    if tbl[i][j - 1]:
                        left = min(left, np.min(
                            [bx["x0"] - a["x1"] for a in tbl[i][j - 1]]))
            if j + 1 < len(tbl[0]) and not ff:
                for i in range(len(tbl)):
                    if tbl[i][j + 1]:
                        right = min(right, np.min(
                            [a["x0"] - bx["x1"] for a in tbl[i][j + 1]]))
            assert left < 100000 or right < 100000
            if left < right:
                # Merge into the left neighbour; shift cn of later cols.
                for jj in range(j, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j - 1]:
                    tbl[ii][j - 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j - 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)
            else:
                # Merge into the right neighbour.
                for jj in range(j + 1, len(tbl[0])):
                    for i in range(len(tbl)):
                        for a in tbl[i][jj]:
                            a["cn"] -= 1
                if tbl[ii][j + 1]:
                    tbl[ii][j + 1].extend(tbl[ii][j])
                else:
                    tbl[ii][j + 1] = tbl[ii][j]
                for i in range(len(tbl)):
                    tbl[i].pop(j)
            cols.pop(j)
    assert len(cols) == len(tbl[0]), "Column NO. miss matched: %d vs %d" % (
        len(cols), len(tbl[0]))

    if len(cols) >= 4:
        # Same relocation pass for single-cell rows (vertical analogue).
        i = 0
        while i < len(tbl):
            e, jj = 0, 0
            for j in range(len(tbl[i])):
                if tbl[i][j]:
                    e += 1
                    jj = j
                if e > 1:
                    break
            if e > 1:
                i += 1
                continue
            f = (i > 0 and tbl[i - 1][jj] and tbl[i - 1]
                 [jj][0].get("text")) or i == 0
            ff = (i + 1 < len(tbl) and tbl[i + 1][jj] and tbl[i + 1]
                  [jj][0].get("text")) or i + 1 >= len(tbl)
            if f and ff:
                i += 1
                continue
            bx = tbl[i][jj][0]
            logging.debug("Relocate row single: " + bx["text"])
            # Row i has exactly one value; pick the vertically nearest
            # occupied neighbour row to absorb it.
            up, down = 100000, 100000
            if i > 0 and not f:
                for j in range(len(tbl[i - 1])):
                    if tbl[i - 1][j]:
                        up = min(up, np.min(
                            [bx["top"] - a["bottom"] for a in tbl[i - 1][j]]))
            if i + 1 < len(tbl) and not ff:
                for j in range(len(tbl[i + 1])):
                    if tbl[i + 1][j]:
                        down = min(down, np.min(
                            [a["top"] - bx["bottom"] for a in tbl[i + 1][j]]))
            assert up < 100000 or down < 100000
            if up < down:
                for ii in range(i, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i - 1][jj]:
                    tbl[i - 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i - 1][jj] = tbl[i][jj]
                tbl.pop(i)
            else:
                for ii in range(i + 1, len(tbl)):
                    for j in range(len(tbl[ii])):
                        for a in tbl[ii][j]:
                            a["rn"] -= 1
                if tbl[i + 1][jj]:
                    tbl[i + 1][jj].extend(tbl[i][jj])
                else:
                    tbl[i + 1][jj] = tbl[i][jj]
                tbl.pop(i)
            rows.pop(i)

    # Header-row detection: a row is a header when >50% of its occupied
    # cells either carry an "H" flag or break the dominant numeric type.
    hdset = set([])
    for i in range(len(tbl)):
        cnt, h = 0, 0
        for j, arr in enumerate(tbl[i]):
            if not arr:
                continue
            cnt += 1
            if max_type == "Nu" and arr[0]["btype"] == "Nu":
                continue
            if any([a.get("H") for a in arr]) \
                    or (max_type == "Nu" and arr[0]["btype"] != "Nu"):
                h += 1
        if h / cnt > 0.5:
            hdset.add(i)

    if html:
        return [self.__html_table(cap, hdset,
                                  self.__cal_spans(boxes, rows,
                                                   cols, tbl, True)
                                  )]

    return self.__desc_table(cap, hdset,
                             self.__cal_spans(boxes, rows, cols, tbl, False))
def __html_table(self, cap, hdset, tbl):
    """Render the merged cell grid *tbl* as an HTML ``<table>`` string.

    ``hdset`` holds header row indices; ``None`` cells (swallowed by a
    span) are skipped, empty cells emit empty td/th elements.
    """
    # Construct HTML row by row.
    html = "<table>"
    if cap:
        html += f"<caption>{cap}</caption>"
    for i in range(len(tbl)):
        row = "<tr>"
        txts = []
        for j, arr in enumerate(tbl[i]):
            if arr is None:
                # Cell consumed by a rowspan/colspan — emit nothing.
                continue
            if not arr:
                row += "<td></td>" if i not in hdset else "<th></th>"
                continue
            txt = ""
            if arr:
                # Re-sort the cell's boxes top-to-bottom with a tolerance
                # of half the smallest box height (capped by page mean).
                h = min(np.min([c["bottom"] - c["top"] for c in arr]) / 2,
                        self.mean_height[arr[0]["page_number"] - 1] / 2)
                txt = "".join([c["text"]
                               for c in self.sort_Y_firstly(arr, h)])
            txts.append(txt)
            sp = ""
            if arr[0].get("colspan"):
                sp = "colspan={}".format(arr[0]["colspan"])
            if arr[0].get("rowspan"):
                sp += " rowspan={}".format(arr[0]["rowspan"])
            if i in hdset:
                row += f"<th {sp} >" + txt + "</th>"
            else:
                row += f"<td {sp} >" + txt + "</td>"

        if i in hdset:
            # NOTE(review): hdset starts as row indices (ints) but header
            # *texts* (strings) are added below — presumably to drop
            # repeated header rows (e.g. a table continued on the next
            # page re-printing its header). Confirm this mixing is
            # intentional before touching it.
            if all([t in hdset for t in txts]):
                continue
            for t in txts:
                hdset.add(t)

        if row != "<tr>":
            row += "</tr>"
        else:
            # Row produced no cells at all — omit it entirely.
            row = ""
        html += "\n" + row
    html += "\n</table>"
    return html
def __desc_table(self, cap, hdr_rowno, tbl):
    """Render the cell grid as natural-language row descriptions.

    Each data row becomes a "header:value; header:value" string; the
    caption, when present, is appended to every row.  ``hdr_rowno`` is
    the set of header row indices and is pruned in place when a header
    row turns out to be empty.
    """
    # Collect the text of every column in each header row.
    clmno = len(tbl[0])
    rowno = len(tbl)
    headers = {}
    hdrset = set()
    lst_hdr = []
    for r in sorted(list(hdr_rowno)):
        headers[r] = ["" for _ in range(clmno)]
        for i in range(clmno):
            if not tbl[r][i]:
                continue
            txt = "".join([a["text"].strip() for a in tbl[r][i]])
            headers[r][i] = txt
            hdrset.add(txt)
        if all([not t for t in headers[r]]):
            # Entirely empty header row: drop it from both structures.
            del headers[r]
            hdr_rowno.remove(r)
            continue
        # Inherit missing column headers from the previous header row.
        for j in range(clmno):
            if headers[r][j]:
                continue
            if j >= len(lst_hdr):
                break
            headers[r][j] = lst_hdr[j]
        lst_hdr = headers[r]
    # For consecutive header rows, concatenate parent and child header
    # text (joined with "的") so each column gets a full header path.
    for i in range(rowno):
        if i not in hdr_rowno:
            continue
        for j in range(i + 1, rowno):
            if j not in hdr_rowno:
                break
            for k in range(clmno):
                if not headers[j - 1][k]:
                    continue
                if headers[j][k].find(headers[j - 1][k]) >= 0:
                    continue
                if len(headers[j][k]) > len(headers[j - 1][k]):
                    headers[j][k] += ("的" if headers[j][k]
                                      else "") + headers[j - 1][k]
                else:
                    headers[j][k] = headers[j - 1][k] \
                        + ("的" if headers[j - 1][k] else "") \
                        + headers[j][k]

    logging.debug(
        f">>>>>>>>>>>>>>>>>{cap}:SIZE:{rowno}X{clmno} Header: {hdr_rowno}")

    row_txt = []
    for i in range(rowno):
        if i in hdr_rowno:
            continue
        rtxt = []

        def append(delimer):
            # Join the current row's fragments; glue short rows onto the
            # previous entry (< 64 chars combined) to avoid tiny chunks.
            nonlocal rtxt, row_txt
            rtxt = delimer.join(rtxt)
            if row_txt and len(row_txt[-1]) + len(rtxt) < 64:
                row_txt[-1] += "\n" + rtxt
            else:
                row_txt.append(rtxt)

        # r = index of the closest header row above row i (0 if none).
        r = 0
        if len(headers.items()):
            _arr = [(i - r, r) for r, _ in headers.items() if r < i]
            if _arr:
                _, r = min(_arr, key=lambda x: x[0])

        if r not in headers and clmno <= 2:
            # Headerless narrow table: emit "cell:cell" pairs directly.
            for j in range(clmno):
                if not tbl[i][j]:
                    continue
                txt = "".join([a["text"].strip() for a in tbl[i][j]])
                if txt:
                    rtxt.append(txt)
            if rtxt:
                append(":")
            continue

        for j in range(clmno):
            if not tbl[i][j]:
                continue
            txt = "".join([a["text"].strip() for a in tbl[i][j]])
            if not txt:
                continue
            ctt = headers[r][j] if r in headers else ""
            if ctt:
                ctt += ":"
            ctt += txt
            if ctt:
                rtxt.append(ctt)

        if rtxt:
            row_txt.append("; ".join(rtxt))
    if cap:
        # Append the table caption provenance to every row description.
        row_txt = [t + f"\t——来自“{cap}”" for t in row_txt]
    return row_txt
  1121. @staticmethod
  1122. def is_caption(bx):
  1123. patt = [
  1124. r"[图表]+[ 0-9::]{2,}"
  1125. ]
  1126. if any([re.match(p, bx["text"].strip()) for p in patt]) \
  1127. or bx["layout_type"].find("caption") >= 0:
  1128. return True
  1129. return False
  1130. def __extract_table_figure(self, need_image, ZM, return_html):
  1131. tables = {}
  1132. figures = {}
  1133. # extract figure and table boxes
  1134. i = 0
  1135. lst_lout_no = ""
  1136. nomerge_lout_no = []
  1137. while i < len(self.boxes):
  1138. if "layoutno" not in self.boxes[i]:
  1139. i += 1
  1140. continue
  1141. lout_no = str(self.boxes[i]["page_number"]) + \
  1142. "-" + str(self.boxes[i]["layoutno"])
  1143. if self.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
  1144. "figure caption", "reference"]:
  1145. nomerge_lout_no.append(lst_lout_no)
  1146. if self.boxes[i]["layout_type"] == "table":
  1147. if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
  1148. self.boxes.pop(i)
  1149. continue
  1150. if lout_no not in tables:
  1151. tables[lout_no] = []
  1152. tables[lout_no].append(self.boxes[i])
  1153. self.boxes.pop(i)
  1154. lst_lout_no = lout_no
  1155. continue
  1156. if need_image and self.boxes[i]["layout_type"] == "figure":
  1157. if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
  1158. self.boxes.pop(i)
  1159. continue
  1160. if lout_no not in figures:
  1161. figures[lout_no] = []
  1162. figures[lout_no].append(self.boxes[i])
  1163. self.boxes.pop(i)
  1164. lst_lout_no = lout_no
  1165. continue
  1166. i += 1
  1167. # merge table on different pages
  1168. nomerge_lout_no = set(nomerge_lout_no)
  1169. tbls = sorted([(k, bxs) for k, bxs in tables.items()],
  1170. key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
  1171. i = len(tbls) - 1
  1172. while i - 1 >= 0:
  1173. k0, bxs0 = tbls[i - 1]
  1174. k, bxs = tbls[i]
  1175. i -= 1
  1176. if k0 in nomerge_lout_no:
  1177. continue
  1178. if bxs[0]["page_number"] == bxs0[0]["page_number"]:
  1179. continue
  1180. if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
  1181. continue
  1182. mh = self.mean_height[bxs[0]["page_number"] - 1]
  1183. if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
  1184. continue
  1185. tables[k0].extend(tables[k])
  1186. del tables[k]
  1187. def x_overlapped(a, b):
  1188. return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])
  1189. # find captions and pop out
  1190. i = 0
  1191. while i < len(self.boxes):
  1192. c = self.boxes[i]
  1193. # mh = self.mean_height[c["page_number"]-1]
  1194. if not self.is_caption(c):
  1195. i += 1
  1196. continue
  1197. # find the nearest layouts
  1198. def nearest(tbls):
  1199. nonlocal c
  1200. mink = ""
  1201. minv = 1000000000
  1202. for k, bxs in tbls.items():
  1203. for b in bxs[:10]:
  1204. if b.get("layout_type", "").find("caption") >= 0:
  1205. continue
  1206. y_dis = self._y_dis(c, b)
  1207. x_dis = self._x_dis(
  1208. c, b) if not x_overlapped(
  1209. c, b) else 0
  1210. dis = y_dis * y_dis + x_dis * x_dis
  1211. if dis < minv:
  1212. mink = k
  1213. minv = dis
  1214. return mink, minv
  1215. tk, tv = nearest(tables)
  1216. fk, fv = nearest(figures)
  1217. if min(tv, fv) > 2000:
  1218. i += 1
  1219. continue
  1220. if tv < fv:
  1221. tables[tk].insert(0, c)
  1222. logging.debug(
  1223. "TABLE:" +
  1224. self.boxes[i]["text"] +
  1225. "; Cap: " +
  1226. tk)
  1227. else:
  1228. figures[fk].insert(0, c)
  1229. logging.debug(
  1230. "FIGURE:" +
  1231. self.boxes[i]["text"] +
  1232. "; Cap: " +
  1233. tk)
  1234. self.boxes.pop(i)
  1235. res = []
  1236. def cropout(bxs, ltype):
  1237. nonlocal ZM
  1238. pn = set([b["page_number"] - 1 for b in bxs])
  1239. if len(pn) < 2:
  1240. pn = list(pn)[0]
  1241. ht = self.page_cum_height[pn]
  1242. b = {
  1243. "x0": np.min([b["x0"] for b in bxs]),
  1244. "top": np.min([b["top"] for b in bxs]) - ht,
  1245. "x1": np.max([b["x1"] for b in bxs]),
  1246. "bottom": np.max([b["bottom"] for b in bxs]) - ht
  1247. }
  1248. louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
  1249. ii = self.__find_overlapped(b, louts, naive=True)
  1250. if ii is not None:
  1251. b = louts[ii]
  1252. else:
  1253. logging.warn(
  1254. f"Missing layout match: {pn + 1},%s" %
  1255. (bxs[0].get(
  1256. "layoutno", "")))
  1257. left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
  1258. return self.page_images[pn] \
  1259. .crop((left * ZM, top * ZM,
  1260. right * ZM, bott * ZM))
  1261. pn = {}
  1262. for b in bxs:
  1263. p = b["page_number"] - 1
  1264. if p not in pn:
  1265. pn[p] = []
  1266. pn[p].append(b)
  1267. pn = sorted(pn.items(), key=lambda x: x[0])
  1268. imgs = [cropout(arr, ltype) for p, arr in pn]
  1269. pic = Image.new("RGB",
  1270. (int(np.max([i.size[0] for i in imgs])),
  1271. int(np.sum([m.size[1] for m in imgs]))),
  1272. (245, 245, 245))
  1273. height = 0
  1274. for img in imgs:
  1275. pic.paste(img, (0, int(height)))
  1276. height += img.size[1]
  1277. return pic
  1278. # crop figure out and add caption
  1279. for k, bxs in figures.items():
  1280. txt = "\n".join(
  1281. [b["text"] for b in bxs
  1282. if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
  1283. and len(b["text"].strip()) >= 4
  1284. ]
  1285. )
  1286. if not txt:
  1287. continue
  1288. res.append(
  1289. (cropout(
  1290. bxs,
  1291. "figure"),
  1292. [txt] if not return_html else [f"<p>{txt}</p>"]))
  1293. for k, bxs in tables.items():
  1294. if not bxs:
  1295. continue
  1296. res.append((cropout(bxs, "table"),
  1297. self.__construct_table(bxs, html=return_html)))
  1298. return res
  1299. def proj_match(self, line):
  1300. if len(line) <= 2:
  1301. return
  1302. if re.match(r"[0-9 ().,%%+/-]+$", line):
  1303. return False
  1304. for p, j in [
  1305. (r"第[零一二三四五六七八九十百]+章", 1),
  1306. (r"第[零一二三四五六七八九十百]+[条节]", 2),
  1307. (r"[零一二三四五六七八九十百]+[、  ]", 3),
  1308. (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
  1309. (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
  1310. (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
  1311. (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
  1312. (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
  1313. (r".{,48}[::??]$", 9),
  1314. (r"[0-9]+)", 10),
  1315. (r"[\((][0-9]+[)\)]", 11),
  1316. (r"[零一二三四五六七八九十百]+是", 12),
  1317. (r"[⚫•➢✓]", 12)
  1318. ]:
  1319. if re.match(p, line):
  1320. return j
  1321. return
def __filterout_scraps(self, boxes, ZM):
    """Drop scrap boxes (page furniture, stray fragments) and join the
    remaining boxes into text chunks tagged with their page position.

    Boxes are greedily chained (``dfs``) into a "paragraph"; chains that
    are too narrow and not heading-like are discarded.  Each kept line
    gets a ``@@page\\tx0\\tx1\\ttop\\tbottom##`` position tag (parsed
    later by ``crop``/``remove_tag``).  Returns the joined text.
    """

    def line_tag(bx):
        # Position tag for box bx; a box spilling past a page's bottom
        # extends the tag over the following page numbers.
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

    def width(b):
        return b["x1"] - b["x0"]

    def height(b):
        return b["bottom"] - b["top"]

    def usefull(b):
        # A box is worth keeping when it has a layout type, is wider
        # than a third of the page, or is taller than the page's mean
        # character height.
        if b.get("layout_type"):
            return True
        if width(
                b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
            return True
        if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
            return True
        return False

    res = []
    while boxes:
        lines = []
        widths = []
        pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
        mh = self.mean_height[boxes[0]["page_number"] - 1]
        # mj: chain head looks like a heading / title.
        mj = self.proj_match(
            boxes[0]["text"]) or boxes[0].get(
            "layout_type",
            "") == "title"

        def dfs(line, st):
            # Greedily extend the chain with the next nearby useful box
            # on the same page (looks at most 20 boxes ahead).
            nonlocal mh, pw, lines, widths
            lines.append(line)
            widths.append(width(line))
            width_mean = np.mean(widths)
            mmj = self.proj_match(
                line["text"]) or line.get(
                "layout_type",
                "") == "title"
            for i in range(st + 1, min(st + 20, len(boxes))):
                if (boxes[i]["page_number"] - line["page_number"]) > 0:
                    break
                # Non-heading line far below a short line ends the chain.
                if not mmj and self._y_dis(
                        line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                    break

                if not usefull(boxes[i]):
                    continue
                if mmj or \
                        (self._x_dis(boxes[i], line) < pw / 10): \
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                    # concat following
                    dfs(boxes[i], i)
                    boxes.pop(i)
                    break

        try:
            if usefull(boxes[0]):
                dfs(boxes[0], 0)
            else:
                logging.debug("WASTE: " + boxes[0]["text"])
        except Exception as e:
            # Deliberate best-effort: a failure while chaining (e.g.
            # recursion depth) just abandons this chain.
            pass
        boxes.pop(0)
        mw = np.mean(widths)
        # Keep heading chains, or chains wide enough to be real content.
        if mj or mw / pw >= 0.35 or mw > 200:
            res.append("\n".join([c["text"] + line_tag(c) for c in lines]))
        else:
            logging.debug("REMOVED: " +
                          "<<".join([c["text"] for c in lines]))

    return "\n\n".join(res)
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
    """Run the full parsing pipeline on a PDF.

    Args:
        fnm: path to the PDF, or its raw bytes.
        need_image: extract figure images as well as tables.
        zoomin: render zoom factor (72 * zoomin dpi).
        return_html: render extracted tables as HTML.

    Returns:
        (text, tbls): the filtered, position-tagged document text and
        the list of (image, content) pairs from table/figure extraction.
    """
    self.pdf = pdfplumber.open(fnm) if isinstance(
        fnm, str) else pdfplumber.open(BytesIO(fnm))
    self.lefted_chars = []
    self.mean_height = []     # per-page median char height
    self.mean_width = []      # per-page median char width
    self.boxes = []
    self.garbages = {}
    self.page_cum_height = [0]  # cumulative page heights for global Y
    self.page_layout = []
    # Render at most the first 299 pages to images.
    self.page_images = [p.to_image(
        resolution=72 * zoomin).annotated for i, p in enumerate(self.pdf.pages[:299])]
    logging.info("Images converted.")
    logging.info("Table processed.")

    for i, img in enumerate(self.page_images):
        # Ignore invisible (colorless) characters.
        chars = [c for c in self.pdf.pages[i].chars if self._has_color(c)]
        self.mean_height.append(
            np.median(sorted([c["height"] for c in chars])) if chars else 0
        )
        self.mean_width.append(
            np.median(sorted([c["width"] for c in chars])) if chars else 8
        )
        if i > 0:
            if not chars:
                self.page_cum_height.append(img.size[1] / zoomin)
            else:
                self.page_cum_height.append(
                    np.max([c["bottom"] for c in chars]))
        self.__ocr_paddle(i + 1, img, chars, zoomin)

    self.__layouts_paddle(zoomin)
    self.page_cum_height = np.cumsum(self.page_cum_height)
    assert len(self.page_cum_height) == len(self.page_images)

    # Text repeated on more than one page (headers/footers) is garbage.
    garbage = set()
    for k in self.garbages.keys():
        self.garbages[k] = Counter(self.garbages[k])
        for g, c in self.garbages[k].items():
            if c > 1:
                garbage.add(g)
    logging.debug("GARBAGE:" + ",".join(garbage))
    self.boxes = [b for b in self.boxes if b["text"] not in garbage]

    # Convert per-page Y coordinates to document-global (cumulative) Y.
    for i in range(len(self.boxes)):
        self.boxes[i]["top"] += \
            self.page_cum_height[self.boxes[i]["page_number"] - 1]
        self.boxes[i]["bottom"] += \
            self.page_cum_height[self.boxes[i]["page_number"] - 1]

    self.__table_transformer_job(zoomin)
    self.__text_merge(garbage)
    self.__filter_forpages()
    tbls = self.__extract_table_figure(need_image, zoomin, return_html)

    return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
  1446. def remove_tag(self, txt):
  1447. return re.sub(r"@@[\t0-9.-]+?##", "", txt)
def crop(self, text, ZM=3):
    """Crop the page-image regions referenced by the position tags in
    *text* and stack them into one image (or None if no tags).

    Each ``@@pages\\tx0\\tx1\\ttop\\tbottom##`` tag yields one crop per
    page it spans; crops are pasted vertically with a small gap.
    """
    imgs = []
    for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
        pn, left, right, top, bottom = tag.strip(
            "#").strip("@").split("\t")
        left, right, top, bottom = float(left), float(
            right), float(top), float(bottom)
        bottom *= ZM
        pns = [int(p) - 1 for p in pn.split("-")]
        # Extend bottom into pixel space across the spanned pages.
        for pn in pns[1:]:
            bottom += self.page_images[pn - 1].size[1]
        # Crop on the first page (clamped to the page height).
        imgs.append(
            self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                           right *
                                           ZM, min(
                                               bottom, self.page_images[pns[0]].size[1])
                                           ))
        )
        bottom -= self.page_images[pns[0]].size[1]
        # Continuation crops on each following page, starting at y=0.
        for pn in pns[1:]:
            imgs.append(
                self.page_images[pn].crop((left * ZM, 0,
                                           right * ZM,
                                           min(bottom,
                                               self.page_images[pn].size[1])
                                           ))
            )
            bottom -= self.page_images[pn].size[1]

    if not imgs:
        return
    # Stack all crops vertically on a light-grey canvas.
    GAP = 2
    height = 0
    for img in imgs:
        height += img.size[1] + GAP
    height = int(height)
    pic = Image.new("RGB",
                    (int(np.max([i.size[0] for i in imgs])), height),
                    (245, 245, 245))
    height = 0
    for img in imgs:
        pic.paste(img, (0, int(height)))
        height += img.size[1] + GAP
    return pic
if __name__ == "__main__":
    # No CLI entry point; this module is intended to be imported.
    pass