
# -*- coding: utf-8 -*-
import os
import random
import fitz
import xgboost as xgb
from io import BytesIO
import torch
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
import numpy as np
from PyPDF2 import PdfReader as pdf2_read

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie
from copy import deepcopy
from huggingface_hub import hf_hub_download, snapshot_download

logging.getLogger("pdfminer").setLevel(logging.WARNING)

class HuParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        try:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(
                    get_project_base_directory(),
                    "rag/res/deepdoc"),
                local_files_only=True)
        except Exception as e:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0")

        self.updown_cnt_mdl.load_model(os.path.join(
            model_dir, "updown_concat_xgb.model"))
        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!

        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com

        For Windows:
        Good luck
        ^_-
        """

    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // len(c["text"])

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(
            self, a, b):
        return (
            b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\(（][零一二三四五六七八九十百]+[）\)]",
            r"[\(（][0-9]+[）\)]",
            r"[0-9]+(、|\.[  ]|）|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])

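    # Feature vector for the XGBoost "up-down concatenation" model: each entry
    # compares the upper box `up` with the lower box `down` (same table row,
    # vertical gap in line heights, layout types, punctuation at the joint,
    # tokenization across the boundary, etc.). The model scores whether the
    # two boxes should be merged into one passage.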
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。？！；!?;+）)]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[，：‘“、0-9（+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\]，。；：’”？！》】）-])",
                down["text"]) else False,
            True if re.match(r"[\(（][^\(\)（）]+[）\)]$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\(（][^\)）]+$", up["text"])
            and re.search(r"[\)）]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
                                                                        ) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order when x0 is (nearly) the same
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True

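    # Run the table-structure recognizer over every "table" region found by
    # the layout model: crop each table (with a small margin) from its page
    # image, detect rows/headers/columns/spanning cells, map those components
    # back to page coordinates, and tag the text boxes inside each table with
    # R (row), H (header), C (column) and SP (spanning cell).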
    def _table_transformer_job(self, ZM):
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii

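    # OCR one page: detect text regions on the rendered image, then fill each
    # region either with the PDF's own characters (when they overlap) or, as
    # a fallback, with text recognized from the cropped image region itself.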
    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_X_firstly(
                chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # shift to cumulative Y coordinates across pages
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

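    # Horizontally merge adjacent boxes that the layout model assigned to the
    # same layout element, so a line split into several OCR fragments becomes
    # one box again. Tables, figures and equations are left untouched.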
    def _text_merge(self):
        # merge adjusted boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent box with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
                                                                                                 "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)
                   ) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue

            # NOTE: the code below is unreachable because of the
            # unconditional `continue` above.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, "，") or start_with(b_, "（，"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs

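    # Simple rule-based vertical merge: walk the boxes in reading order and
    # join a box with the one below it unless punctuation, layout or spacing
    # features say they belong to different passages.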
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\"，、‘“；：-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\"，‘“、；：",
                b_["text"].strip() and b_["text"].strip()[0] in "。；？！?”）),，、：",
            ]
            # features for not concating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。？！?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                print(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                    any(detach_feats))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs

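    # Model-driven vertical concatenation: first count how many boxes share a
    # row with each box (used as an input feature), then greedily chain boxes
    # downward, letting the XGBoost up-down model accept or reject each join,
    # and finally flatten every chain back into a single box.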
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != "，":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)

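    # Drop table-of-contents / acknowledgement pages: when a heading such as
    # "contents" or "目录" is found, remove it together with the entries that
    # follow; otherwise fall back to discarding pages dominated by dot leaders.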
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue

            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or huqie.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)

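    # Pull table and figure boxes out of the text flow, merge tables that span
    # consecutive pages, attach the nearest caption to each table/figure, crop
    # the regions out of the page images, and return (image, content) pairs
    # (tables additionally rendered to text or HTML).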
    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
                                                                                                      "title",
                                                                                                      "figure caption",
                                                                                                      "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge table on different pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []
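        # cropout crops the region of one table/figure from its page
        # image(s); when the boxes span several pages it recurses per page
        # and stacks the crops vertically on a light-grey canvas.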
        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1},%s" %
                        (bxs[0].get(
                            "layoutno", "")))

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figure out and add caption
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res

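    # proj_match classifies a line as a heading/bullet level (1-12) by
    # pattern, returning None for plain text and False for purely numeric
    # junk lines.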
    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\(（][零一二三四五六七八九十百]+[）\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[：:？?]$", 9),
            (r"[0-9]+）", 10),
            (r"[\(（][0-9]+[）\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return

    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

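    # Assemble the final text: group boxes into candidate passages via a DFS
    # over nearby boxes, keep groups that look like headings or are wide
    # enough, tag each kept line with its position, and join everything.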
    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break

                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10): \
                            # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception as e:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception as e:
            pdf = fitz.open(fnm) if not binary else fitz.open(
                stream=binary, filetype="pdf")
            return len(pdf)

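    # Render pages to images and collect per-page characters: prefer
    # pdfplumber (native text plus color info), fall back to PyMuPDF
    # rendering when that fails, read the outline via PyPDF2, guess whether
    # the document is English, and OCR every page while accumulating the
    # layout statistics (mean char height/width, cumulative page heights).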
    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception as e:
            self.pdf = fitz.open(fnm) if isinstance(
                fnm, str) else fitz.open(
                stream=fnm, filetype="pdf")
            self.page_images = []
            self.page_chars = []
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for i, page in enumerate(self.pdf):
                if i < page_from:
                    continue
                if i >= page_to:
                    break
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
                self.page_images.append(img)
                self.page_chars.append([])

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")

        logging.info("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1
            # if i > 0:
            #     if not chars:
            #         self.page_cum_height.append(img.size[1] / zoomin)
            #     else:
            #         self.page_cum_height.append(
            #             np.max([c["bottom"] for c in chars]))
            self.__ocr(i + 1, img, chars, zoomin)
            if callback:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

        logging.info("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1

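    # Full parsing pipeline: render and OCR the pages, recognize the layout,
    # analyze table structure, merge text boxes, drop TOC pages, extract
    # tables and figures, then filter scraps into the final tagged text.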
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)

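    # crop re-assembles the image evidence for a tagged chunk: it parses the
    # @@page\tx0\tx1\ttop\tbottom## tags, adds a little context above and
    # below, crops the regions from the page images and stacks them (the
    # leading/trailing context strips are darkened). It can optionally also
    # return the positions of the cropped regions.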
    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right *
                                               ZM, min(
                                                   bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss

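
# PlainParser is the lightweight fallback: no OCR or layout analysis, just
# PyPDF2 text extraction line by line plus the outline, with the same return
# shape as HuParser so callers can swap one for the other.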
class PlainParser(object):
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")

        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError


if __name__ == "__main__":
    pass