
pdf_parser.py

# -*- coding: utf-8 -*-
import os
import random
import fitz
import xgboost as xgb
from io import BytesIO
import torch
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
import numpy as np
from PyPDF2 import PdfReader as pdf2_read

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie
from copy import deepcopy
from huggingface_hub import snapshot_download

logging.getLogger("pdfminer").setLevel(logging.WARNING)


class HuParser:
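    """
    Vision-based PDF parser: pages are rendered to images and OCR'd, a
    layout model and a table-structure model label the regions, and the
    recognized text boxes are merged back into reading order. Whether two
    vertically adjacent boxes belong together is decided by a pretrained
    XGBoost model (updown_concat_xgb.model).
    """
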
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """

    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // len(c["text"])

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        # does the text look like a numbered heading / list item?
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\((][零一二三四五六七八九十百]+[)\)]",
            r"[\((][0-9]+[)\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])

    def _updown_concat_features(self, up, down):
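        """
        Build the feature vector fed to the XGBoost model that decides
        whether `up` and `down` should be concatenated vertically. The
        features mix geometry (heights, distances), layout labels,
        punctuation cues, and huqie tokenization around the join point.
        The length and order of this list must match what the pretrained
        model expects, so entries must not be added or removed.
        """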
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # this feature appears twice in the original list; both copies
            # are kept so the vector length matches the pretrained model
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1
            else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) /
            min(self.__height(up), self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threshold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order when x0 is within the threshold
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    arr[j], arr[j + 1] = arr[j + 1], arr[j]
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True

    def _table_transformer_job(self, ZM):
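        """
        Run the table-structure recognizer on every table region found by
        the layout model. Each table is cropped with a small margin at zoom
        factor ZM, the detected components are mapped back to page
        coordinates, and every text box inside a table layout is tagged
        with the row (R), header (H), column (C) and spanning cell (SP) it
        falls into.
        """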
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tags to boxes within table layouts
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]
            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii
            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]
            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii

    def __ocr(self, pagenum, img, chars, ZM=3):
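        """
        Detect text rectangles on the page image with the OCR detector,
        then pour the pdfplumber characters into them. Characters that fit
        no rectangle (or whose height differs too much from it) go to
        self.lefted_chars; rectangles left without text are recognized
        from pixels instead. The page's boxes are appended to self.boxes.
        """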
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_X_firstly(
                chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * ZM, \
                    b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y: shift boxes into document-level coordinates
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

    def _text_merge(self):
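        """
        Horizontally merge adjacent boxes that belong to the same layout
        unit and sit on roughly the same baseline, skipping tables,
        figures and equations.
        """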
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") \
                    or b.get("layout_type", "") in ["table", "figure", "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)
                   ) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue

            # NOTE: everything below is unreachable because of the
            # `continue` above; it is dead code in the current flow.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs

    def _naive_vertical_merge(self):
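        """
        Rule-based vertical merge: drop page-number-like and empty boxes,
        then concatenate each box with the next unless ending punctuation,
        a large vertical gap, or horizontal detachment argues against it.
        """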
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\",‘“、;:",
                b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("{} {} {} {} {}".format(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                    any(detach_feats)))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs

    def _concat_downward(self, concat_between_pages=True):
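        """
        Model-driven vertical concatenation. First count, for every box,
        how many neighbours share its row ("in_row"), then greedily chain
        boxes downward: each candidate pair is scored by the XGBoost model
        over _updown_concat_features, and accepted chains are flattened
        into single text blocks.
        """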
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * mw \
                            or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)

    def _filter_forpages(self):
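        """
        Strip front matter: find a table-of-contents or acknowledgements
        heading and drop it together with the entry lines sharing its
        prefix; failing that, drop pages dominated by dot-leader lines,
        which are typical of TOC pages.
        """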
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            # merge consecutive boxes that start with the same bullet char
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuiopasdfghjklzxcvbnm") \
                    or huqie.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)

    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
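        """
        Pull table and figure boxes out of self.boxes, merge tables that
        continue across pages, attach the nearest caption to each table or
        figure, and crop the corresponding page-image regions. Returns
        (image, content) pairs, optionally zipped with positions; table
        content is rebuilt by the table-structure recognizer as text or
        HTML.
        """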
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) \
                    or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                        "figure caption", "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables split across pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1}, {bxs[0].get('layoutno', '')}")
                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            # multi-page region: crop each page separately, then stack
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and attach captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[::??]$", 9),
            (r"[0-9]+)", 10),
            (r"[\((][0-9]+[)\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return

    def _line_tag(self, bx, ZM):
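        """
        Encode a box position as a trailing text tag of the form
        "@@page[-page...]\tx0\tx1\ttop\tbottom##", so crop() can cut the
        region back out of the page images and remove_tag() can strip it.
        """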
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

    def __filterout_scraps(self, boxes, ZM):
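        """
        Assemble the final text: group boxes into runs, keep runs that look
        like real content (a title layout, a numbered heading, or wide
        enough lines), tag each line with its position via _line_tag, and
        join everything into one string.
        """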
        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def useful(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not useful(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if useful(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception:
            pdf = fitz.open(fnm) if not binary else fitz.open(
                stream=binary, filetype="pdf")
            return len(pdf)

    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
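        """
        Render pages [page_from, page_to) to images (pdfplumber first,
        PyMuPDF as a fallback), collect colored characters per page, read
        the outline via PyPDF2, guess whether the document is English,
        compute per-page median character height/width, and OCR every page
        into self.boxes.
        """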
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception:
            self.pdf = fitz.open(fnm) if isinstance(
                fnm, str) else fitz.open(
                stream=fnm, filetype="pdf")
            self.page_images = []
            self.page_chars = []
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for i, page in enumerate(self.pdf):
                if i < page_from:
                    continue
                if i >= page_to:
                    break
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
                self.page_images.append(img)
                self.page_chars.append([])

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")

        logging.info("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1
            # if i > 0:
            #     if not chars:
            #         self.page_cum_height.append(img.size[1] / zoomin)
            #     else:
            #         self.page_cum_height.append(
            #             np.max([c["bottom"] for c in chars]))
            self.__ocr(i + 1, img, chars, zoomin)
            if callback:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))

        logging.info(f"Is it English: {self.is_english}")

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1

    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
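        """
        Full pipeline: render and OCR the pages, run layout and
        table-structure recognition, merge boxes, filter front matter, and
        extract tables/figures. Returns (tagged_text, tables).
        """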
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)

    def crop(self, text, ZM=3, need_position=False):
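        """
        Given text carrying @@...## position tags (see _line_tag), crop the
        tagged strips from the page images and stitch them vertically, with
        a dimmed strip of context above and below. Returns the combined
        image, plus the strip positions when need_position is True.
        """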
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM,
                                               min(bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                # dim the leading/trailing context strips
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP
        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss


class PlainParser(object):
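    """
    Fallback parser with no vision models: extracts plain text page by
    page with PyPDF2 and reads the outline. Returns one (line, "") pair
    per extracted line; crop() and remove_tag() are unsupported because
    there is no layout information to work with.
    """
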
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")
        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError
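
# Example usage, as a minimal sketch: it assumes the deepdoc/rag runtime,
# the model weights and a local "some.pdf" are available.
#
#     parser = HuParser()
#     text, tbls = parser("some.pdf", need_image=True, zoomin=3,
#                         return_html=True)
#     print(parser.remove_tag(text))   # position tags stripped
#     pic = parser.crop(text)          # stitched crops of the tagged areas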

if __name__ == "__main__":
    pass