# -*- coding: utf-8 -*-
# pdf_parser.py
import os
import random
import fitz
import xgboost as xgb
from io import BytesIO
import torch
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
import numpy as np
from PyPDF2 import PdfReader as pdf2_read

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie
from copy import deepcopy
from huggingface_hub import snapshot_download

logging.getLogger("pdfminer").setLevel(logging.WARNING)
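

# HuParser drives the vision-based PDF pipeline: it rasterizes pages, runs
# OCR plus layout and table-structure recognition, merges text boxes into
# passages, and emits position-tagged text together with cropped tables and
# figures.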
class HuParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception as e:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // len(c["text"])

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        # bullet/heading patterns ("第...章", "(1)", "1.", "①", ...); the
        # character classes mix ASCII and full-width variants on purpose
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是 　]",
            r"[\(（][零一二三四五六七八九十百]+[）\)]",
            r"[\(（][0-9]+[）\)]",
            r"[0-9]+(、|\.[ 　]|）|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[ 　])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
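
    # Feature vector for the XGBoost "up/down concat" model: row/layout
    # agreement, vertical gap relative to line height, punctuation at the
    # junction, token statistics, etc. The order must stay exactly as the
    # model was trained on it.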
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。？！；!?;+）)]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[，：‘“、0-9（+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,？;:\]，。；：’”？！》】）-])",
                down["text"]) else False,
            True if re.match(r"[\(（][^\(\)（）]+[）\)]$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\(（][^）)]+$", up["text"])
            and re.search(r"[）)]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1
            else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order when x0 is nearly the same
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True
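
    # Run the table-structure recognizer over every table region found by the
    # layout model, then tag each text box inside a table with its row (R),
    # header (H), column (C) and spanning-cell (SP) indices.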
    def _table_transformer_job(self, ZM):
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tags to boxes within a table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
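
    # Fuse OCR detection with pdfplumber's characters: characters are poured
    # into the detected text rectangles; rectangles that stay empty are
    # re-recognized straight from the page image.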
    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_X_firstly(
                chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
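
    # Run layout recognition on all pages, then lift every box into a single
    # global coordinate system by adding the cumulative page height.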
    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
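
    # Horizontally merge neighboring boxes that belong to the same layout
    # element and sit on (nearly) the same line.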
    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
                                                                                                 "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)
                   ) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue

            # NOTE: the block below is unreachable (dead code left over from
            # an earlier, punctuation-aware merging strategy).
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
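
    # Stitch vertically adjacent boxes into paragraphs using punctuation and
    # geometry heuristics only (no model involved).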
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\",‘“、;:",
                b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                print(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                    any(detach_feats))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
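
    # Model-driven merging: count how many neighbors share each box's row,
    # then greedily chain boxes downward whenever the XGBoost model scores a
    # pair of fragments as belonging to the same passage.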
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
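
    # Drop table-of-contents / acknowledgement material: remove the heading,
    # the entries sharing its prefix, and any page dominated by dot-leader
    # lines.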
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1
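
    # Merge consecutive boxes that start with the same bullet character, so a
    # bullet item and its wrapped lines become one box.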
    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuiopasdfghjklzxcvbnm") \
                    or huqie.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
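
    # Pull table and figure boxes out of the text stream, merge tables that
    # continue across pages, attach the nearest caption to each, and return
    # the cropped images (tables optionally rendered as HTML) along with
    # their positions.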
    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
                                                                                                      "title",
                                                                                                      "figure caption",
                                                                                                      "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables that continue on the following page
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop them out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                # single-page element: crop it straight from that page
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1}, {bxs[0].get('layoutno', '')}")

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))

            # multi-page element: crop each page separately and stack the crops
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and add captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res
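
    # Classify a line as a heading/bullet level (1-12) from its numbering
    # pattern; returns None when the line does not look like a heading.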
    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、 　]", 3),
            (r"[\(（][零一二三四五六七八九十百]+[）\)]", 4),
            (r"[0-9]+(、|\.[ 　]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[. 　]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ 　]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ 　]|[^0-9])", 8),
            (r".{,48}[：:？?]$", 9),
            (r"[0-9]+）", 10),
            (r"[\(（][0-9]+[）\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
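
    # Encode a box's position as an inline tag
    # "@@page-...\tx0\tx1\ttop\tbottom##" appended to its text; crop() parses
    # these tags back later.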
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        if pn[-1] - 1 >= len(self.page_images):
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)
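
    # Group leftover boxes into candidate passages and keep only those that
    # are wide enough, tall enough, or start like a heading; the rest is
    # logged and dropped as scrap.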
    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception as e:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)
    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception as e:
            pdf = fitz.open(fnm) if not binary else fitz.open(
                stream=binary, filetype="pdf")
            return len(pdf)
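
    # Rasterize the pages (pdfplumber first, PyMuPDF as the fallback),
    # collect per-page characters, read the outline, guess whether the
    # document is English, and OCR every page image.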
    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception as e:
            self.pdf = fitz.open(fnm) if isinstance(
                fnm, str) else fitz.open(
                stream=fnm, filetype="pdf")
            self.page_images = []
            self.page_chars = []
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for i, page in enumerate(self.pdf):
                if i < page_from:
                    continue
                if i >= page_to:
                    break
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
                self.page_images.append(img)
                self.page_chars.append([])

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")

        logging.info("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1
            # if i > 0:
            #     if not chars:
            #         self.page_cum_height.append(img.size[1] / zoomin)
            #     else:
            #         self.page_cum_height.append(
            #             np.max([c["bottom"] for c in chars]))
            self.__ocr(i + 1, img, chars, zoomin)
            if callback:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.info("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
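
    # The full pipeline: rasterize + OCR, layout analysis, table structure,
    # box merging, TOC filtering and table/figure extraction; returns the
    # position-tagged text and the extracted tables/figures.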
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
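
    # Rebuild an image strip for a tagged chunk of text: parse the position
    # tags, crop each referenced page region (plus dimmed context above and
    # below), and stack the crops vertically.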
    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM, min(
                                                   bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic
    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss
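

# PlainParser is the no-vision fallback: it extracts only the embedded text
# and the outline via PyPDF2, with no OCR, layout analysis, or cropping.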
class PlainParser(object):
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")

        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError


if __name__ == "__main__":
    pass
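    # A minimal usage sketch (hypothetical path; assumes the OCR/layout/table
    # model assets that HuParser loads on first use are available locally):
    #
    #   parser = HuParser()
    #   sections, tbls = parser("/path/to/some.pdf", need_image=False)
    #   print(parser.remove_tag(sections))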