# pdf_parser.py
# -*- coding: utf-8 -*-
import os
import random
import re
import logging
from copy import deepcopy
from io import BytesIO
from timeit import default_timer as timer

import fitz
import numpy as np
import pdfplumber
import torch
import xgboost as xgb
from huggingface_hub import snapshot_download
from PIL import Image, ImageDraw
from PyPDF2 import PdfReader as pdf2_read

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer

logging.getLogger("pdfminer").setLevel(logging.WARNING)
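

# RAGFlowPdfParser turns a PDF into layout-aware text chunks plus extracted
# tables and figures. The __call__ pipeline below is: render pages to images
# and OCR them (__images__), detect layout regions (_layouts_rec), run table
# structure recognition (_table_transformer_job), merge text boxes
# horizontally (_text_merge) and vertically (_concat_downward, guided by the
# XGBoost up/down concatenation model), drop table-of-contents pages
# (_filter_forpages), and finally crop out tables and figures
# (_extract_table_figure).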
class RAGFlowPdfParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!

        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com

        For Windows:
        Good luck
        ^_-
        """

    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\((][零一二三四五六七八九十百]+[)\)]",
            r"[\((][0-9]+[)\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])

    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # NOTE: the feature below duplicates the one above; it is kept as-is
            # because the trained XGBoost model expects this exact feature
            # count and order.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order of boxes whose x0 is within the threshold
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True

    def _table_transformer_job(self, ZM):
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tags to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii

    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_X_firstly(
                chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") \
                    or b.get("layout_type", "") in ["table", "figure", "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)
                   ) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue

            # NOTE: the distance-based merge below is unreachable; the
            # `continue` above always fires first, so this branch is disabled.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs

    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9  •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\",‘“、;:",
                b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("{} {} {} {} {}".format(
                    b["text"], b_["text"], any(feats),
                    any(concatting_feats), any(detach_feats)))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs

    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][0]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)

    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):  # dot leaders typical of TOC lines
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuiopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)

    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) \
                    or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                        "figure caption", "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables that span different pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop them out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning("Missing layout match: %d, %s",
                                    pn + 1, bxs[0].get("layoutno", ""))

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and attach captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[::??]$", 9),
            (r"[0-9]+)", 10),
            (r"[\((][0-9]+[)\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
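
    # Position tags emitted below have the form
    # "@@<pages>\t<x0>\t<x1>\t<top>\t<bottom>##", where <pages> is a
    # dash-separated list of 1-based page numbers the box spans.
    # crop() parses these tags back into page crops, and remove_tag()
    # strips them from chunk text.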
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        page_images_cnt = len(self.page_images)
        if pn[-1] - 1 >= page_images_cnt:
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
            if pn[-1] - 1 >= page_images_cnt:
                return ""
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def useful(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not useful(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if useful(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception:
            pdf = fitz.open(fnm) if not binary else fitz.open(
                stream=binary, filetype="pdf")
            return len(pdf)

    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        st = timer()
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception:
            self.pdf = fitz.open(fnm) if isinstance(
                fnm, str) else fitz.open(
                stream=fnm, filetype="pdf")
            self.page_images = []
            self.page_chars = []
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for i, page in enumerate(self.pdf):
                if i < page_from:
                    continue
                if i >= page_to:
                    break
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
                self.page_images.append(img)
                self.page_chars.append([])

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")

        logging.info("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        st = timer()
        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1

            self.__ocr(i + 1, img, chars, zoomin)
            if callback and i % 6 == 5:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
        # print("OCR:", timer() - st)

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.info("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1

    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)

    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM,
                                               min(bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss
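

# PlainParser is a lightweight fallback: it pulls raw text lines and the
# outline straight out of PyPDF2, with no OCR, layout analysis, or table
# structure recognition.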
class PlainParser(object):
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend(page.extract_text().split("\n"))

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")

        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError
  1070. if __name__ == "__main__":
  1071. pass
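
    # Minimal usage sketch (illustrative, not part of the original file):
    # assumes the deepdoc OCR/layout/table model resources are available
    # locally, and takes a PDF path on the command line.
    import sys

    if len(sys.argv) > 1:
        parser = RAGFlowPdfParser()
        txt, tbls = parser(sys.argv[1], need_image=False, return_html=True)
        print(parser.remove_tag(txt))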