#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import os
import random
import xgboost as xgb
from io import BytesIO
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
import numpy as np
from timeit import default_timer as timer
from pypdf import PdfReader as pdf2_read

from api.settings import LIGHTEN
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
from copy import deepcopy
from huggingface_hub import snapshot_download

logging.getLogger("pdfminer").setLevel(logging.WARNING)
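

# Overview (editorial note): RAGFlowPdfParser combines pdfplumber text
# extraction with OCR, a layout recognizer, a table-structure recognizer, and
# an XGBoost classifier that decides whether vertically adjacent text boxes
# should be concatenated.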
class RAGFlowPdfParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if not LIGHTEN:
            import torch
            if torch.cuda.is_available():
                self.updown_cnt_mdl.set_param({"device": "cuda"})
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\((][零一二三四五六七八九十百]+[)\)]",
            r"[\((][0-9]+[)\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
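
    # The feature vector below feeds self.updown_cnt_mdl (XGBoost). Each entry
    # describes the geometry, layout labels, punctuation, and tokenization of an
    # "up" box and the "down" box that may be concatenated beneath it; the order
    # and length of this list must match what the pretrained model was trained on.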
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # NOTE: the feature above appears twice; it is kept as-is so the
            # feature-vector length matches what the pretrained model expects.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1
            else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) /
            min(self.__height(up), self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0,
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order when x0 is within the threshold
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True

    def _table_transformer_job(self, ZM):
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, portion=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, portion)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R, H, C, SP tags to boxes within table layouts
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars into the same rect
        for c in Recognizer.sort_Y_firstly(
                chars, self.mean_height[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") \
                    or b.get("layout_type", "") in ["table", "figure", "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)
                   ) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue
            # NOTE: the distance-threshold merging below is unreachable
            # (kept verbatim from the original source).
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
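
    # _naive_vertical_merge stitches consecutive boxes top-to-bottom using simple
    # punctuation and geometry heuristics, without consulting the XGBoost model.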
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\",‘“、;:",
                b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("%s %s %s %s %s" % (
                    b["text"], b_["text"], any(feats),
                    any(concatting_feats), any(detach_feats)))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs

    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break

                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue

                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue

                    if not down["text"].strip() or not up["text"].strip():
                        i += 1
                        continue

                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue

                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
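
    # _filter_forpages drops table-of-contents / acknowledgement pages: it looks
    # for a "contents"-like heading, removes the entries that follow it, and also
    # discards pages dominated by dot-leader lines.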
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for _ in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue

            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuiopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
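
    # _extract_table_figure pulls table and figure boxes out of self.boxes,
    # merges tables that continue across pages, attaches the nearest caption to
    # each table/figure, and returns cropped images (plus HTML for tables).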
    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) \
                    or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                        "figure caption", "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables that span different pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop them out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1}, {bxs[0].get('layoutno', '')}")

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                if right < left:
                    right = left + 1
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and add captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue

            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)

        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[::??]$", 9),
            (r"[0-9]+\)", 10),
            (r"[\((][0-9]+[)\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
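
    # _line_tag encodes a box's position as "@@<pages>\t<x0>\t<x1>\t<top>\t<bottom>##",
    # e.g. "@@3\t52.0\t510.3\t120.1\t135.8##" (values illustrative); crop() parses
    # these tags back, and remove_tag() strips them from the final text.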
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        page_images_cnt = len(self.page_images)
        if pn[-1] - 1 >= page_images_cnt:
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
            if pn[-1] - 1 >= page_images_cnt:
                return ""

        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def useful(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break

                    if not useful(boxes[i]):
                        continue
                    # original also considered:
                    # abs(width(boxes[i]) - width_mean) / max(width(boxes[i]), width_mean) < 0.5
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if useful(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception as e:
            logging.error(str(e))

    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        st = timer()
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']}
                                for c in page.dedupe_chars().chars if self._has_color(c)]
                               for page in self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception as e:
            logging.error(str(e))

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")

        logging.info("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i])))))
            for i in range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        st = timer()
        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1

            self.__ocr(i + 1, img, chars, zoomin)
            if callback and i % 6 == 5:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
        # print("OCR:", timer() - st)

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.info("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
        if len(self.boxes) == 0 and zoomin < 9:
            self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
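
    # The full pipeline: rasterize pages + OCR, recognize layouts, analyze table
    # structure, merge text boxes horizontally then vertically (model-assisted),
    # drop TOC pages, and finally split out tables/figures from the running text.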
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)

    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM,
                                               min(bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss


class PlainParser:
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")

        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError


if __name__ == "__main__":
    pass
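    # Minimal usage sketch (an editorial addition, not part of the original
    # module): run as `python pdf_parser.py some.pdf`. Assumes the model weights
    # are available under rag/res/deepdoc or downloadable from HuggingFace.
    import sys
    if len(sys.argv) > 1:
        pdf_parser = RAGFlowPdfParser()
        txt, tbls = pdf_parser(sys.argv[1], need_image=False, return_html=True)
        print(pdf_parser.remove_tag(txt))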