# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import random
import xgboost as xgb
from io import BytesIO
import torch
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
import numpy as np
from timeit import default_timer as timer
from PyPDF2 import PdfReader as pdf2_read

from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
from copy import deepcopy
from huggingface_hub import snapshot_download

logging.getLogger("pdfminer").setLevel(logging.WARNING)
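

# RAGFlowPdfParser turns a PDF into layout-aware text chunks plus extracted
# tables and figures. The rough pipeline (see __call__): render the pages to
# images, OCR them (reusing the PDF's embedded characters when available),
# run layout and table-structure recognition, merge text boxes horizontally
# and vertically (an XGBoost model scores cross-row concatenation), filter
# out tables of contents and page scraps, then crop table/figure images.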
class RAGFlowPdfParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception as e:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(
                    get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """

    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\((][零一二三四五六七八九十百]+[)\)]",
            r"[\((][0-9]+[)\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
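
    # _updown_concat_features: build the feature vector for the XGBoost
    # "updown concat" model, which predicts whether box `down` continues the
    # text of box `up`: same table row, boundary punctuation, layout types,
    # relative sizes and distances, and tokenization across the junction.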
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1
            else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) /
            min(self.__height(up), self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threshold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order when x0 is within the threshold
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True
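
    # _table_transformer_job: crop every detected table region (with a small
    # margin) out of the page images, run the table-structure recognizer on
    # the crops, map the recognized components back into page coordinates,
    # and tag overlapping text boxes with row (R), header (H), column (C)
    # and spanning-cell (SP) indices.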
    def _table_transformer_job(self, ZM):
        logging.info("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R, H, C, SP tags to boxes within table layouts
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # solve char content confusion: estimate the mean vertical offset
        # between PDF chars and detected boxes, then shift the chars
        record_error_length, ct = 0, 1
        for c in chars[0:128]:
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                continue
            record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["top"] - c["bottom"] - c["top"]) / 2)
            ct += 1
        record_error_length = record_error_length / ct
        for char in chars:
            char["top"] -= record_error_length
            char["bottom"] -= record_error_length

        # merge chars into the box they fall in
        for c in Recognizer.sort_X_firstly(
                chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        # boxes that got no chars are recognized by OCR
        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # shift into cumulative Y coordinates across pages
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

    def _text_merge(self):
        # horizontally merge adjacent boxes with the same layout number
        bxs = self.boxes
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") \
                    or b.get("layout_type", "") in ["table", "figure", "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)) \
                    < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
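
    # _naive_vertical_merge: sort boxes top-to-bottom and join each box with
    # the next unless layout, punctuation, spacing or page-break features say
    # they are separate paragraphs; pure page-number artifacts and empty
    # boxes are dropped along the way.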
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\",‘“、;:",
                b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug(
                    "%s %s %s %s %s",
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                    any(detach_feats))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
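
    # _concat_downward: count how many boxes share each box's visual row
    # ("in_row", a model feature), then greedily chain boxes downward with a
    # DFS. For nearby text boxes the layout number decides the chain;
    # otherwise the XGBoost model scores each candidate junction. Each chain
    # is finally flattened back into a single box.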
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue
                    if not down["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)
        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
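
    # _filter_forpages: drop table-of-contents and acknowledgement sections;
    # failing that, drop any page dominated by dot-leader lines ("····"),
    # which are almost always TOC leftovers.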
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1
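
    # _merge_with_same_bullet: join consecutive boxes that start with the
    # same bullet glyph into one multi-line box (skipped when the first
    # character is a latin letter or a Chinese character, i.e. ordinary
    # prose rather than a bullet).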
    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuiopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
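
    # _extract_table_figure: pull table and figure boxes out of self.boxes,
    # merge tables that continue across pages, attach the nearest caption to
    # each table/figure, crop the regions out of the page images, and return
    # (image, content) pairs, where table content comes from construct_table
    # (optionally as HTML), plus positions when requested.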
    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) \
                    or self.boxes[i]["layout_type"] in ["table caption", "title",
                                                        "figure caption", "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables split across pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop them out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1},{bxs[0].get('layoutno', '')}")

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                if right < left:
                    right = left + 1
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))

            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and add captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[::??]$", 9),
            (r"[0-9]+\)", 10),
            (r"[\((][0-9]+[)\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
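
    # _line_tag: encode a box's position as
    # "@@page[-page...]\tx0\tx1\ttop\tbottom##". The tags are appended to the
    # text by __filterout_scraps, stripped by remove_tag, and parsed back
    # into crop regions by crop().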
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        page_images_cnt = len(self.page_images)
        if pn[-1] - 1 >= page_images_cnt:
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
            if pn[-1] - 1 >= page_images_cnt:
                return ""

        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)
    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception as e:
                pass
            boxes.pop(0)

            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception as e:
            logging.error(str(e))
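
    # __images__: render pages [page_from, page_to) to images at 72 * zoomin
    # dpi, collect the PDF's embedded characters and outline, guess whether
    # the document is English, then OCR every page. If nothing at all was
    # extracted, the whole pass is retried at triple zoom.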
    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        st = timer()
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[{**c, 'top': max(0, c['top'] - 10), 'bottom': max(0, c['bottom'] - 10)}
                                for c in page.chars if self._has_color(c)]
                               for page in self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception as e:
            logging.error(str(e))

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")

        logging.info("Images converted.")
        self.is_english = [
            re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
                random.choices([c["text"] for c in self.page_chars[i]],
                               k=min(100, len(self.page_chars[i])))))
            for i in range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        st = timer()
        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1

            self.__ocr(i + 1, img, chars, zoomin)
            if callback and i % 6 == 5:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
        # print("OCR:", timer()-st)

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.info("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
        if len(self.boxes) == 0 and zoomin < 9:
            self.__images__(fnm, zoomin * 3, page_from, page_to, callback)

    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
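
    # crop: given tagged text produced by __call__, cut the referenced
    # regions out of the page images (plus a little dimmed context above and
    # below) and stack them into one image; optionally also return the page
    # positions of the crops.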
    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss
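

# PlainParser: a lightweight fallback that extracts raw text lines and the
# outline with PyPDF2 only, without OCR or layout analysis.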
class PlainParser(object):
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")
        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError


if __name__ == "__main__":
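    # A minimal usage sketch (hypothetical file name; assumes the deepdoc
    # model weights are available locally or downloadable from HuggingFace):
    #
    #   parser = RAGFlowPdfParser()
    #   text, tbls = parser("sample.pdf", need_image=True, return_html=True)
    #   print(parser.remove_tag(text))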
    pass