#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
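"""PDF layout parser for RAGFlow's deepdoc pipeline.

Combines pdfplumber character extraction with OCR, layout recognition and
table-structure recognition, then merges the detected text boxes using rule
features scored by an XGBoost model.
"""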

import logging
import os
import random
import re
from copy import deepcopy
from io import BytesIO

import numpy as np
import pdfplumber
import xgboost as xgb
from huggingface_hub import snapshot_download
from PIL import Image
from pypdf import PdfReader as pdf2_read

from api import settings
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer


class RAGFlowPdfParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if not settings.LIGHTEN:
            try:
                import torch
                if torch.cuda.is_available():
                    self.updown_cnt_mdl.set_param({"device": "cuda"})
            except Exception:
                logging.exception("RAGFlowPdfParser __init__")
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
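        # A sketch of the same workaround from Python (an assumption, not
        # part of the original; the variable must be set before
        # huggingface_hub is imported):
        #     os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"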

    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是 　]",
            r"[\(（][零一二三四五六七八九十百]+[）\)]",
            r"[\(（][0-9]+[）\)]",
            r"[0-9]+(、|\.[　 ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[　 ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
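
    # The next method builds the fixed-length feature vector consumed by the
    # XGBoost "up/down concatenation" model: geometry (heights, distances),
    # layout labels, punctuation at the junction, tokenizer-based features,
    # and row-density counts for the upper and lower boxes.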
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split()
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。？！；!?;+）)]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\]，。；：’”？！》】）-])",
                down["text"]) else False,
            True if re.match(r"[\(（][^\(\)（）]+[）\)]$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\(（][^）)]+$", up["text"])
            and re.search(r"[）)]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1] if tks_down and tks_up else False,
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order for boxes in (roughly) the same column
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True
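
    # The next method crops every detected table region (with a small margin),
    # runs the table-structure recognizer on the crops, maps the recognized
    # components back to page coordinates, and tags overlapping text boxes
    # with row (R), header (H), column (C) and spanning-cell (SP) indices.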
    def _table_transformer_job(self, ZM):
        logging.debug("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
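
    # The next method runs OCR detection on one rasterized page, pours the
    # pdfplumber characters into the detected boxes, and falls back to OCR
    # recognition for boxes that received no embedded text.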
    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_Y_firstly(
                chars, self.mean_height[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
                                                                                                 "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue
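
            # NOTE: the distance-based merge below is unreachable because of
            # the unconditional continue above; it is kept as in the original.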
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
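
    # The next method stacks boxes vertically: page-number artifacts and empty
    # boxes are dropped, then consecutive boxes are merged unless punctuation,
    # layout or spacing features argue for keeping them apart.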
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\"，、‘“；：-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\"，‘“、；：",
                b_["text"].strip() and b_["text"].strip()[0] in "。；？！?”）),，、：",
            ]
            # features against concatenation
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。？！?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("{} {} {} {}".format(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                ))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
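
    # The next method chains boxes top-down: it first counts same-row
    # neighbours as a feature, then greedily follows each box downward,
    # letting the XGBoost model decide whether two boxes should be joined,
    # and finally flattens every chain into a single box.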
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue
                    if not down["text"].strip() or not up["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
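
    # The next method removes front-matter noise: a detected table of
    # contents (and the entries sharing its prefix) is dropped, and pages
    # dominated by dot-leader lines are discarded wholesale.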
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split()[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split()[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break

        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            # all lowercase latin letters ("i" restored; it was missing)
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuiopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
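
    # The next method pulls table and figure boxes out of self.boxes, merges
    # tables that continue across pages, attaches the nearest caption to each
    # table or figure, and crops the corresponding page regions into images
    # (optionally also rendering tables as HTML).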
    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
                                                                                                      "title",
                                                                                                      "figure caption",
                                                                                                      "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables split across pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1}, {bxs[0].get('layoutno', '')}")

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                if right < left:
                    right = left + 1
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and attach captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res
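
    # The next method classifies a line as a "project" (outline) heading and
    # returns a numbering level for heading-like lines (None or False otherwise).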
    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、 　]", 3),
            (r"[\(（][零一二三四五六七八九十百]+[）\)]", 4),
            (r"[0-9]+(、|\.[　 ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[. 　]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ 　]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ 　]|[^0-9])", 8),
            (r".{,48}[：:?？]$", 9),
            (r"[0-9]+）", 10),
            (r"[\(（][0-9]+[）\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
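
    # The next method appends a position tag of the form
    # "@@<pages>\t<x0>\t<x1>\t<top>\t<bottom>##" to a line; crop() parses
    # these tags back and remove_tag() strips them.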
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        page_images_cnt = len(self.page_images)
        if pn[-1] - 1 >= page_images_cnt:
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
            if pn[-1] - 1 >= page_images_cnt:
                return ""

        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)
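
    # The next method keeps only text runs that look meaningful: starting
    # from each remaining box it follows nearby boxes on the same page, then
    # drops runs that are both narrow and not heading-like.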
    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception:
            logging.exception("total_page_number")
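
    # The next method rasterizes the requested page range (at 72 * zoomin dpi,
    # plus a 2x set for OCR), extracts colored characters via pdfplumber,
    # reads the PDF outline, guesses whether the document is English, and
    # runs OCR page by page; it retries at a higher zoom if nothing is found.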
    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in
                                   enumerate(self.pdf.pages[page_from:page_to])]
            try:
                self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
            except Exception as e:
                logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                self.page_chars = [[] for _ in range(page_to - page_from)]  # if extraction fails, fall back to empty lists
            self.total_page = len(self.pdf.pages)
        except Exception:
            logging.exception("RAGFlowPdfParser __images__")

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")

        logging.debug("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        # st = timer()
        for i, img in enumerate(self.page_images_x2):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin / 2)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1
            self.__ocr(i + 1, img, chars, zoomin * 2)
            if callback and i % 6 == 5:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
        # print("OCR:", timer() - st)

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.debug("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
        if len(self.boxes) == 0 and zoomin < 9:
            self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
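
    # Calling the parser runs the full pipeline in order: rasterize + OCR,
    # layout recognition, table-structure recognition, horizontal merge,
    # model-driven downward concatenation, front-matter filtering, then
    # table/figure extraction and scrap filtering.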
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
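
    # The next method re-crops the page regions referenced by the position
    # tags embedded in a chunk of text and stacks them (plus a little context
    # above and below, dimmed at the edges) into a single image.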
    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM, min(
                                                   bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss
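

# A lightweight fallback parser: no OCR and no layout models, just pypdf
# text extraction plus the same outline traversal as RAGFlowPdfParser.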
class PlainParser(object):
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception:
            logging.exception("Outlines exception")
        if not self.outlines:
            logging.warning("Miss outlines")

        return [(line, "") for line in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError
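

# Minimal usage sketch (an assumption, not part of the original; the
# "sample.pdf" path is hypothetical, and tbls holds (image, content) pairs
# for the extracted tables and figures):
#
#     parser = RAGFlowPdfParser()
#     text, tbls = parser("sample.pdf", need_image=False, return_html=True)
#     print(parser.remove_tag(text)[:1000])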
if __name__ == "__main__":
    pass