pdf_parser.py

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import random
import re
from copy import deepcopy
from io import BytesIO
from timeit import default_timer as timer

import numpy as np
import pdfplumber
import xgboost as xgb
from huggingface_hub import snapshot_download
from PIL import Image
from pypdf import PdfReader as pdf2_read

from api import settings
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer


class RAGFlowPdfParser:
    def __init__(self):
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if not settings.LIGHTEN:
            try:
                import torch
                if torch.cuda.is_available():
                    self.updown_cnt_mdl.set_param({"device": "cuda"})
            except Exception:
                logging.exception("RAGFlowPdfParser __init__")
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!

        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com

        For Windows:
        Good luck
        ^_-
        """
    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\((][零一二三四五六七八九十百]+[)\)]",
            r"[\((][0-9]+[)\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
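
    # Build the feature vector fed to the XGBoost model (updown_cnt_mdl) that
    # decides whether an upper and a lower text box belong to the same
    # paragraph: shared table row, vertical gap relative to line height,
    # layout types, punctuation at the joint, token statistics, etc.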
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split(" ")
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split(" ")
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split(" ")
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(
                up["text"].strip()) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
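
    # Sort boxes by page, then x0, then top; a bubble pass then restores
    # top-to-bottom order among boxes whose x0 differ by less than the
    # threshold, so boxes in the same column stay grouped.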
    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore top order within the x0 threshold
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True
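
    # Crop every table region found by the layout recognizer (with a small
    # margin), run the table-structure model on the crops, map the detected
    # components back to page coordinates, and tag the text boxes inside each
    # table with row (R), header (H), column (C) and spanning-cell (SP) ids.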
    def _table_transformer_job(self, ZM):
        logging.debug("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tags to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr.detect(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_Y_firstly(
                chars, self.mean_height[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["text"] = self.ocr.recognize(np.array(img),
                                               np.array([[left, top], [right, top], [right, bott], [left, bott]],
                                                        dtype=np.float32))
            del b["txt"]
        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
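
    # Merge horizontally adjacent boxes that the layout model assigned to the
    # same layout element, as long as they sit on (roughly) the same line.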
    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get(
                    "layout_type", "") in ["table", "figure", "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue
            # NOTE: the punctuation/distance-based merge below is unreachable;
            # the "continue" above always fires first.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
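
    # Rule-based vertical merge: join a box with the one below it unless
    # punctuation, layout, or spacing features argue for a paragraph break.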
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\",‘“、;:",
                b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("{} {} {} {}".format(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                ))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
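
    # Model-based vertical merge: chain boxes downward with a DFS, asking the
    # XGBoost model whether each candidate pair should be concatenated, then
    # flatten every chain into a single box.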
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue
                    if not down["text"].strip() or not up["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
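
    # Drop table-of-contents / acknowledgement material: the heading line
    # itself, the entries that follow it, and, when no explicit heading is
    # found, any page dominated by dot-leader lines.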
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
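
    # Pull table and figure boxes out of self.boxes, merge tables that
    # continue across pages, attach the nearest caption to each, and return
    # cropped images together with the reconstructed table content.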
    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
                                                                                                      "title",
                                                                                                      "figure caption",
                                                                                                      "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables split across pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1}, {bxs[0].get('layoutno', '')}")

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                if right < left:
                    right = left + 1
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))

            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and add captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[::??]$", 9),
            (r"[0-9]+）", 10),
            (r"[\((][0-9]+[)\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
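
    # Encode a box position as an inline tag appended to its text:
    # "@@<page[-page...]>\t<x0>\t<x1>\t<top>\t<bottom>##". crop() and
    # remove_tag() parse and strip these tags later on.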
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        page_images_cnt = len(self.page_images)
        if pn[-1] - 1 >= page_images_cnt:
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
            if pn[-1] - 1 >= page_images_cnt:
                return ""
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            if b.get("layout_type"):
                return True
            if width(b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception:
            logging.exception("total_page_number")

    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        st = timer()
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in
                                   enumerate(self.pdf.pages[page_from:page_to])]
            self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in
                               self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception:
            logging.exception("RAGFlowPdfParser __images__")

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Miss outlines")

        logging.debug("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        st = timer()
        for i, img in enumerate(self.page_images_x2):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin / 2)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1

            self.__ocr(i + 1, img, chars, zoomin * 2)
            if callback and i % 6 == 5:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
        # print("OCR:", timer() - st)

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.debug("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
        if len(self.boxes) == 0 and zoomin < 9:
            self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
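
    # Full pipeline: rasterize + OCR, layout recognition, table structure,
    # horizontal and vertical merging, TOC filtering, then table/figure
    # extraction. Returns (text sections with position tags, tables).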
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
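
    # Rebuild an image for a tagged text chunk: parse every "@@...##" tag,
    # add dimmed context strips above and below, and stack the page crops
    # into one picture (optionally returning the per-page positions).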
    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM, min(
                                                   bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss
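

# Fallback parser: plain text extraction via pypdf, with no OCR and no layout
# analysis. Returns ((line, "") pairs, no tables).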
class PlainParser(object):
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception:
            logging.exception("Outlines exception")
        if not self.outlines:
            logging.warning("Miss outlines")

        return [(l, "") for l in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError
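

# A minimal usage sketch (the "sample.pdf" path is illustrative, not part of
# this module):
#
#   parser = RAGFlowPdfParser()
#   text, tbls = parser("sample.pdf", need_image=True, zoomin=3,
#                       return_html=True)
#   print(parser.remove_tag(text))  # strip the "@@...##" position tags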
if __name__ == "__main__":
    pass