#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import random
import re
import sys
import threading
from copy import deepcopy
from io import BytesIO
from timeit import default_timer as timer

import numpy as np
import pdfplumber
import trio
import xgboost as xgb
from huggingface_hub import snapshot_download
from PIL import Image
from pypdf import PdfReader as pdf2_read

from api import settings
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, LayoutRecognizer, Recognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer

LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
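

# RAGFlowPdfParser turns a PDF into layout-aware text chunks: pdfplumber
# renders page images and extracts the embedded text layer, OCR recovers
# text the PDF layer misses, vision models tag layout regions and table
# structure, and an XGBoost classifier decides which vertically adjacent
# boxes belong to the same paragraph.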
class RAGFlowPdfParser:
    def __init__(self, parallel_devices: int | None = None):
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!

        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com

        For Windows:
        Good luck
        ^_-
        """
        self.ocr = OCR(parallel_devices=parallel_devices)
        self.parallel_devices = parallel_devices
        self.parallel_limiter = None
        if parallel_devices is not None and parallel_devices > 1:
            self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(parallel_devices)]

        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if not settings.LIGHTEN:
            try:
                import torch.cuda
                if torch.cuda.is_available():
                    self.updown_cnt_mdl.set_param({"device": "cuda"})
            except Exception:
                logging.exception("RAGFlowPdfParser __init__")
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0

    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        # does the box text look like a (Chinese or numbered) section heading?
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\(（][零一二三四五六七八九十百]+[）\)]",
            r"[\(（][0-9]+[）\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
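
    # Feature vector fed to the updown-concat XGBoost model: layout
    # agreement, vertical gap relative to line height, punctuation at the
    # join, token overlap, and in-row statistics for an upper box and the
    # candidate box below it.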
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split()
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。？！；!?;+）)]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[，：‘“、0-9（+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\]，。；：’”？！》】）-])",
                down["text"]) else False,
            True if re.match(r"[\(（][^\(\)（）]+[）\)]$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\(（][^\)）]+$", up["text"])
            and re.search(r"[\)）]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()) > 1 and len(
                down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1] if tks_down and tks_up else False,
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threshold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the order using the threshold
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    arr[j], arr[j + 1] = arr[j + 1], arr[j]
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True
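
    # Run the table-structure model over every table region the layout model
    # found, map the detected rows/headers/columns/spans back to page
    # coordinates, and tag the text boxes that fall inside them.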
    def _table_transformer_job(self, ZM):
        logging.debug("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, portion=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, portion)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R, H, C, SP tags to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
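
    # OCR one page image: detect text boxes, pour the PDF text layer's
    # characters into the boxes they overlap, then batch-recognize whatever
    # boxes are still empty.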
    def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None):
        start = timer()
        bxs = self.ocr.detect(np.array(img), device_id)
        logging.info(f"__ocr detecting boxes of an image cost ({timer() - start}s)")

        start = timer()
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_Y_firstly(
                chars, self.mean_height[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]
        logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")

        start = timer()
        boxes_to_reg = []
        img_np = np.array(img)
        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["box_image"] = self.ocr.get_rotate_crop_image(
                    img_np, np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32))
                boxes_to_reg.append(b)
            del b["txt"]
        texts = self.ocr.recognize_batch([b["box_image"] for b in boxes_to_reg], device_id)
        for i in range(len(boxes_to_reg)):
            boxes_to_reg[i]["text"] = texts[i]
            del boxes_to_reg[i]["box_image"]
        logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s")

        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
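
    # Horizontally merge boxes that sit on the same visual line within the
    # same layout region.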
    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
                                                                                                 "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)
                   ) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue

            # NOTE: the distance-based merge below is unreachable; the
            # `continue` above disables it.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs

    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\",‘“、;:",
                b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("{} {} {} {}".format(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                ))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
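
    # Walk down the page and greedily chain boxes that the XGBoost
    # updown-concat model predicts belong to the same paragraph, then fuse
    # each chain into a single box.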
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue
                    if not down["text"].strip() or not up["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)

    def _filter_forpages(self):
        # drop table-of-contents / acknowledgement sections, then drop pages
        # dominated by dot leaders
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split()[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split()[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        # merge consecutive boxes that open with the same bullet character
        # (note: 'i' is absent from the letter set below, so single-letter
        # roman-numeral bullets are still merged)
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
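
    # Pull table and figure boxes out of the text stream, attach the nearest
    # caption to each, crop the region images out of the pages, and run the
    # table-structure model to emit table content (optionally as HTML).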
    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
                                                                                                      "title",
                                                                                                      "figure caption",
                                                                                                      "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables split across pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop them out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)  # was `tk`: log the figure key, not the table key
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1},{bxs[0].get('layoutno', '')}")

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                if right < left:
                    right = left + 1
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and add captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\(（][零一二三四五六七八九十百]+[）\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[:：？?]$", 9),
            (r"[0-9]+）", 10),
            (r"[\(（][0-9]+[）\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
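
    # Tag a box with its position so it can be cropped back out later:
    # "@@page-numbers\tx0\tx1\ttop\tbottom##".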
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        page_images_cnt = len(self.page_images)
        if pn[-1] - 1 >= page_images_cnt:
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
            if pn[-1] - 1 >= page_images_cnt:
                return ""

        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def useful(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not useful(boxes[i]):
                        continue
                    # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                    if mmj or self._x_dis(boxes[i], line) < pw / 10:
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if useful(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            with sys.modules[LOCK_KEY_pdfplumber]:
                pdf = pdfplumber.open(
                    fnm) if not binary else pdfplumber.open(BytesIO(binary))
                total_page = len(pdf.pages)
                pdf.close()
                return total_page
        except Exception:
            logging.exception("total_page_number")
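
    # Render the requested page range to images, extract the text layer and
    # document outline, guess the document language, and OCR every page
    # (optionally in parallel across devices).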
    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        start = timer()
        try:
            with sys.modules[LOCK_KEY_pdfplumber]:
                self.pdf = pdfplumber.open(fnm) if isinstance(
                    fnm, str) else pdfplumber.open(BytesIO(fnm))
                self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                    enumerate(self.pdf.pages[page_from:page_to])]
                try:
                    self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)]
                                       for page in self.pdf.pages[page_from:page_to]]
                except Exception as e:
                    logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                    self.page_chars = [[] for _ in range(page_to - page_from)]  # If extraction fails, fall back to empty lists.
                self.total_page = len(self.pdf.pages)
        except Exception:
            logging.exception("RAGFlowPdfParser __images__")
        logging.info(f"__images__ dedupe_chars cost {timer() - start}s")

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        finally:
            self.pdf.close()
        if not self.outlines:
            logging.warning("Miss outlines")

        logging.debug("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        async def __img_ocr(i, id, img, chars, limiter):
            # insert spaces where the text layer leaves a visible gap
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1

            if limiter:
                async with limiter:
                    await trio.to_thread.run_sync(lambda: self.__ocr(i + 1, img, chars, zoomin, id))
            else:
                self.__ocr(i + 1, img, chars, zoomin, id)

            if callback and i % 6 == 5:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")

        async def __img_ocr_launcher():
            def __ocr_preprocess():
                chars = self.page_chars[i] if not self.is_english else []
                self.mean_height.append(
                    np.median(sorted([c["height"] for c in chars])) if chars else 0
                )
                self.mean_width.append(
                    np.median(sorted([c["width"] for c in chars])) if chars else 8
                )
                self.page_cum_height.append(img.size[1] / zoomin)
                return chars

            if self.parallel_limiter:
                async with trio.open_nursery() as nursery:
                    for i, img in enumerate(self.page_images):
                        chars = __ocr_preprocess()
                        nursery.start_soon(__img_ocr, i, i % self.parallel_devices, img, chars,
                                           self.parallel_limiter[i % self.parallel_devices])
                        await trio.sleep(0.1)
            else:
                for i, img in enumerate(self.page_images):
                    chars = __ocr_preprocess()
                    await __img_ocr(i, 0, img, chars, None)

        start = timer()
        trio.run(__img_ocr_launcher)
        logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.debug("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
        if len(self.boxes) == 0 and zoomin < 9:
            self.__images__(fnm, zoomin * 3, page_from, page_to, callback)

    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
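
    # Crop the page regions referenced by the position tags in `text` back
    # out of the rendered pages and stack them into one image, with dimmed
    # context strips above and below.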
    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop(
                    (left * ZM, top * ZM,
                     right * ZM, min(bottom, self.page_images[pns[0]].size[1]))))
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop(
                        (left * ZM, 0,
                         right * ZM, min(bottom, self.page_images[pn].size[1]))))
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss
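

# PlainParser is the lightweight fallback: it reads the PDF text layer with
# pypdf only, so no OCR, no layout analysis, and no position information.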
class PlainParser:
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception:
            logging.exception("Outlines exception")
        if not self.outlines:
            logging.warning("Miss outlines")

        return [(line, "") for line in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError


if __name__ == "__main__":
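    # Minimal usage sketch, kept here as a smoke test. It assumes a PDF path
    # is passed on the command line; "sample.pdf" is only a placeholder
    # default, and the first run may download models from HuggingFace.
    parser = RAGFlowPdfParser()
    txt, tbls = parser(sys.argv[1] if len(sys.argv) > 1 else "sample.pdf")
    print(parser.remove_tag(txt))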