#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
import random
from timeit import default_timer as timer
import sys
import threading

import xgboost as xgb
from io import BytesIO
import re
import pdfplumber
from PIL import Image
import numpy as np
from pypdf import PdfReader as pdf2_read

from api import settings
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
from copy import deepcopy
from huggingface_hub import snapshot_download

LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()


class RAGFlowPdfParser:
    def __init__(self):
        """
        If you have trouble downloading HuggingFace models, this mirror might help.

        For Linux:
            export HF_ENDPOINT=https://hf-mirror.com
        For Windows (PowerShell):
            $env:HF_ENDPOINT = "https://hf-mirror.com"
        """
        self.ocr = OCR()
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if not settings.LIGHTEN:
            try:
                import torch
                if torch.cuda.is_available():
                    self.updown_cnt_mdl.set_param({"device": "cuda"})
            except Exception:
                logging.exception("RAGFlowPdfParser __init__")
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0

    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\((][零一二三四五六七八九十百]+[)\)]",
            r"[\((][0-9]+[)\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])

    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split()
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # the line above is repeated below; presumably kept as-is so the
            # feature count matches the trained updown_concat_xgb model
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(
                up["text"].strip()) > 1 and len(down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1] if tks_down and tks_up else False,
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
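
    # A minimal sketch of how the feature vector above is consumed (this is
    # exactly what _concat_downward does further below): the XGBoost booster
    # loaded in __init__ scores whether "up" and "down" belong to the same
    # paragraph.
    #
    #     fea = self._updown_concat_features(up, down)
    #     same_para = self.updown_cnt_mdl.predict(xgb.DMatrix([fea]))[0] > 0.5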

    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore reading order when x0 values are within the threshold
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    arr[j], arr[j + 1] = arr[j + 1], arr[j]
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True

    def _table_transformer_job(self, ZM):
        logging.debug("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R, H, C, SP tags to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii

    def __ocr(self, pagenum, img, chars, ZM=3):
        start = timer()
        bxs = self.ocr.detect(np.array(img))
        logging.info(f"__ocr detecting boxes of an image cost ({timer() - start}s)")

        start = timer()
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_Y_firstly(
                chars, self.mean_height[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]
        logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")

        start = timer()
        boxes_to_reg = []
        img_np = np.array(img)
        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["box_image"] = self.ocr.get_rotate_crop_image(img_np, np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32))
                boxes_to_reg.append(b)
            del b["txt"]
        texts = self.ocr.recognize_batch([b["box_image"] for b in boxes_to_reg])
        for i in range(len(boxes_to_reg)):
            boxes_to_reg[i]["text"] = texts[i]
            del boxes_to_reg[i]["box_image"]
        logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s")

        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
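
    # Each list appended to self.boxes by __ocr holds one dict per recognized
    # text line; coordinates are divided by ZM, i.e. expressed in unzoomed page
    # units. A representative (hypothetical) entry:
    #
    #     {"x0": 56.0, "x1": 539.0, "top": 70.1, "bottom": 88.4,
    #      "text": "Some recognized line", "page_number": 1}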

    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
                                                                                                 "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue
            # NOTE: the punctuation-aware merge below is unreachable because of
            # the unconditional `continue` above.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs

    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\",‘“、;:",
                b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features against concatenation
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("{} {} {} {}".format(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                ))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs

    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue
                    if not down["text"].strip() or not up["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)

    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split()[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split()[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)

    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
                                                                                                      "title",
                                                                                                      "figure caption",
                                                                                                      "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables that span different pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop them out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1},{bxs[0].get('layoutno', '')}")

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                if right < left:
                    right = left + 1
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figures out and add captions
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []

            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[::??]$", 9),
            (r"[0-9]+)", 10),
            (r"[\((][0-9]+[)\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return

    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        page_images_cnt = len(self.page_images)
        if pn[-1] - 1 >= page_images_cnt:
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
            if pn[-1] - 1 >= page_images_cnt:
                return ""

        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)
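
    # The tag encodes the page number(s) and the box rectangle. A hypothetical
    # example for a box spanning pages 1-2:
    #
    #     "@@1-2\t56.0\t539.0\t702.3\t41.8##"
    #
    # crop() parses these tags back out of chunk text, and remove_tag() strips
    # them.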

    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            with sys.modules[LOCK_KEY_pdfplumber]:
                pdf = pdfplumber.open(
                    fnm) if not binary else pdfplumber.open(BytesIO(binary))
                total_page = len(pdf.pages)
                pdf.close()
                return total_page
        except Exception:
            logging.exception("total_page_number")
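
    # Usage sketch ("doc.pdf" is a hypothetical file): the method accepts
    # either a path or raw bytes, matching the two pdfplumber.open branches
    # above.
    #
    #     n = RAGFlowPdfParser.total_page_number("doc.pdf")
    #     with open("doc.pdf", "rb") as f:
    #         n = RAGFlowPdfParser.total_page_number("doc.pdf", binary=f.read())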

    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        start = timer()
        try:
            with sys.modules[LOCK_KEY_pdfplumber]:
                self.pdf = pdfplumber.open(fnm) if isinstance(
                    fnm, str) else pdfplumber.open(BytesIO(fnm))
                self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                    enumerate(self.pdf.pages[page_from:page_to])]
                try:
                    self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
                except Exception as e:
                    logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                    self.page_chars = [[] for _ in range(page_to - page_from)]  # if extraction fails, fall back to empty lists
                self.total_page = len(self.pdf.pages)
        except Exception:
            logging.exception("RAGFlowPdfParser __images__")
        logging.info(f"__images__ dedupe_chars cost {timer() - start}s")

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        finally:
            self.pdf.close()
        if not self.outlines:
            logging.warning("Miss outlines")

        logging.debug("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        start = timer()
        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1

            self.__ocr(i + 1, img, chars, zoomin)
            if callback and i % 6 == 5:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
        logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.debug("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
        if len(self.boxes) == 0 and zoomin < 9:
            self.__images__(fnm, zoomin * 3, page_from, page_to, callback)

    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls

    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
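
    # remove_tag is the inverse of _line_tag: it strips the position markers
    # appended to each line. A hypothetical round trip:
    #
    #     tagged = "Quarterly revenue grew@@3\t56.0\t539.0\t70.1\t88.4##"
    #     clean = self.remove_tag(tagged)  # -> "Quarterly revenue grew"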

    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM, min(
                                                   bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic

    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss


class PlainParser(object):
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception:
            logging.exception("Outlines exception")
        if not self.outlines:
            logging.warning("Miss outlines")

        return [(line, "") for line in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError


if __name__ == "__main__":
    pass
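    # A minimal usage sketch (hypothetical file name; kept commented so the
    # module still imports without side effects):
    #
    #     parser = RAGFlowPdfParser()
    #     text, tbls = parser("doc.pdf", need_image=True, zoomin=3, return_html=False)
    #     print(parser.remove_tag(text))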