Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327
  1. #
  2. # Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. #
  16. import logging
  17. import os
  18. import random
  19. import re
  20. import sys
  21. import threading
  22. from copy import deepcopy
  23. from io import BytesIO
  24. from timeit import default_timer as timer
  25. import numpy as np
  26. import pdfplumber
  27. import trio
  28. import xgboost as xgb
  29. from huggingface_hub import snapshot_download
  30. from PIL import Image
  31. from pypdf import PdfReader as pdf2_read
  32. from api import settings
  33. from api.utils.file_utils import get_project_base_directory
  34. from deepdoc.vision import OCR, LayoutRecognizer, Recognizer, TableStructureRecognizer
  35. from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
  36. from rag.nlp import rag_tokenizer
  37. from rag.prompts import vision_llm_describe_prompt
  38. from rag.settings import PARALLEL_DEVICES
  39. LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
  40. if LOCK_KEY_pdfplumber not in sys.modules:
  41. sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
  42. class RAGFlowPdfParser:
    def __init__(self, **kwargs):
        """
        Set up all models used while parsing a PDF: OCR, layout recognizer,
        table-structure recognizer, and the XGBoost model that decides whether
        two vertically adjacent text boxes should be concatenated.

        If you have trouble downloading HuggingFace models, -_^ this might help!!

        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com

        For Windows:
        Good luck
        ^_-
        """
        self.ocr = OCR()
        # One CapacityLimiter per device so page OCR jobs can be fanned out
        # across devices; left as None when there is a single device.
        self.parallel_limiter = None
        if PARALLEL_DEVICES > 1:
            self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)]

        # Subclasses may define `model_speciess` to select a specialized
        # layout model variant.
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if not settings.LIGHTEN:
            try:
                import torch.cuda
                if torch.cuda.is_available():
                    self.updown_cnt_mdl.set_param({"device": "cuda"})
            except Exception:
                # CUDA probing is best-effort; fall back to CPU silently.
                logging.exception("RAGFlowPdfParser __init__")
        try:
            # Prefer the locally bundled model file.
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            # Fall back to downloading the model from HuggingFace.
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        # 0-based index of the first parsed page within the source PDF.
        self.page_from = 0
  83. def __char_width(self, c):
  84. return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)
  85. def __height(self, c):
  86. return c["bottom"] - c["top"]
  87. def _x_dis(self, a, b):
  88. return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
  89. abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
  90. def _y_dis(
  91. self, a, b):
  92. return (
  93. b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
  94. def _match_proj(self, b):
  95. proj_patt = [
  96. r"第[零一二三四五六七八九十百]+章",
  97. r"第[零一二三四五六七八九十百]+[条节]",
  98. r"[零一二三四五六七八九十百]+[、是  ]",
  99. r"[\((][零一二三四五六七八九十百]+[)\)]",
  100. r"[\((][0-9]+[)\)]",
  101. r"[0-9]+(、|\.[  ]|)|\.[^0-9./a-zA-Z_%><-]{4,})",
  102. r"[0-9]+\.[0-9.]+(、|\.[  ])",
  103. r"[⚫•➢①② ]",
  104. ]
  105. return any([re.match(p, b["text"]) for p in proj_patt])
    def _updown_concat_features(self, up, down):
        """Build the feature vector fed to ``updown_cnt_mdl`` (XGBoost) to
        decide whether box *up* (above) and box *down* (below) belong to the
        same paragraph.

        The order and count of entries in the returned list must match the
        vector the model was trained on — do not reorder or remove features.
        """
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6  # number of trailing/leading characters used for token features
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
        # Tokenize the junction of the two texts; insert a space when the two
        # sides would otherwise fuse alphanumeric runs together.
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split()
        fea = [
            # same recognized table row (R), if any
            up.get("R", -1) == down.get("R", -1),
            # normalized vertical gap
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            # up ends like a sentence terminator
            True if re.search(
                r"([。?!;!?;+))]|[a-z]\.)$",
                up["text"]) else False,
            # up ends with a "continuation" character
            True if re.search(r"[,:‘“、0-9(+-]$", up["text"]) else False,
            # down starts with closing punctuation
            True if re.search(
                r"(^.?[/,?;:\],。;:’”?!》】)-])",
                down["text"]) else False,
            True if re.match(r"[\((][^\(\)()]+[)\)]$", up["text"]) else False,
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # NOTE(review): intentional duplicate of the previous feature —
            # kept because the model was trained with this vector length.
            True if re.search(r"[,,][^。.]+$", up["text"]) else False,
            # open bracket in up, closing bracket in down
            True if re.search(r"[\((][^\))]+$", up["text"])
            and re.search(r"[\))]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            # identical last-two characters (e.g. repeated list suffixes)
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            # relative height difference
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1] if tks_down and tks_up else False,
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            # single-token sides tagged as nouns by the tokenizer
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0
        ]
        return fea
  162. @staticmethod
  163. def sort_X_by_page(arr, threshold):
  164. # sort using y1 first and then x1
  165. arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
  166. for i in range(len(arr) - 1):
  167. for j in range(i, -1, -1):
  168. # restore the order using th
  169. if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
  170. and arr[j + 1]["top"] < arr[j]["top"] \
  171. and arr[j + 1]["page_number"] == arr[j]["page_number"]:
  172. tmp = arr[j]
  173. arr[j] = arr[j + 1]
  174. arr[j + 1] = tmp
  175. return arr
  176. def _has_color(self, o):
  177. if o.get("ncs", "") == "DeviceGray":
  178. if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
  179. o["non_stroking_color"][0] == 1:
  180. if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
  181. return False
  182. return True
    def _table_transformer_job(self, ZM):
        """Run table-structure recognition on every table layout region and
        tag the text boxes inside tables with their structural role.

        *ZM* is the zoom factor between page coordinates and page-image
        pixels. Detected components are stored in ``self.tb_cpns``; matching
        boxes in ``self.boxes`` receive R (row), H (header), C (column) and
        SP (spanning cell) annotations.
        """
        logging.debug("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]  # per-page table counts; cumsum'ed later to slice `recos`
        MARGIN = 10  # extra page units cropped around each table region
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                # convert page units to image pixels
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    # shift component coords from crop-local to page pixels
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    # back to page units, then to document-cumulative Y
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            # Collect components whose label matches `kwd`, Y-sorted and
            # cleaned against the text boxes.
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            # row membership
            ii = Recognizer.find_overlapped_with_threshold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            # header membership
            ii = Recognizer.find_overlapped_with_threshold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            # column membership (tightest horizontal fit)
            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            # spanning-cell membership; note this reuses the H_* keys
            ii = Recognizer.find_overlapped_with_threshold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None):
        """OCR one page image and merge pdfplumber-extracted chars into the
        detected text boxes; the resulting boxes are appended to self.boxes.

        pagenum is 1-based; *img* is the rendered page image; *chars* are the
        native PDF characters for the page; *ZM* is the image zoom factor;
        *device_id* selects the OCR device.
        """
        start = timer()
        # Detect text-line boxes on the page image.
        bxs = self.ocr.detect(np.array(img), device_id)
        logging.info(f"__ocr detecting boxes of a image cost ({timer() - start}s)")

        start = timer()
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        # Convert pixel quads back to page units and Y-sort; drop degenerate
        # boxes (x1 < x0 or bottom < top).
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "chars": [],
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[pagenum - 1] / 3
        )

        # merge chars in the same rect
        for c in chars:
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            # Reject chars whose height differs too much from the box.
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            bxs[ii]["chars"].append(c)

        # Assemble each box's text from its native chars (when available).
        for b in bxs:
            if not b["chars"]:
                del b["chars"]
                continue
            m_ht = np.mean([c["height"] for c in b["chars"]])
            for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
                # Keep a space only after latin/cyrillic/number/punctuation.
                if c["text"] == " " and b["text"]:
                    if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
                        b["text"] += " "
                else:
                    b["text"] += c["text"]
            del b["chars"]
        logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")

        start = timer()
        # Boxes with no native text fall back to OCR recognition on the crop.
        boxes_to_reg = []
        img_np = np.array(img)
        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["box_image"] = self.ocr.get_rotate_crop_image(img_np, np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32))
                boxes_to_reg.append(b)
            del b["txt"]
        texts = self.ocr.recognize_batch([b["box_image"] for b in boxes_to_reg], device_id)
        for i in range(len(boxes_to_reg)):
            boxes_to_reg[i]["text"] = texts[i]
            del boxes_to_reg[i]["box_image"]
        logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s")

        bxs = [b for b in bxs if b["text"]]
        # Seed this page's mean line height if it has not been set yet.
        if self.mean_height[pagenum - 1] == 0:
            self.mean_height[pagenum - 1] = np.median([b["bottom"] - b["top"]
                                                       for b in bxs])
        self.boxes.append(bxs)
  329. def _layouts_rec(self, ZM, drop=True):
  330. assert len(self.page_images) == len(self.boxes)
  331. self.boxes, self.page_layout = self.layouter(
  332. self.page_images, self.boxes, ZM, drop=drop)
  333. # cumlative Y
  334. for i in range(len(self.boxes)):
  335. self.boxes[i]["top"] += \
  336. self.page_cum_height[self.boxes[i]["page_number"] - 1]
  337. self.boxes[i]["bottom"] += \
  338. self.page_cum_height[self.boxes[i]["page_number"] - 1]
    def _text_merge(self):
        """Horizontally merge adjacent boxes that share the same layout
        element and sit on (nearly) the same line."""
        bxs = self.boxes

        def end_with(b, txt):
            # True when b's stripped text ends with `txt`.
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            # True when b's stripped text starts with any candidate in `txts`.
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent box with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") or b.get("layout_type", "") in ["table", "figure",
                                                                                                "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)
                   ) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge: extend the left box over the right one
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue
            # NOTE(review): everything below this point is unreachable — the
            # unconditional `i += 1; continue` above disables this looser
            # distance-threshold merge path. Kept as-is; presumably disabled
            # on purpose — confirm before re-enabling.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, ",") or start_with(b_, "(,"):
                    dis_thr = -8
                else:
                    i += 1
                    continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
    def _naive_vertical_merge(self):
        """Merge vertically adjacent boxes into paragraphs using simple
        punctuation/layout heuristics (no ML model)."""
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            # Drop trailing page-number / separator lines at page boundaries.
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            # Evidence FOR concatenation: unfinished punctuation at the seam.
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\",、‘“;:-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\",‘“、;:",
                b_["text"].strip() and b_["text"].strip()[0] in "。;?!?”)),,、:",
            ]
            # features for not concating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。?!?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features: boxes that do not overlap horizontally at all
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("{} {} {} {}".format(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                ))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
    def _concat_downward(self, concat_between_pages=True):
        """Sort boxes top-to-bottom.

        The XGBoost-driven downward concatenation that follows the early
        ``return`` is unreachable — apparently disabled on purpose, leaving
        only the Y-sort in effect. Kept as-is; confirm before re-enabling.
        """
        self.boxes = Recognizer.sort_Y_firstly(self.boxes, 0)
        return
        # ---- unreachable below this line (disabled ML concatenation) ----
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                # Greedily chain `up` with candidate boxes below it.
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != ",":
                        i += 1
                        continue
                    # skip page-number-like texts (e.g. "12/345") and blanks
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue
                    if not down["text"].strip() or not up["text"].strip():
                        i += 1
                        continue
                    # horizontally too far apart
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    # nearby text in the same layout element: chain directly
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue
                    # otherwise let the XGBoost model decide
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # keep a space between alphanumeric junctions
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
    def _filter_forpages(self):
        """Remove table-of-contents / acknowledgement sections and, failing
        that, pages dominated by dot-leader lines ("····")."""
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            # Look for a ToC-style heading (spaces removed, lowercased).
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            # English ToC entries start with latin text; affects how the
            # "prefix" used to find the first body line is built below.
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split()[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split()[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            # Drop every box up to the first one whose text repeats `prefix`
            # (i.e. the real start of the body the ToC entry points at).
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        # No ToC heading found: count dot-leader lines per page and drop
        # pages with more than 3 of them.
        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1
    def _merge_with_same_bullet(self):
        """Merge consecutive boxes that start with the same bullet character
        (joined with a newline), skipping latin letters and Chinese chars so
        that ordinary words are not treated as bullets."""
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            # NOTE(review): this letter set is the qwerty alphabet minus 'i' —
            # possibly intentional (roman-numeral "i" bullets?); confirm.
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
    def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False):
        """Pull table and figure boxes out of ``self.boxes``, attach their
        captions, crop their images, and build the results.

        Returns (depending on flags):
          - separate_tables_figures=False, need_position=False: list of
            (image, content) pairs;
          - separate_tables_figures=False, need_position=True: list of
            ((image, content), positions) pairs;
          - separate_tables_figures=True: the same but as two values, tables
            first and figures second.
        """
        tables = {}
        figures = {}
        # extract figure and table boxes, keyed by "page-layoutno"
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            # A caption/title/reference right after a table means that table
            # must not be merged with the next page's table.
            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption",
                                                                                                      "title",
                                                                                                      "figure caption",
                                                                                                      "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                # drop "data source: ..." attribution lines
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:: ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge table on different pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            # True when the two boxes overlap horizontally.
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                # NOTE(review): this log prints `tk` (the table key) but the
                # caption was attached to figure `fk` — likely should be fk.
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            self.boxes.pop(i)

        def cropout(bxs, ltype, poss):
            # Crop the image region covering `bxs` (single page: snap to the
            # matching layout region; multiple pages: recurse per page and
            # stack the crops vertically). Appends positions to `poss`.
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1},%s" %
                        (bxs[0].get(
                            "layoutno", "")))

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                if right < left:
                    right = left + 1
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        res = []
        positions = []
        figure_results = []
        figure_positions = []
        # crop figure out and add caption
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            if separate_tables_figures:
                figure_results.append(
                    (cropout(
                        bxs,
                        "figure", poss),
                     [txt]))
                figure_positions.append(poss)
            else:
                res.append(
                    (cropout(
                        bxs,
                        "figure", poss),
                     [txt]))
                positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        if separate_tables_figures:
            assert len(positions) + len(figure_positions) == len(res) + len(figure_results)
            if need_position:
                return list(zip(res, positions)), list(zip(figure_results, figure_positions))
            else:
                return res, figure_results
        else:
            assert len(positions) == len(res)
            if need_position:
                return list(zip(res, positions))
            else:
                return res
  806. def proj_match(self, line):
  807. if len(line) <= 2:
  808. return
  809. if re.match(r"[0-9 ().,%%+/-]+$", line):
  810. return False
  811. for p, j in [
  812. (r"第[零一二三四五六七八九十百]+章", 1),
  813. (r"第[零一二三四五六七八九十百]+[条节]", 2),
  814. (r"[零一二三四五六七八九十百]+[、  ]", 3),
  815. (r"[\((][零一二三四五六七八九十百]+[)\)]", 4),
  816. (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
  817. (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
  818. (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
  819. (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
  820. (r".{,48}[::??]$", 9),
  821. (r"[0-9]+)", 10),
  822. (r"[\((][0-9]+[)\)]", 11),
  823. (r"[零一二三四五六七八九十百]+是", 12),
  824. (r"[⚫•➢✓]", 12)
  825. ]:
  826. if re.match(p, line):
  827. return j
  828. return
  829. def _line_tag(self, bx, ZM):
  830. pn = [bx["page_number"]]
  831. top = bx["top"] - self.page_cum_height[pn[0] - 1]
  832. bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
  833. page_images_cnt = len(self.page_images)
  834. if pn[-1] - 1 >= page_images_cnt:
  835. return ""
  836. while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
  837. bott -= self.page_images[pn[-1] - 1].size[1] / ZM
  838. pn.append(pn[-1] + 1)
  839. if pn[-1] - 1 >= page_images_cnt:
  840. return ""
  841. return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
  842. .format("-".join([str(p) for p in pn]),
  843. bx["x0"], bx["x1"], top, bott)
def __filterout_scraps(self, boxes, ZM):
    """Drop leftover text fragments ("scraps") and serialize the rest.

    Consumes *boxes* (destructively popped) and greedily chains each
    seed box with nearby boxes on the same page via a recursive search;
    chains that are wide enough — or that start with a heading / title —
    are kept and joined into tagged text, the rest are logged and
    discarded.  Returns the surviving chains joined by blank lines.
    """
    def width(b):
        return b["x1"] - b["x0"]

    def height(b):
        return b["bottom"] - b["top"]

    def usefull(b):
        # A box is worth keeping if the layout model labelled it, if it
        # spans more than a third of the page width, or if it is taller
        # than the page's mean line height.
        if b.get("layout_type"):
            return True
        if width(
                b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
            return True
        if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
            return True
        return False

    res = []
    while boxes:
        lines = []
        widths = []
        # Page width / mean line height of the page the seed box lives on.
        pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
        mh = self.mean_height[boxes[0]["page_number"] - 1]
        # mj: seed looks like a heading (proj_match) or is a title region.
        mj = self.proj_match(
            boxes[0]["text"]) or boxes[0].get(
            "layout_type",
            "") == "title"

        def dfs(line, st):
            # Recursively chain *line* with a following box within the
            # next 20 candidates on the same page; chained boxes are
            # popped from *boxes* so they are not re-used as seeds.
            nonlocal mh, pw, lines, widths
            lines.append(line)
            widths.append(width(line))
            mmj = self.proj_match(
                line["text"]) or line.get(
                "layout_type",
                "") == "title"
            for i in range(st + 1, min(st + 20, len(boxes))):
                if (boxes[i]["page_number"] - line["page_number"]) > 0:
                    break
                # Stop on a large vertical gap, unless the line is a heading.
                if not mmj and self._y_dis(
                        line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                    break
                if not usefull(boxes[i]):
                    continue
                if mmj or \
                        (self._x_dis(boxes[i], line) < pw / 10): \
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                    # concat following
                    dfs(boxes[i], i)
                    boxes.pop(i)
                    break

        try:
            if usefull(boxes[0]):
                dfs(boxes[0], 0)
            else:
                logging.debug("WASTE: " + boxes[0]["text"])
        except Exception:
            # Best-effort: a malformed box must not abort the whole pass.
            pass
        boxes.pop(0)
        mw = np.mean(widths)
        # Keep the chain if it starts with a heading or is wide enough.
        if mj or mw / pw >= 0.35 or mw > 200:
            res.append(
                "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
        else:
            logging.debug("REMOVED: " +
                          "<<".join([c["text"] for c in lines]))

    return "\n\n".join(res)
  907. @staticmethod
  908. def total_page_number(fnm, binary=None):
  909. try:
  910. with sys.modules[LOCK_KEY_pdfplumber]:
  911. pdf = pdfplumber.open(
  912. fnm) if not binary else pdfplumber.open(BytesIO(binary))
  913. total_page = len(pdf.pages)
  914. pdf.close()
  915. return total_page
  916. except Exception:
  917. logging.exception("total_page_number")
  918. def __images__(self, fnm, zoomin=3, page_from=0,
  919. page_to=299, callback=None):
  920. self.lefted_chars = []
  921. self.mean_height = []
  922. self.mean_width = []
  923. self.boxes = []
  924. self.garbages = {}
  925. self.page_cum_height = [0]
  926. self.page_layout = []
  927. self.page_from = page_from
  928. start = timer()
  929. try:
  930. with sys.modules[LOCK_KEY_pdfplumber]:
  931. with (pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))) as pdf:
  932. self.pdf = pdf
  933. self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).annotated for i, p in
  934. enumerate(self.pdf.pages[page_from:page_to])]
  935. try:
  936. self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
  937. except Exception as e:
  938. logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
  939. self.page_chars = [[] for _ in range(page_to - page_from)] # If failed to extract, using empty list instead.
  940. self.total_page = len(self.pdf.pages)
  941. except Exception:
  942. logging.exception("RAGFlowPdfParser __images__")
  943. logging.info(f"__images__ dedupe_chars cost {timer() - start}s")
  944. self.outlines = []
  945. try:
  946. with (pdf2_read(fnm if isinstance(fnm, str)
  947. else BytesIO(fnm))) as pdf:
  948. self.pdf = pdf
  949. outlines = self.pdf.outline
  950. def dfs(arr, depth):
  951. for a in arr:
  952. if isinstance(a, dict):
  953. self.outlines.append((a["/Title"], depth))
  954. continue
  955. dfs(a, depth + 1)
  956. dfs(outlines, 0)
  957. except Exception as e:
  958. logging.warning(f"Outlines exception: {e}")
  959. if not self.outlines:
  960. logging.warning("Miss outlines")
  961. logging.debug("Images converted.")
  962. self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
  963. random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
  964. range(len(self.page_chars))]
  965. if sum([1 if e else 0 for e in self.is_english]) > len(
  966. self.page_images) / 2:
  967. self.is_english = True
  968. else:
  969. self.is_english = False
  970. async def __img_ocr(i, id, img, chars, limiter):
  971. j = 0
  972. while j + 1 < len(chars):
  973. if chars[j]["text"] and chars[j + 1]["text"] \
  974. and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
  975. and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
  976. chars[j]["width"]) / 2:
  977. chars[j]["text"] += " "
  978. j += 1
  979. if limiter:
  980. async with limiter:
  981. await trio.to_thread.run_sync(lambda: self.__ocr(i + 1, img, chars, zoomin, id))
  982. else:
  983. self.__ocr(i + 1, img, chars, zoomin, id)
  984. if callback and i % 6 == 5:
  985. callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
  986. async def __img_ocr_launcher():
  987. def __ocr_preprocess():
  988. chars = self.page_chars[i] if not self.is_english else []
  989. self.mean_height.append(
  990. np.median(sorted([c["height"] for c in chars])) if chars else 0
  991. )
  992. self.mean_width.append(
  993. np.median(sorted([c["width"] for c in chars])) if chars else 8
  994. )
  995. self.page_cum_height.append(img.size[1] / zoomin)
  996. return chars
  997. if self.parallel_limiter:
  998. async with trio.open_nursery() as nursery:
  999. for i, img in enumerate(self.page_images):
  1000. chars = __ocr_preprocess()
  1001. nursery.start_soon(__img_ocr, i, i % PARALLEL_DEVICES, img, chars,
  1002. self.parallel_limiter[i % PARALLEL_DEVICES])
  1003. await trio.sleep(0.1)
  1004. else:
  1005. for i, img in enumerate(self.page_images):
  1006. chars = __ocr_preprocess()
  1007. await __img_ocr(i, 0, img, chars, None)
  1008. start = timer()
  1009. trio.run(__img_ocr_launcher)
  1010. logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")
  1011. if not self.is_english and not any(
  1012. [c for c in self.page_chars]) and self.boxes:
  1013. bxes = [b for bxs in self.boxes for b in bxs]
  1014. self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
  1015. "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
  1016. logging.debug("Is it English:", self.is_english)
  1017. self.page_cum_height = np.cumsum(self.page_cum_height)
  1018. assert len(self.page_cum_height) == len(self.page_images) + 1
  1019. if len(self.boxes) == 0 and zoomin < 9:
  1020. self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
  1021. def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
  1022. self.__images__(fnm, zoomin)
  1023. self._layouts_rec(zoomin)
  1024. self._table_transformer_job(zoomin)
  1025. self._text_merge()
  1026. self._concat_downward()
  1027. self._filter_forpages()
  1028. tbls = self._extract_table_figure(
  1029. need_image, zoomin, return_html, False)
  1030. return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
  1031. @staticmethod
  1032. def remove_tag(txt):
  1033. return re.sub(r"@@[\t0-9.-]+?##", "", txt)
def crop(self, text, ZM=3, need_position=False):
    """Crop the page-image regions referenced by position tags in *text*.

    *text* carries tags of the form ``@@page[-page2]  x0  x1  top  bottom##``
    (tab-separated, as produced by _line_tag).  Every referenced region is
    cropped out of self.page_images and stacked vertically on one canvas,
    with a dimmed context strip prepended and appended.

    Args:
        text: Tagged text to locate.
        ZM: Zoom factor the page images were rendered at.
        need_position: When True, also return the page-relative positions
            of the real (non-context) regions.

    Returns:
        A PIL image, or (image, positions) when need_position is True;
        None / (None, None) when no tag is found.
    """
    imgs = []
    poss = []
    # Parse each tag into ([0-based page indices], left, right, top, bottom).
    for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
        pn, left, right, top, bottom = tag.strip(
            "#").strip("@").split("\t")
        left, right, top, bottom = float(left), float(
            right), float(top), float(bottom)
        poss.append(([int(p) - 1 for p in pn.split("-")],
                     left, right, top, bottom))
    if not poss:
        if need_position:
            return None, None
        return

    max_width = max(
        np.max([right - left for (_, left, right, _, _) in poss]), 6)
    GAP = 6
    # Prepend a strip of up to 120 units above the first region and append
    # one below the last, to give the crop some visual context.
    pos = poss[0]
    poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
        0, pos[3] - 120), max(pos[3] - GAP, 0)))
    pos = poss[-1]
    poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                 min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

    positions = []
    for ii, (pns, left, right, top, bottom) in enumerate(poss):
        right = left + max_width
        bottom *= ZM
        # A tag may span several pages; extend bottom by the full height of
        # every continuation page, then peel off one crop per page below.
        for pn in pns[1:]:
            bottom += self.page_images[pn - 1].size[1]
        imgs.append(
            self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                           right *
                                           ZM, min(
                                               bottom, self.page_images[pns[0]].size[1])
                                           ))
        )
        # Entries 0 and len-1 are the synthetic context strips; only real
        # regions contribute to the returned positions.
        if 0 < ii < len(poss) - 1:
            positions.append((pns[0] + self.page_from, left, right, top, min(
                bottom, self.page_images[pns[0]].size[1]) / ZM))
        bottom -= self.page_images[pns[0]].size[1]
        for pn in pns[1:]:
            imgs.append(
                self.page_images[pn].crop((left * ZM, 0,
                                           right * ZM,
                                           min(bottom,
                                               self.page_images[pn].size[1])
                                           ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pn + self.page_from, left, right, 0, min(
                    bottom, self.page_images[pn].size[1]) / ZM))
            bottom -= self.page_images[pn].size[1]

    if not imgs:
        if need_position:
            return None, None
        return
    # Stack all crops vertically on a light-grey canvas, GAP pixels apart.
    height = 0
    for img in imgs:
        height += img.size[1] + GAP
    height = int(height)
    width = int(np.max([i.size[0] for i in imgs]))
    pic = Image.new("RGB",
                    (width, height),
                    (245, 245, 245))
    height = 0
    for ii, img in enumerate(imgs):
        if ii == 0 or ii + 1 == len(imgs):
            # Dim the two context strips so the actual content stands out.
            img = img.convert('RGBA')
            overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            img = Image.alpha_composite(img, overlay).convert("RGB")
        pic.paste(img, (0, int(height)))
        height += img.size[1] + GAP
    if need_position:
        return pic, positions
    return pic
  1110. def get_position(self, bx, ZM):
  1111. poss = []
  1112. pn = bx["page_number"]
  1113. top = bx["top"] - self.page_cum_height[pn - 1]
  1114. bott = bx["bottom"] - self.page_cum_height[pn - 1]
  1115. poss.append((pn, bx["x0"], bx["x1"], top, min(
  1116. bott, self.page_images[pn - 1].size[1] / ZM)))
  1117. while bott * ZM > self.page_images[pn - 1].size[1]:
  1118. bott -= self.page_images[pn - 1].size[1] / ZM
  1119. top = 0
  1120. pn += 1
  1121. poss.append((pn, bx["x0"], bx["x1"], top, min(
  1122. bott, self.page_images[pn - 1].size[1] / ZM)))
  1123. return poss
  1124. class PlainParser:
  1125. def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
  1126. self.outlines = []
  1127. lines = []
  1128. try:
  1129. self.pdf = pdf2_read(
  1130. filename if isinstance(
  1131. filename, str) else BytesIO(filename))
  1132. for page in self.pdf.pages[from_page:to_page]:
  1133. lines.extend([t for t in page.extract_text().split("\n")])
  1134. outlines = self.pdf.outline
  1135. def dfs(arr, depth):
  1136. for a in arr:
  1137. if isinstance(a, dict):
  1138. self.outlines.append((a["/Title"], depth))
  1139. continue
  1140. dfs(a, depth + 1)
  1141. dfs(outlines, 0)
  1142. except Exception:
  1143. logging.exception("Outlines exception")
  1144. if not self.outlines:
  1145. logging.warning("Miss outlines")
  1146. return [(line, "") for line in lines], []
  1147. def crop(self, ck, need_position):
  1148. raise NotImplementedError
  1149. @staticmethod
  1150. def remove_tag(txt):
  1151. raise NotImplementedError
  1152. class VisionParser(RAGFlowPdfParser):
  1153. def __init__(self, vision_model, *args, **kwargs):
  1154. super().__init__(*args, **kwargs)
  1155. self.vision_model = vision_model
  1156. def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
  1157. try:
  1158. with sys.modules[LOCK_KEY_pdfplumber]:
  1159. self.pdf = pdfplumber.open(fnm) if isinstance(
  1160. fnm, str) else pdfplumber.open(BytesIO(fnm))
  1161. self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
  1162. enumerate(self.pdf.pages[page_from:page_to])]
  1163. self.total_page = len(self.pdf.pages)
  1164. except Exception:
  1165. self.page_images = None
  1166. self.total_page = 0
  1167. logging.exception("VisionParser __images__")
  1168. def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
  1169. callback = kwargs.get("callback", lambda prog, msg: None)
  1170. self.__images__(fnm=filename, zoomin=3, page_from=from_page, page_to=to_page, **kwargs)
  1171. total_pdf_pages = self.total_page
  1172. start_page = max(0, from_page)
  1173. end_page = min(to_page, total_pdf_pages)
  1174. all_docs = []
  1175. for idx, img_binary in enumerate(self.page_images or []):
  1176. pdf_page_num = idx # 0-based
  1177. if pdf_page_num < start_page or pdf_page_num >= end_page:
  1178. continue
  1179. docs = picture_vision_llm_chunk(
  1180. binary=img_binary,
  1181. vision_model=self.vision_model,
  1182. prompt=vision_llm_describe_prompt(page=pdf_page_num+1),
  1183. callback=callback,
  1184. )
  1185. if docs:
  1186. all_docs.append(docs)
  1187. return [(doc, "") for doc in all_docs], []
if __name__ == "__main__":
    # This module is import-only; there is no CLI entry point.
    pass