pdf_parser.py
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import os
import random
from timeit import default_timer as timer
from io import BytesIO
from copy import deepcopy
import re

import xgboost as xgb
import pdfplumber
import numpy as np
from PIL import Image
from pypdf import PdfReader as pdf2_read
from huggingface_hub import snapshot_download

from api import settings
from api.utils.file_utils import get_project_base_directory
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import rag_tokenizer
class RAGFlowPdfParser:
    def __init__(self):
        self.ocr = OCR()
        # "model_speciess" may be set by a subclass before this __init__ runs
        if hasattr(self, "model_speciess"):
            self.layouter = LayoutRecognizer("layout." + self.model_speciess)
        else:
            self.layouter = LayoutRecognizer("layout")
        self.tbl_det = TableStructureRecognizer()

        self.updown_cnt_mdl = xgb.Booster()
        if not settings.LIGHTEN:
            try:
                import torch
                if torch.cuda.is_available():
                    self.updown_cnt_mdl.set_param({"device": "cuda"})
            except Exception:
                logging.exception("RAGFlowPdfParser __init__")
        try:
            model_dir = os.path.join(
                get_project_base_directory(),
                "rag/res/deepdoc")
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))
        except Exception:
            model_dir = snapshot_download(
                repo_id="InfiniFlow/text_concat_xgb_v1.0",
                local_dir=os.path.join(get_project_base_directory(), "rag/res/deepdoc"),
                local_dir_use_symlinks=False)
            self.updown_cnt_mdl.load_model(os.path.join(
                model_dir, "updown_concat_xgb.model"))

        self.page_from = 0
        """
        If you have trouble downloading HuggingFace models, -_^ this might help!!
        For Linux:
        export HF_ENDPOINT=https://hf-mirror.com
        For Windows:
        Good luck
        ^_-
        """
    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // max(len(c["text"]), 1)

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(self, a, b):
        return (b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是  ]",
            r"[\(（][零一二三四五六七八九十百]+[）\)]",
            r"[\(（][0-9]+[）\)]",
            r"[0-9]+(、|\.[  ]|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[  ])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])
    def _updown_concat_features(self, up, down):
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        LEN = 6
        tks_down = rag_tokenizer.tokenize(down["text"][:LEN]).split()
        tks_up = rag_tokenizer.tokenize(up["text"][-LEN:]).split()
        tks_all = up["text"][-LEN:].strip() \
            + (" " if re.match(r"[a-zA-Z0-9]+",
                               up["text"][-1] + down["text"][0]) else "") \
            + down["text"][:LEN].strip()
        tks_all = rag_tokenizer.tokenize(tks_all).split()
        fea = [
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            True if re.search(
                r"([。？！；!?;+)）]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[，：‘“、0-9（+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\]，。；：’”？！》】）-])",
                down["text"]) else False,
            True if re.match(r"[\(（][^\(\)（）]+[）\)]$", up["text"]) else False,
            # the next two features are identical; presumably the trained model
            # expects this exact vector length, so both are kept
            True if re.search(r"[,，][^。.]+$", up["text"]) else False,
            True if re.search(r"[,，][^。.]+$", up["text"]) else False,
            True if re.search(r"[\(（][^）)]+$", up["text"])
            and re.search(r"[）)]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            up["text"].strip()[-2:] == down["text"].strip()[-2:]
            if len(up["text"].strip()) > 1 and len(down["text"].strip()) > 1 else False,
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) /
            min(self.__height(up), self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1] if tks_down and tks_up else False,
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            len(tks_down) == 1 and rag_tokenizer.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and rag_tokenizer.tag(tks_up[0]).find("n") >= 0,
        ]
        return fea
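
    # A sketch of how the feature vector above is consumed; this is the call
    # pattern used in _concat_downward below, not a separate API:
    #
    #   fea = self._updown_concat_features(up, down)
    #   if self.updown_cnt_mdl.predict(xgb.DMatrix([fea]))[0] > 0.5:
    #       ...  # treat "down" as the continuation of "up" and merge them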
    @staticmethod
    def sort_X_by_page(arr, threshold):
        # sort by page number, then x0, then top
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the vertical order for boxes in the same column
                # (x0 within the threshold) on the same page
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threshold \
                        and arr[j + 1]["top"] < arr[j]["top"] \
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    arr[j], arr[j + 1] = arr[j + 1], arr[j]
        return arr
    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True
    def _table_transformer_job(self, ZM):
        logging.debug("Table processing...")
        imgs, pos = [], []
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                    tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tags to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)

        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(
                b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii
    def __ocr(self, pagenum, img, chars, ZM=3):
        start = timer()
        bxs = self.ocr.detect(np.array(img))
        logging.info(f"__ocr detecting boxes of an image cost ({timer() - start}s)")

        start = timer()
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_Y_firstly(
                chars, self.mean_height[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", bxs[ii]["text"][-1]):
                    bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]
        logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")

        start = timer()
        boxes_to_reg = []
        img_np = np.array(img)
        for b in bxs:
            if not b["text"]:
                left, right, top, bott = b["x0"] * ZM, b["x1"] * \
                    ZM, b["top"] * ZM, b["bottom"] * ZM
                b["box_image"] = self.ocr.get_rotate_crop_image(
                    img_np, np.array([[left, top], [right, top], [right, bott], [left, bott]], dtype=np.float32))
                boxes_to_reg.append(b)
            del b["txt"]
        texts = self.ocr.recognize_batch([b["box_image"] for b in boxes_to_reg])
        for i in range(len(boxes_to_reg)):
            boxes_to_reg[i]["text"] = texts[i]
            del boxes_to_reg[i]["box_image"]
        logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s")

        bxs = [b for b in bxs if b["text"]]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)
    def _layouts_rec(self, ZM, drop=True):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(
            self.page_images, self.boxes, ZM, drop=drop)
        # cumulative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
    def _text_merge(self):
        # merge adjacent boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.endswith(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.startswith(t.strip()) for t in txts])

        # horizontally merge adjacent boxes with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1") \
                    or b.get("layout_type", "") in ["table", "figure", "equation"]:
                i += 1
                continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 3:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
            continue
            # NOTE: the code below is unreachable — the `continue` above always
            # runs first; it looks like an earlier distance-based merge strategy.
            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, "，") or start_with(b_, "（，"):
                    dis_thr = -8
                else:
                    i += 1
                    continue
            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs
    def _naive_vertical_merge(self):
        bxs = Recognizer.sort_Y_firstly(
            self.boxes, np.median(
                self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(
                    r"[0-9 •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            if not b["text"].strip():
                bxs.pop(i)
                continue
            concatting_feats = [
                b["text"].strip()[-1] in ",;:'\"，、‘“；：-",
                len(b["text"].strip()) > 1 and b["text"].strip(
                )[-2] in ",;:'\"，‘“、；：",
                b_["text"].strip() and b_["text"].strip()[0] in "。；？！?”）),，、：",
            ]
            # features for not concatenating
            feats = [
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                b["text"].strip()[-1] in "。？！?",
                self.is_english and b["text"].strip()[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] -
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4,
            ]
            # split features
            detach_feats = [b["x1"] < b_["x0"],
                            b["x0"] > b_["x1"]]
            if (any(feats) and not any(concatting_feats)) or any(detach_feats):
                logging.debug("{} {} {} {}".format(
                    b["text"],
                    b_["text"],
                    any(feats),
                    any(concatting_feats),
                ))
                i += 1
                continue
            # merge up and down
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs
    def _concat_downward(self, concat_between_pages=True):
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != "，":
                        i += 1
                        continue
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]) \
                            or not down["text"].strip():
                        i += 1
                        continue
                    if not down["text"].strip() or not up["text"].strip():
                        i += 1
                        continue
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$",
                            re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(
                r"[0-9a-zA-Z :'.-]{5,}",
                self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes):
                break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                self.boxes[i]["text"].strip().split()[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes):
                    break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(
                    self.boxes[i]["text"].strip().split()[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix:
                break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j):
                    self.boxes.pop(i)
                break
        if findit:
            return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"] - 1] += 1
        page_dirty = set([i + 1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty:
            return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1
    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i + 1)
                continue
            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or rag_tokenizer.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)
    def _extract_table_figure(self, need_image, ZM,
                              return_html, need_position):
        tables = {}
        figures = {}
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            if "layoutno" not in self.boxes[i]:
                i += 1
                continue
            lout_no = str(self.boxes[i]["page_number"]) + \
                "-" + str(self.boxes[i]["layoutno"])
            if TableStructureRecognizer.is_caption(self.boxes[i]) \
                    or self.boxes[i]["layout_type"] in ["table caption",
                                                        "title",
                                                        "figure caption",
                                                        "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if self.boxes[i]["layout_type"] == "table":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in tables:
                    tables[lout_no] = []
                tables[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and self.boxes[i]["layout_type"] == "figure":
                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
                    self.boxes.pop(i)
                    continue
                if lout_no not in figures:
                    figures[lout_no] = []
                figures[lout_no].append(self.boxes[i])
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge tables on different pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            # mh = self.mean_height[c["page_number"]-1]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            def nearest(tbls):
                nonlocal c
                mink = ""
                minv = 1000000000
                for k, bxs in tbls.items():
                    for b in bxs:
                        if b.get("layout_type", "").find("caption") >= 0:
                            continue
                        y_dis = self._y_dis(c, b)
                        x_dis = self._x_dis(
                            c, b) if not x_overlapped(
                            c, b) else 0
                        dis = y_dis * y_dis + x_dis * x_dis
                        if dis < minv:
                            mink = k
                            minv = dis
                return mink, minv

            tk, tv = nearest(tables)
            fk, fv = nearest(figures)
            # if min(tv, fv) > 2000:
            #     i += 1
            #     continue
            if tv < fv and tk:
                tables[tk].insert(0, c)
                logging.debug(
                    "TABLE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    tk)
            elif fk:
                figures[fk].insert(0, c)
                logging.debug(
                    "FIGURE:" +
                    self.boxes[i]["text"] +
                    "; Cap: " +
                    fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                louts = [layout for layout in self.page_layout[pn] if layout["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        f"Missing layout match: {pn + 1}, {bxs[0].get('layoutno', '')}")

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                if right < left:
                    right = left + 1
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figure out and add caption
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
            if not txt:
                continue
            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []

            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)
        if need_position:
            return list(zip(res, positions))
        return res
    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、  ]", 3),
            (r"[\(（][零一二三四五六七八九十百]+[）\)]", 4),
            (r"[0-9]+(、|\.[  ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[.  ]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[  ]|[^0-9])", 8),
            (r".{,48}[：:？?]$", 9),
            (r"[0-9]+）", 10),
            (r"[\(（][0-9]+[）\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return
    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        page_images_cnt = len(self.page_images)
        if pn[-1] - 1 >= page_images_cnt:
            return ""
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)
            if pn[-1] - 1 >= page_images_cnt:
                return ""

        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)
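
    # For reference, a tag produced by _line_tag looks like this (values are
    # illustrative, not from a real document):
    #
    #   "@@2-3\t60.8\t531.4\t1023.1\t1058.2##"
    #
    # i.e. page number(s), x0, x1, top, bottom; remove_tag() strips these tags
    # back out of the text and crop() parses them to cut page images.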
    def __filterout_scraps(self, boxes, ZM):

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def useful(b):
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break
                    if not useful(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10):
                        # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            try:
                if useful(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception:
                pass
            boxes.pop(0)

            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append(
                    "\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)
    @staticmethod
    def total_page_number(fnm, binary=None):
        try:
            pdf = pdfplumber.open(
                fnm) if not binary else pdfplumber.open(BytesIO(binary))
            return len(pdf.pages)
        except Exception:
            logging.exception("total_page_number")
    def __images__(self, fnm, zoomin=3, page_from=0,
                   page_to=299, callback=None):
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        start = timer()
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(
                fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                                enumerate(self.pdf.pages[page_from:page_to])]
            try:
                self.page_chars = [[c for c in page.dedupe_chars().chars if self._has_color(c)]
                                   for page in self.pdf.pages[page_from:page_to]]
            except Exception as e:
                logging.warning(f"Failed to extract characters for pages {page_from}-{page_to}: {str(e)}")
                # if extraction failed, fall back to empty char lists
                self.page_chars = [[] for _ in range(page_to - page_from)]
            self.total_page = len(self.pdf.pages)
        except Exception:
            logging.exception("RAGFlowPdfParser __images__")
        logging.info(f"__images__ dedupe_chars cost {timer() - start}s")

        self.outlines = []
        try:
            self.pdf = pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm))
            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception as e:
            logging.warning(f"Outlines exception: {e}")
        if not self.outlines:
            logging.warning("Missing outlines")

        logging.debug("Images converted.")
        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
            random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in
            range(len(self.page_chars))]
        if sum([1 if e else 0 for e in self.is_english]) > len(
                self.page_images) / 2:
            self.is_english = True
        else:
            self.is_english = False

        start = timer()
        for i, img in enumerate(self.page_images):
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median(sorted([c["height"] for c in chars])) if chars else 0
            )
            self.mean_width.append(
                np.median(sorted([c["width"] for c in chars])) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            j = 0
            while j + 1 < len(chars):
                if chars[j]["text"] and chars[j + 1]["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
                                                                       chars[j]["width"]) / 2:
                    chars[j]["text"] += " "
                j += 1

            self.__ocr(i + 1, img, chars, zoomin)
            if callback and i % 6 == 5:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
        logging.info(f"__images__ {len(self.page_images)} pages cost {timer() - start}s")

        if not self.is_english and not any(
                [c for c in self.page_chars]) and self.boxes:
            bxes = [b for bxs in self.boxes for b in bxs]
            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}",
                                        "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
        logging.debug("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1
        if len(self.boxes) == 0 and zoomin < 9:
            self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tbls = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
    def remove_tag(self, txt):
        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
    def crop(self, text, ZM=3, need_position=False):
        imgs = []
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            pn, left, right, top, bottom = tag.strip(
                "#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(
                right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")],
                         left, right, top, bottom))
        if not poss:
            if need_position:
                return None, None
            return

        max_width = max(
            np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6
        pos = poss[0]
        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(
            0, pos[3] - 120), max(pos[3] - GAP, 0)))
        pos = poss[-1]
        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP),
                     min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]
            imgs.append(
                self.page_images[pns[0]].crop((left * ZM, top * ZM,
                                               right * ZM,
                                               min(bottom, self.page_images[pns[0]].size[1])
                                               ))
            )
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, left, right, top, min(
                    bottom, self.page_images[pns[0]].size[1]) / ZM))
            bottom -= self.page_images[pns[0]].size[1]
            for pn in pns[1:]:
                imgs.append(
                    self.page_images[pn].crop((left * ZM, 0,
                                               right * ZM,
                                               min(bottom,
                                                   self.page_images[pn].size[1])
                                               ))
                )
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, left, right, 0, min(
                        bottom, self.page_images[pn].size[1]) / ZM))
                bottom -= self.page_images[pn].size[1]

        if not imgs:
            if need_position:
                return None, None
            return
        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        height = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic
    def get_position(self, bx, ZM):
        poss = []
        pn = bx["page_number"]
        top = bx["top"] - self.page_cum_height[pn - 1]
        bott = bx["bottom"] - self.page_cum_height[pn - 1]
        poss.append((pn, bx["x0"], bx["x1"], top, min(
            bott, self.page_images[pn - 1].size[1] / ZM)))
        while bott * ZM > self.page_images[pn - 1].size[1]:
            bott -= self.page_images[pn - 1].size[1] / ZM
            top = 0
            pn += 1
            poss.append((pn, bx["x0"], bx["x1"], top, min(
                bott, self.page_images[pn - 1].size[1] / ZM)))
        return poss
class PlainParser(object):
    def __call__(self, filename, from_page=0, to_page=100000, **kwargs):
        self.outlines = []
        lines = []
        try:
            self.pdf = pdf2_read(
                filename if isinstance(
                    filename, str) else BytesIO(filename))
            for page in self.pdf.pages[from_page:to_page]:
                lines.extend([t for t in page.extract_text().split("\n")])

            outlines = self.pdf.outline

            def dfs(arr, depth):
                for a in arr:
                    if isinstance(a, dict):
                        self.outlines.append((a["/Title"], depth))
                        continue
                    dfs(a, depth + 1)

            dfs(outlines, 0)
        except Exception:
            logging.exception("Outlines exception")
        if not self.outlines:
            logging.warning("Missing outlines")

        return [(line, "") for line in lines], []

    def crop(self, ck, need_position):
        raise NotImplementedError

    @staticmethod
    def remove_tag(txt):
        raise NotImplementedError
if __name__ == "__main__":
    pass
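    # Minimal usage sketch (a hypothetical example: it assumes a RAGFlow
    # environment with the deepdoc model resources available, and a local
    # file named "sample.pdf"):
    #
    #   parser = RAGFlowPdfParser()
    #   text, tbls = parser("sample.pdf", need_image=False, return_html=True)
    #   print(parser.remove_tag(text))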