
naive.py

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import re
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer

from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from tika import parser

from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table


class Docx(DocxParser):
    def __init__(self):
        pass

    def get_picture(self, document, paragraph):
        img = paragraph._element.xpath('.//pic:pic')
        if not img:
            return None
        img = img[0]
        embed = img.xpath('.//a:blip/@r:embed')
        if not embed:
            return None
        embed = embed[0]
        try:
            related_part = document.part.related_parts[embed]
            image_blob = related_part.image.blob
        except UnrecognizedImageError:
            logging.info("Unrecognized image format. Skipping image.")
            return None
        except UnexpectedEndOfFileError:
            logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
            return None
        except InvalidImageStreamError:
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
            return None
        except UnicodeDecodeError:
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
            return None
        except Exception:
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
            return None
        try:
            image = Image.open(BytesIO(image_blob)).convert('RGB')
            return image
        except Exception:
            return None

    def __clean(self, line):
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __get_nearest_title(self, table_index, filename):
        """Get the hierarchical title structure before the table"""
        import re
        from docx.text.paragraph import Paragraph

        titles = []
        blocks = []

        # Get document name from filename parameter
        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
        if not doc_name:
            doc_name = "Untitled Document"

        # Collect all document blocks while maintaining document order
        try:
            # Iterate through all paragraphs and tables in document order
            for i, block in enumerate(self.doc._element.body):
                if block.tag.endswith('p'):  # Paragraph
                    p = Paragraph(block, self.doc)
                    blocks.append(('p', i, p))
                elif block.tag.endswith('tbl'):  # Table
                    blocks.append(('t', i, None))  # Table object will be retrieved later
        except Exception as e:
            logging.error(f"Error collecting blocks: {e}")
            return ""

        # Find the target table position
        target_table_pos = -1
        table_count = 0
        for i, (block_type, pos, _) in enumerate(blocks):
            if block_type == 't':
                if table_count == table_index:
                    target_table_pos = pos
                    break
                table_count += 1

        if target_table_pos == -1:
            return ""  # Target table not found

        # Find the nearest heading paragraph in reverse order
        nearest_title = None
        for i in range(len(blocks) - 1, -1, -1):
            block_type, pos, block = blocks[i]
            if pos >= target_table_pos:  # Skip blocks after the table
                continue
            if block_type != 'p':
                continue
            if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                try:
                    level_match = re.search(r"(\d+)", block.style.name)
                    if level_match:
                        level = int(level_match.group(1))
                        if level <= 7:  # Support up to 7 heading levels
                            title_text = block.text.strip()
                            if title_text:  # Avoid empty titles
                                nearest_title = (level, title_text)
                                break
                except Exception as e:
                    logging.error(f"Error parsing heading level: {e}")

        if nearest_title:
            # Add current title
            titles.append(nearest_title)
            current_level = nearest_title[0]

            # Find all parent headings, allowing cross-level search
            while current_level > 1:
                found = False
                for i in range(len(blocks) - 1, -1, -1):
                    block_type, pos, block = blocks[i]
                    if pos >= target_table_pos:  # Skip blocks after the table
                        continue
                    if block_type != 'p':
                        continue
                    if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                        try:
                            level_match = re.search(r"(\d+)", block.style.name)
                            if level_match:
                                level = int(level_match.group(1))
                                # Find any heading with a higher level
                                if level < current_level:
                                    title_text = block.text.strip()
                                    if title_text:  # Avoid empty titles
                                        titles.append((level, title_text))
                                        current_level = level
                                        found = True
                                        break
                        except Exception as e:
                            logging.error(f"Error parsing parent heading: {e}")
                if not found:  # Break if no parent heading is found
                    break

            # Sort by level (ascending, from highest to lowest)
            titles.sort(key=lambda x: x[0])
            # Organize titles (from highest to lowest)
            hierarchy = [doc_name] + [t[1] for t in titles]
            return " > ".join(hierarchy)

        return ""

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
        last_image = None
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            if from_page <= pn < to_page:
                if p.text.strip():
                    if p.style and p.style.name == 'Caption':
                        former_image = None
                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
                            former_image = lines[-1][1].pop()
                        elif last_image:
                            former_image = last_image
                            last_image = None
                        lines.append((self.__clean(p.text), [former_image], p.style.name))
                    else:
                        current_image = self.get_picture(self.doc, p)
                        image_list = [current_image]
                        if last_image:
                            image_list.insert(0, last_image)
                            last_image = None
                        lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
                else:
                    if current_image := self.get_picture(self.doc, p):
                        if lines:
                            lines[-1][1].append(current_image)
                        else:
                            last_image = current_image
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1

        new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]

        tbls = []
        for i, tb in enumerate(self.doc.tables):
            title = self.__get_nearest_title(i, filename)
            html = "<table>"
            if title:
                html += f"<caption>Table Location: {title}</caption>"
            for r in tb.rows:
                html += "<tr>"
                i = 0
                try:
                    while i < len(r.cells):
                        span = 1
                        c = r.cells[i]
                        for j in range(i + 1, len(r.cells)):
                            if c.text == r.cells[j].text:
                                span += 1
                                i = j
                            else:
                                break
                        i += 1
                        html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                except Exception as e:
                    logging.warning(f"Error parsing table, ignore: {e}")
                html += "</tr>"
            html += "</table>"
            tbls.append(((None, html), ""))
        return new_line, tbls
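
# Descriptive note (added comment): Docx() returns a pair. The first element is a
# list of (paragraph_text, merged_image_or_None) tuples in document order; the second
# is a list of ((None, table_html), "") tuples. In chunk() below, the first is fed to
# naive_merge_docx() and the second to tokenize_table().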


class Pdf(PdfParser):
    def __init__(self):
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
        start = timer()
        first_start = start
        callback(msg="OCR started")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))

        start = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._text_merge()
        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))

        if separate_tables_figures:
            tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
            self._concat_downward()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
        else:
            tbls = self._extract_table_figure(True, zoomin, True, True)
            # self._naive_vertical_merge()
            self._concat_downward()
            # self._filter_forpages()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls


class Markdown(MarkdownParser):
    def get_picture_urls(self, sections):
        if not sections:
            return []
        if isinstance(sections, type("")):
            text = sections
        elif isinstance(sections[0], type("")):
            text = sections[0]
        else:
            return []

        from bs4 import BeautifulSoup
        html_content = markdown(text)
        soup = BeautifulSoup(html_content, 'html.parser')
        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
        return html_images

    def get_pictures(self, text):
        """Download and open all images from markdown text."""
        import requests
        image_urls = self.get_picture_urls(text)
        images = []
        # Find all image URLs in text
        for url in image_urls:
            try:
                # check if the url is a local file or a remote URL
                if url.startswith(('http://', 'https://')):
                    # For remote URLs, download the image
                    response = requests.get(url, stream=True, timeout=30)
                    if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
                        img = Image.open(BytesIO(response.content)).convert('RGB')
                        images.append(img)
                else:
                    # For local file paths, open the image directly
                    from pathlib import Path
                    local_path = Path(url)
                    if not local_path.exists():
                        logging.warning(f"Local image file not found: {url}")
                        continue
                    img = Image.open(url).convert('RGB')
                    images.append(img)
            except Exception as e:
                logging.error(f"Failed to download/open image from {url}: {e}")
                continue

        return images if images else None

    def __call__(self, filename, binary=None, separate_tables=True):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()
        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
        sections = []
        tbls = []
        for sec in remainder.split("\n"):
            if sec.strip().find("#") == 0:
                sections.append((sec, ""))
            elif sections and sections[-1][0].strip().find("#") == 0:
                sec_, _ = sections.pop(-1)
                sections.append((sec_ + "\n" + sec, ""))
            else:
                sections.append((sec, ""))
        for table in tables:
            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
        return sections, tbls


def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
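
# load_from_xml_v2 is intended to be monkey-patched over
# _SerializedRelationships.load_from_xml (see the .docx branch of chunk() below): by
# skipping relationships whose target is 'NULL' it avoids the
# "There is no item named 'word/NULL' in the archive" error raised for some malformed
# .docx files.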


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, excel/csv, txt, markdown, html, json and doc.
    This method applies a naive way to chunk files:
    successive text is sliced into pieces using 'delimiter', and these pieces are then
    merged into chunks whose token number is no more than 'Max token number'.
    """
    is_english = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    pdf_parser = None
    section_images = None
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        try:
            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
            callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
        except Exception:
            vision_model = None

        # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
        _SerializedRelationships.load_from_xml = load_from_xml_v2
        sections, tables = Docx()(filename, binary)

        if vision_model:
            figures_data = vision_figure_parser_figure_data_wrapper(sections)
            try:
                docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
                boosted_figures = docx_vision_parser(callback=callback)
                tables.extend(boosted_figures)
            except Exception as e:
                callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

        st = timer()
        chunks, images = naive_merge_docx(
            sections, int(parser_config.get("chunk_token_num", 128)),
            parser_config.get("delimiter", "\n!?。;!?"))

        if kwargs.get("section_only", False):
            return chunks

        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
        logging.info("naive_merge({}): {}".format(filename, timer() - st))
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
        callback(0.1, "Start to parse.")

        if layout_recognizer == "DeepDOC":
            pdf_parser = Pdf()
            try:
                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
                callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
            except Exception:
                vision_model = None

            if vision_model:
                sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
                callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...")
                try:
                    pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
                    boosted_figures = pdf_vision_parser(callback=callback)
                    tables.extend(boosted_figures)
                except Exception as e:
                    callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
                    tables.extend(figures)
            else:
                sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)

            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")
        else:
            if layout_recognizer == "Plain Text":
                pdf_parser = PlainParser()
            else:
                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
                pdf_parser = VisionParser(vision_model=vision_model, **kwargs)

            sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
                                          callback=callback)
            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")
    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        if parser_config.get("html4excel"):
            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
        else:
            sections = [(_, "") for _ in excel_parser(binary) if _]
            parser_config["chunk_token_num"] = 12800
    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = TxtParser()(filename, binary,
                               parser_config.get("chunk_token_num", 128),
                               parser_config.get("delimiter", "\n!?;。;!?"))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
        sections, tables = markdown_parser(filename, binary, separate_tables=False)

        # Process images for each section
        section_images = []
        for section_text, _ in sections:
            images = markdown_parser.get_pictures(section_text) if section_text else None
            if images:
                # If multiple images found, combine them using concat_img
                combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
                section_images.append(combined_image)
            else:
                section_images.append(None)

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = HtmlParser()(filename, binary)
        sections = [(_, "") for _ in sections if _]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
        sections = JsonParser(chunk_token_num)(binary)
        sections = [(_, "") for _ in sections if _]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        if doc_parsed.get('content', None) is not None:
            sections = doc_parsed['content'].split('\n')
            sections = [(_, "") for _ in sections if _]
            callback(0.8, "Finish parsing.")
        else:
            callback(0.8, f"tika.parser got empty content from {filename}.")
            logging.warning(f"tika.parser got empty content from {filename}.")
            return []

    else:
        raise NotImplementedError(
            "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

    st = timer()
    if section_images:
        # if all images are None, set section_images to None
        if all(image is None for image in section_images):
            section_images = None

    if section_images:
        chunks, images = naive_merge_with_images(sections, section_images,
                                                 int(parser_config.get("chunk_token_num", 128)),
                                                 parser_config.get("delimiter", "\n!?。;!?"))
        if kwargs.get("section_only", False):
            return chunks

        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
    else:
        chunks = naive_merge(
            sections, int(parser_config.get("chunk_token_num", 128)),
            parser_config.get("delimiter", "\n!?。;!?"))
        if kwargs.get("section_only", False):
            return chunks

        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

    logging.info("naive_merge({}): {}".format(filename, timer() - st))
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
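
# Minimal usage sketch (comments only; "sample.md" and the parameter values below are
# hypothetical, not part of this module):
#
#     def progress(prog=None, msg=""):
#         print(prog, msg)
#
#     chunks = chunk("sample.md", callback=progress, section_only=True,
#                    parser_config={"chunk_token_num": 256, "delimiter": "\n!?。;!?"})
#
# With section_only=True the call returns the merged text chunks directly instead of
# the tokenized documents, which is convenient for inspecting how a file gets split.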