### What problem does this PR solve?

Add a VLM-boosted PDF parser that is used when a vision language model (VLM) is configured.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Released in: tags/v0.18.0
| # | |||||
| # Copyright 2025 The InfiniFlow Authors. All Rights Reserved. | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # | |||||
| from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk | |||||
| from rag.prompts import vision_llm_figure_describe_prompt | |||||
| class VisionFigureParser: | |||||
| def __init__(self, vision_model, figures_data, *args, **kwargs): | |||||
| self.vision_model = vision_model | |||||
| self._extract_figures_info(figures_data) | |||||
| assert len(self.figures) == len(self.descriptions) | |||||
| assert not self.positions or (len(self.figures) == len(self.positions)) | |||||
| def _extract_figures_info(self, figures_data): | |||||
| self.figures = [] | |||||
| self.descriptions = [] | |||||
| self.positions = [] | |||||
| for item in figures_data: | |||||
| # position | |||||
| if len(item) == 2 and isinstance(item[1], list) and len(item[1]) == 1 and len(item[1][0]) == 5: | |||||
| img_desc = item[0] | |||||
| assert len(img_desc) == 2, "Should be (figure, [description])" | |||||
| self.figures.append(img_desc[0]) | |||||
| self.descriptions.append(img_desc[1]) | |||||
| self.positions.append(item[1]) | |||||
| else: | |||||
| assert len(item) == 2 and isinstance(item, tuple), f"get {len(item)=}, {item=}" | |||||
| self.figures.append(item[0]) | |||||
| self.descriptions.append(item[1]) | |||||
| def _assemble(self): | |||||
| self.assembled = [] | |||||
| self.has_positions = len(self.positions) != 0 | |||||
| for i in range(len(self.figures)): | |||||
| figure = self.figures[i] | |||||
| desc = self.descriptions[i] | |||||
| pos = self.positions[i] if self.has_positions else None | |||||
| figure_desc = (figure, desc) | |||||
| if pos is not None: | |||||
| self.assembled.append((figure_desc, pos)) | |||||
| else: | |||||
| self.assembled.append((figure_desc,)) | |||||
| return self.assembled | |||||
| def __call__(self, **kwargs): | |||||
| callback = kwargs.get("callback", lambda prog, msg: None) | |||||
| for idx, img_binary in enumerate(self.figures or []): | |||||
| figure_num = idx # 0-based | |||||
| txt = picture_vision_llm_chunk( | |||||
| binary=img_binary, | |||||
| vision_model=self.vision_model, | |||||
| prompt=vision_llm_figure_describe_prompt(), | |||||
| callback=callback, | |||||
| ) | |||||
| if txt: | |||||
| self.descriptions[figure_num] = txt + "\n".join(self.descriptions[figure_num]) | |||||
| self._assemble() | |||||
| return self.assembled |
| b_["top"] = b["top"] | b_["top"] = b["top"] | ||||
| self.boxes.pop(i) | self.boxes.pop(i) | ||||
| def _extract_table_figure(self, need_image, ZM, | |||||
| return_html, need_position): | |||||
| def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False): | |||||
| tables = {} | tables = {} | ||||
| figures = {} | figures = {} | ||||
| # extract figure and table boxes | # extract figure and table boxes | ||||
| tk) | tk) | ||||
| self.boxes.pop(i) | self.boxes.pop(i) | ||||
| res = [] | |||||
| positions = [] | |||||
| def cropout(bxs, ltype, poss): | def cropout(bxs, ltype, poss): | ||||
| nonlocal ZM | nonlocal ZM | ||||
| pn = set([b["page_number"] - 1 for b in bxs]) | pn = set([b["page_number"] - 1 for b in bxs]) | ||||
| height += img.size[1] | height += img.size[1] | ||||
| return pic | return pic | ||||
| res = [] | |||||
| positions = [] | |||||
| figure_results = [] | |||||
| figure_positions = [] | |||||
| # crop figure out and add caption | # crop figure out and add caption | ||||
| for k, bxs in figures.items(): | for k, bxs in figures.items(): | ||||
| txt = "\n".join([b["text"] for b in bxs]) | txt = "\n".join([b["text"] for b in bxs]) | ||||
| continue | continue | ||||
| poss = [] | poss = [] | ||||
| res.append( | |||||
| (cropout( | |||||
| bxs, | |||||
| "figure", poss), | |||||
| [txt])) | |||||
| positions.append(poss) | |||||
| if separate_tables_figures: | |||||
| figure_results.append( | |||||
| (cropout( | |||||
| bxs, | |||||
| "figure", poss), | |||||
| [txt])) | |||||
| figure_positions.append(poss) | |||||
| else: | |||||
| res.append( | |||||
| (cropout( | |||||
| bxs, | |||||
| "figure", poss), | |||||
| [txt])) | |||||
| positions.append(poss) | |||||
| for k, bxs in tables.items(): | for k, bxs in tables.items(): | ||||
| if not bxs: | if not bxs: | ||||
| continue | continue | ||||
| bxs = Recognizer.sort_Y_firstly(bxs, np.mean( | bxs = Recognizer.sort_Y_firstly(bxs, np.mean( | ||||
| [(b["bottom"] - b["top"]) / 2 for b in bxs])) | [(b["bottom"] - b["top"]) / 2 for b in bxs])) | ||||
| poss = [] | poss = [] | ||||
| res.append((cropout(bxs, "table", poss), | res.append((cropout(bxs, "table", poss), | ||||
| self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english))) | self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english))) | ||||
| positions.append(poss) | positions.append(poss) | ||||
| assert len(positions) == len(res) | |||||
| if need_position: | |||||
| return list(zip(res, positions)) | |||||
| return res | |||||
| if separate_tables_figures: | |||||
| assert len(positions) + len(figure_positions) == len(res) + len(figure_results) | |||||
| if need_position: | |||||
| return list(zip(res, positions)), list(zip(figure_results, figure_positions)) | |||||
| else: | |||||
| return res, figure_results | |||||
| else: | |||||
| assert len(positions) == len(res) | |||||
| if need_position: | |||||
| return list(zip(res, positions)) | |||||
| else: | |||||
| return res | |||||
| def proj_match(self, line): | def proj_match(self, line): | ||||
| if len(line) <= 2: | if len(line) <= 2: |
| from api.db.services.llm_service import LLMBundle | from api.db.services.llm_service import LLMBundle | ||||
| from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser | from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser | ||||
| from deepdoc.parser.pdf_parser import PlainParser, VisionParser | from deepdoc.parser.pdf_parser import PlainParser, VisionParser | ||||
| from deepdoc.parser.figure_parser import VisionFigureParser | |||||
| from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table | from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table | ||||
| from rag.utils import num_tokens_from_string | from rag.utils import num_tokens_from_string | ||||
| super().__init__() | super().__init__() | ||||
| def __call__(self, filename, binary=None, from_page=0, | def __call__(self, filename, binary=None, from_page=0, | ||||
| to_page=100000, zoomin=3, callback=None): | |||||
| to_page=100000, zoomin=3, callback=None, separate_tables_figures=False): | |||||
| start = timer() | start = timer() | ||||
| first_start = start | first_start = start | ||||
| callback(msg="OCR started") | callback(msg="OCR started") | ||||
| start = timer() | start = timer() | ||||
| self._text_merge() | self._text_merge() | ||||
| callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) | callback(0.67, "Text merged ({:.2f}s)".format(timer() - start)) | ||||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||||
| # self._naive_vertical_merge() | |||||
| self._concat_downward() | |||||
| # self._filter_forpages() | |||||
| logging.info("layouts cost: {}s".format(timer() - first_start)) | |||||
| return [(b["text"], self._line_tag(b, zoomin)) | |||||
| for b in self.boxes], tbls | |||||
| if separate_tables_figures: | |||||
| tbls, figures = self._extract_table_figure(True, zoomin, True, True, True) | |||||
| self._concat_downward() | |||||
| logging.info("layouts cost: {}s".format(timer() - first_start)) | |||||
| return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures | |||||
| else: | |||||
| tbls = self._extract_table_figure(True, zoomin, True, True) | |||||
| # self._naive_vertical_merge() | |||||
| self._concat_downward() | |||||
| # self._filter_forpages() | |||||
| logging.info("layouts cost: {}s".format(timer() - first_start)) | |||||
| return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls | |||||
| class Markdown(MarkdownParser): | class Markdown(MarkdownParser): | ||||
| if layout_recognizer == "DeepDOC": | if layout_recognizer == "DeepDOC": | ||||
| pdf_parser = Pdf() | pdf_parser = Pdf() | ||||
| elif layout_recognizer == "Plain Text": | |||||
| pdf_parser = PlainParser() | |||||
| try: | |||||
| vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) | |||||
| except Exception: | |||||
| vision_model = None | |||||
| if vision_model: | |||||
| sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True) | |||||
| pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs) | |||||
| boosted_figures = pdf_vision_parser(callback=callback) | |||||
| tables.extend(boosted_figures) | |||||
| else: | |||||
| sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) | |||||
| res = tokenize_table(tables, doc, is_english) | |||||
| else: | else: | ||||
| vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang) | |||||
| pdf_parser = VisionParser(vision_model=vision_model, **kwargs) | |||||
| if layout_recognizer == "Plain Text": | |||||
| pdf_parser = PlainParser() | |||||
| else: | |||||
| vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang) | |||||
| pdf_parser = VisionParser(vision_model=vision_model, **kwargs) | |||||
| sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, | |||||
| callback=callback) | |||||
| res = tokenize_table(tables, doc, is_english) | |||||
| sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, | |||||
| callback=callback) | |||||
| res = tokenize_table(tables, doc, is_english) | |||||
| elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): | elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): | ||||
| callback(0.1, "Start to parse.") | callback(0.1, "Start to parse.") |
| except Exception as e: | except Exception as e: | ||||
| callback(-1, str(e)) | callback(-1, str(e)) | ||||
| return [] | |||||
| return "" |
| - If you do not detect valid content in the image, return an empty string. | - If you do not detect valid content in the image, return an empty string. | ||||
| """ | """ | ||||
| return prompt_en | return prompt_en | ||||
def vision_llm_figure_describe_prompt() -> str:
    """Return the instruction prompt for describing a figure with a vision LLM.

    The prompt asks the model to classify the visual (chart/table/diagram),
    extract labels and data points, and summarize trends, while reporting
    only elements actually present in the image.
    """
    figure_prompt = """
You are an expert visual data analyst. Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.
Tasks:
1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
4. Analyze and explain any trends, comparisons, or patterns shown in the data.
5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.
Output format (include only sections relevant to the image content):
- Visual Type: [Type]
- Title: [Title text, if available]
- Axes / Legends / Labels: [Details, if available]
- Data Points: [Extracted data]
- Trends / Insights: [Analysis and interpretation]
- Captions / Annotations: [Text and relevance, if available]
Ensure high accuracy, clarity, and completeness in your analysis, and includes only the information present in the image. Avoid unnecessary statements about missing elements.
"""
    return figure_prompt