| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 | 
							- #
 - #  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
 - #
 - #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - 
 - import logging
 - from io import BytesIO
 - from pptx import Presentation
 - 
 - 
 - class RAGFlowPptParser:
 -     def __init__(self):
 -         super().__init__()
 - 
 -     def __get_bulleted_text(self, paragraph):
 -         is_bulleted = bool(paragraph._p.xpath("./a:pPr/a:buChar")) or bool(paragraph._p.xpath("./a:pPr/a:buAutoNum")) or bool(paragraph._p.xpath("./a:pPr/a:buBlip"))
 -         if is_bulleted:
 -             return f"{'  '* paragraph.level}.{paragraph.text}"
 -         else:
 -             return paragraph.text
 - 
 -     def __extract(self, shape):
 -         try:
 -             # First try to get text content
 -             if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
 -                 text_frame = shape.text_frame
 -                 texts = []
 -                 for paragraph in text_frame.paragraphs:
 -                     if paragraph.text.strip():
 -                         texts.append(self.__get_bulleted_text(paragraph))
 -                 return "\n".join(texts)
 - 
 -             # Safely get shape_type
 -             try:
 -                 shape_type = shape.shape_type
 -             except NotImplementedError:
 -                 # If shape_type is not available, try to get text content
 -                 if hasattr(shape, 'text'):
 -                     return shape.text.strip()
 -                 return ""
 - 
 -             # Handle table
 -             if shape_type == 19:
 -                 tb = shape.table
 -                 rows = []
 -                 for i in range(1, len(tb.rows)):
 -                     rows.append("; ".join([tb.cell(
 -                         0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
 -                 return "\n".join(rows)
 - 
 -             # Handle group shape
 -             if shape_type == 6:
 -                 texts = []
 -                 for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
 -                     t = self.__extract_texts(p)
 -                     if t:
 -                         texts.append(t)
 -                 return "\n".join(texts)
 - 
 -             return ""
 - 
 -         except Exception as e:
 -             logging.error(f"Error processing shape: {str(e)}")
 -             return ""
 - 
 -     def __call__(self, fnm, from_page, to_page, callback=None):
 -         ppt = Presentation(fnm) if isinstance(
 -             fnm, str) else Presentation(
 -             BytesIO(fnm))
 -         txts = []
 -         self.total_page = len(ppt.slides)
 -         for i, slide in enumerate(ppt.slides):
 -             if i < from_page:
 -                 continue
 -             if i >= to_page:
 -                 break
 -             texts = []
 -             for shape in sorted(
 -                     slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
 -                 try:
 -                     txt = self.__extract(shape)
 -                     if txt:
 -                         texts.append(txt)
 -                 except Exception as e:
 -                     logging.exception(e)
 -             txts.append("\n".join(texts))
 - 
 -         return txts
 
 
  |