| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 | 
							- #  Licensed under the Apache License, Version 2.0 (the "License");
 - #  you may not use this file except in compliance with the License.
 - #  You may obtain a copy of the License at
 - #
 - #      http://www.apache.org/licenses/LICENSE-2.0
 - #
 - #  Unless required by applicable law or agreed to in writing, software
 - #  distributed under the License is distributed on an "AS IS" BASIS,
 - #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 - #  See the License for the specific language governing permissions and
 - #  limitations under the License.
 - #
 - import logging
 - from io import BytesIO
 - from pptx import Presentation
 - 
 - 
 - class RAGFlowPptParser(object):
 -     def __init__(self):
 -         super().__init__()
 - 
 -     def __extract(self, shape):
 -         if shape.shape_type == 19:
 -             tb = shape.table
 -             rows = []
 -             for i in range(1, len(tb.rows)):
 -                 rows.append("; ".join([tb.cell(
 -                     0, j).text + ": " + tb.cell(i, j).text for j in range(len(tb.columns)) if tb.cell(i, j)]))
 -             return "\n".join(rows)
 - 
 -         if shape.has_text_frame:
 -             return shape.text_frame.text
 - 
 -         if shape.shape_type == 6:
 -             texts = []
 -             for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
 -                 t = self.__extract(p)
 -                 if t:
 -                     texts.append(t)
 -             return "\n".join(texts)
 - 
 -     def __call__(self, fnm, from_page, to_page, callback=None):
 -         ppt = Presentation(fnm) if isinstance(
 -             fnm, str) else Presentation(
 -             BytesIO(fnm))
 -         txts = []
 -         self.total_page = len(ppt.slides)
 -         for i, slide in enumerate(ppt.slides):
 -             if i < from_page:
 -                 continue
 -             if i >= to_page:
 -                 break
 -             texts = []
 -             for shape in sorted(
 -                     slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left)):
 -                 try:
 -                     txt = self.__extract(shape)
 -                     if txt:
 -                         texts.append(txt)
 -                 except Exception as e:
 -                     logging.exception(e)
 -             txts.append("\n".join(texts))
 - 
 -         return txts
 
 
  |