|
|
|
@@ -31,29 +31,48 @@ class RAGFlowPptParser: |
|
|
|
return paragraph.text |
|
|
|
|
|
|
|
def __extract(self, shape):
    """Return the text carried by a single slide shape.

    The shape is probed in this order:

    1. If it has a text frame, every paragraph whose text is not blank is
       rendered through ``__get_bulleted_text`` and the results are joined
       with newlines.
    2. If reading ``shape.shape_type`` raises ``NotImplementedError``, the
       stripped ``shape.text`` is returned when that attribute exists,
       otherwise ``""``.
    3. A table (``shape_type == 19``) yields one line per row after the
       first; each line is ``"<row-0 cell>: <cell>"`` pairs joined by
       ``"; "``.
    4. A group (``shape_type == 6``) is walked recursively, children
       ordered by ``(top // 10, left)``, and the non-empty child texts
       are joined with newlines.

    Any other shape yields ``""``.  Any exception raised while processing
    is logged and also results in ``""``, so one bad shape does not abort
    the caller.

    :param shape: shape object exposing the attributes read above.
    :return: extracted text, possibly empty.
    """
    try:
        # First try to get text content
        if hasattr(shape, 'has_text_frame') and shape.has_text_frame:
            texts = [
                self.__get_bulleted_text(paragraph)
                for paragraph in shape.text_frame.paragraphs
                if paragraph.text.strip()
            ]
            return "\n".join(texts)

        # Safely get shape_type
        try:
            shape_type = shape.shape_type
        except NotImplementedError:
            # If shape_type is not available, try to get text content
            if hasattr(shape, 'text'):
                return shape.text.strip()
            return ""

        # Handle table
        if shape_type == 19:
            tb = shape.table
            n_cols = len(tb.columns)
            rows = []
            # Row 0 supplies the labels, so data rows start at index 1.
            for i in range(1, len(tb.rows)):
                rows.append("; ".join(
                    tb.cell(0, j).text + ": " + tb.cell(i, j).text
                    for j in range(n_cols)
                    if tb.cell(i, j)
                ))
            return "\n".join(rows)

        # Handle group shape
        if shape_type == 6:
            texts = []
            # Bucketing `top` by 10 treats children at nearly the same
            # height as one visual row, which is then read left to right.
            for p in sorted(shape.shapes, key=lambda x: (x.top // 10, x.left)):
                # Recurse into this same method; calling a differently
                # named method here would raise AttributeError, which the
                # outer handler would swallow, dropping all group text.
                t = self.__extract(p)
                if t:
                    texts.append(t)
            return "\n".join(texts)

        return ""

    except Exception as e:
        logging.error(f"Error processing shape: {str(e)}")
        return ""
|
|
|
|
|
|
|
def __call__(self, fnm, from_page, to_page, callback=None): |
|
|
|
ppt = Presentation(fnm) if isinstance( |