| @@ -6,12 +6,14 @@ import docx | |||
| import pandas as pd | |||
| import pypdfium2 | |||
| import yaml | |||
| from unstructured.partition.api import partition_via_api | |||
| from unstructured.partition.email import partition_email | |||
| from unstructured.partition.epub import partition_epub | |||
| from unstructured.partition.msg import partition_msg | |||
| from unstructured.partition.ppt import partition_ppt | |||
| from unstructured.partition.pptx import partition_pptx | |||
| from configs import dify_config | |||
| from core.file import File, FileTransferMethod, file_manager | |||
| from core.helper import ssrf_proxy | |||
| from core.variables import ArrayFileSegment | |||
| @@ -263,7 +265,14 @@ def _extract_text_from_ppt(file_content: bytes) -> str: | |||
def _extract_text_from_pptx(file_content: bytes) -> str:
    """Extract the text of a PPTX document as newline-joined element texts.

    When both ``dify_config.UNSTRUCTURED_API_URL`` and
    ``dify_config.UNSTRUCTURED_API_KEY`` are set, the document is sent to that
    Unstructured API endpoint; otherwise it is parsed in-process with
    ``partition_pptx``.

    Args:
        file_content: Raw bytes of the PPTX file.

    Returns:
        The ``text`` attribute of every parsed element joined with ``"\n"``;
        elements without a ``text`` attribute contribute an empty string.

    Raises:
        TextExtractionError: If partitioning fails for any reason (the
            original exception is chained as the cause).
    """
    try:
        with io.BytesIO(file_content) as file:
            if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
                # partition_via_api rejects a bare file object: it needs a file
                # name to label the upload, so supply one carrying the .pptx
                # extension. The name itself is only a placeholder.
                elements = partition_via_api(
                    file=file,
                    metadata_filename="document.pptx",
                    api_url=dify_config.UNSTRUCTURED_API_URL,
                    api_key=dify_config.UNSTRUCTURED_API_KEY,
                )
            else:
                elements = partition_pptx(file=file)
        return "\n".join(getattr(element, "text", "") for element in elements)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e