Преглед на файлове

feat(document_extractor): integrate unstructured API for PPTX extraction (#10180)

tags/0.11.0
-LAN- преди 1 година
родител
ревизия
53a7cb0e9d
No account linked to committer's email address
променени са 1 файла, в които са добавени 10 реда и са изтрити 1 реда
  1. 10
    1
      api/core/workflow/nodes/document_extractor/node.py

+ 10
- 1
api/core/workflow/nodes/document_extractor/node.py Целия файл

import pandas as pd import pandas as pd
import pypdfium2 import pypdfium2
import yaml import yaml
from unstructured.partition.api import partition_via_api
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
from unstructured.partition.epub import partition_epub from unstructured.partition.epub import partition_epub
from unstructured.partition.msg import partition_msg from unstructured.partition.msg import partition_msg
from unstructured.partition.ppt import partition_ppt from unstructured.partition.ppt import partition_ppt
from unstructured.partition.pptx import partition_pptx from unstructured.partition.pptx import partition_pptx


from configs import dify_config
from core.file import File, FileTransferMethod, file_manager from core.file import File, FileTransferMethod, file_manager
from core.helper import ssrf_proxy from core.helper import ssrf_proxy
from core.variables import ArrayFileSegment from core.variables import ArrayFileSegment
def _extract_text_from_pptx(file_content: bytes) -> str:
    """Extract the text of a PPTX presentation, one element per line.

    When both ``dify_config.UNSTRUCTURED_API_URL`` and
    ``dify_config.UNSTRUCTURED_API_KEY`` are set, the bytes are sent to the
    Unstructured API via ``partition_via_api``; otherwise they are parsed
    locally with ``partition_pptx``.

    Args:
        file_content: Raw bytes of the PPTX file.

    Returns:
        The ``text`` attribute of every extracted element, joined with
        newlines. Elements without a ``text`` attribute contribute an empty
        string.

    Raises:
        TextExtractionError: If partitioning fails for any reason.
    """
    try:
        with io.BytesIO(file_content) as file:
            if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
                elements = partition_via_api(
                    file=file,
                    # An in-memory stream carries no name. partition_via_api
                    # needs metadata_filename whenever ``file`` is passed, and
                    # the ".pptx" suffix is what identifies the document type
                    # to the API. The name itself is only a placeholder.
                    metadata_filename="document.pptx",
                    api_url=dify_config.UNSTRUCTURED_API_URL,
                    api_key=dify_config.UNSTRUCTURED_API_KEY,
                )
            else:
                elements = partition_pptx(file=file)
        return "\n".join(getattr(element, "text", "") for element in elements)
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e

Loading…
Отказ
Запис