Преглед изворни кода

fix document extractor node incorrectly processing doc and ppt files (#12902)

tags/1.0.0
AugNSo пре 8 месеци
родитељ
комит
2b86465d4c
No account linked to committer's email address

+ 1
- 1
api/constants/__init__.py Прегледај датотеку



if dify_config.ETL_TYPE == "Unstructured": if dify_config.ETL_TYPE == "Unstructured":
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls"] DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls"]
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
DOCUMENT_EXTENSIONS.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
if dify_config.UNSTRUCTURED_API_URL: if dify_config.UNSTRUCTURED_API_URL:
DOCUMENT_EXTENSIONS.append("ppt") DOCUMENT_EXTENSIONS.append("ppt")
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])

+ 53
- 8
api/core/workflow/nodes/document_extractor/node.py Прегледај датотеку

return _extract_text_from_plain_text(file_content) return _extract_text_from_plain_text(file_content)
case "application/pdf": case "application/pdf":
return _extract_text_from_pdf(file_content) return _extract_text_from_pdf(file_content)
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword":
case "application/msword":
return _extract_text_from_doc(file_content) return _extract_text_from_doc(file_content)
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
return _extract_text_from_docx(file_content)
case "text/csv": case "text/csv":
return _extract_text_from_csv(file_content) return _extract_text_from_csv(file_content)
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel": case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
return _extract_text_from_yaml(file_content) return _extract_text_from_yaml(file_content)
case ".pdf": case ".pdf":
return _extract_text_from_pdf(file_content) return _extract_text_from_pdf(file_content)
case ".doc" | ".docx":
case ".doc":
return _extract_text_from_doc(file_content) return _extract_text_from_doc(file_content)
case ".docx":
return _extract_text_from_docx(file_content)
case ".csv": case ".csv":
return _extract_text_from_csv(file_content) return _extract_text_from_csv(file_content)
case ".xls" | ".xlsx": case ".xls" | ".xlsx":


def _extract_text_from_doc(file_content: bytes) -> str:
    """
    Extract text from a DOC file.

    The bytes are written to a temporary ``.doc`` file which is then sent to
    the Unstructured API via ``partition_via_api``. The ``text`` attribute of
    every returned element (empty string when absent) is joined with newlines.

    :param file_content: raw bytes of the DOC file
    :return: the extracted text
    :raises TextExtractionError: if UNSTRUCTURED_API_URL / UNSTRUCTURED_API_KEY
        are not both set, or if anything fails during extraction
    """
    from unstructured.partition.api import partition_via_api

    if not (dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY):
        raise TextExtractionError("UNSTRUCTURED_API_URL and UNSTRUCTURED_API_KEY must be set")

    try:
        # delete=False because the file is reopened by name below; removal is
        # done explicitly in the ``finally`` clause.
        temp_file = tempfile.NamedTemporaryFile(suffix=".doc", delete=False)
        try:
            # Close (and thereby flush) the handle before reopening the file by name.
            with temp_file:
                temp_file.write(file_content)
            with open(temp_file.name, "rb") as file:
                elements = partition_via_api(
                    file=file,
                    metadata_filename=temp_file.name,
                    api_url=dify_config.UNSTRUCTURED_API_URL,
                    api_key=dify_config.UNSTRUCTURED_API_KEY,
                )
        finally:
            # Always remove the temporary file, including when the API call fails.
            os.unlink(temp_file.name)
        return "\n".join([getattr(element, "text", "") for element in elements])
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e


def _extract_text_from_docx(file_content: bytes) -> str:
"""
Extract text from a DOCX file.
For now support only paragraph and table add more if needed For now support only paragraph and table add more if needed
""" """
try: try:


text.append(markdown_table) text.append(markdown_table)
except Exception as e: except Exception as e:
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
logger.warning(f"Failed to extract table from DOC: {e}")
continue continue


return "\n".join(text) return "\n".join(text)


except Exception as e: except Exception as e:
raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
raise TextExtractionError(f"Failed to extract text from DOCX: {str(e)}") from e




def _download_file_content(file: File) -> bytes: def _download_file_content(file: File) -> bytes:




def _extract_text_from_ppt(file_content: bytes) -> str:
    """
    Extract text from a PPT file.

    When both UNSTRUCTURED_API_URL and UNSTRUCTURED_API_KEY are configured,
    the bytes are written to a temporary ``.ppt`` file and sent to the
    Unstructured API via ``partition_via_api``; otherwise the content is
    partitioned in-process with ``partition_ppt``. The ``text`` attribute of
    every returned element (empty string when absent) is joined with newlines.

    :param file_content: raw bytes of the PPT file
    :return: the extracted text
    :raises TextExtractionError: if anything fails during extraction
    """
    from unstructured.partition.api import partition_via_api
    from unstructured.partition.ppt import partition_ppt

    try:
        if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
            # delete=False because the file is reopened by name below; removal
            # is done explicitly in the ``finally`` clause.
            temp_file = tempfile.NamedTemporaryFile(suffix=".ppt", delete=False)
            try:
                # Close (and thereby flush) the handle before reopening by name.
                with temp_file:
                    temp_file.write(file_content)
                with open(temp_file.name, "rb") as file:
                    elements = partition_via_api(
                        file=file,
                        metadata_filename=temp_file.name,
                        api_url=dify_config.UNSTRUCTURED_API_URL,
                        api_key=dify_config.UNSTRUCTURED_API_KEY,
                    )
            finally:
                # Always remove the temporary file, including when the API call fails.
                os.unlink(temp_file.name)
        else:
            with io.BytesIO(file_content) as file:
                elements = partition_ppt(file=file)
        return "\n".join([getattr(element, "text", "") for element in elements])

    except Exception as e:
        # This function handles PPT (not PPTX); name the right format in the error.
        raise TextExtractionError(f"Failed to extract text from PPT: {str(e)}") from e




def _extract_text_from_pptx(file_content: bytes) -> str: def _extract_text_from_pptx(file_content: bytes) -> str:

+ 4
- 4
api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py Прегледај датотеку

from core.workflow.entities.node_entities import NodeRunResult from core.workflow.entities.node_entities import NodeRunResult
from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData
from core.workflow.nodes.document_extractor.node import ( from core.workflow.nodes.document_extractor.node import (
_extract_text_from_doc,
_extract_text_from_docx,
_extract_text_from_pdf, _extract_text_from_pdf,
_extract_text_from_plain_text, _extract_text_from_plain_text,
) )
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract) monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract)
elif mime_type.startswith("application/vnd.openxmlformats"): elif mime_type.startswith("application/vnd.openxmlformats"):
mock_docx_extract = Mock(return_value=expected_text[0]) mock_docx_extract = Mock(return_value=expected_text[0])
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_doc", mock_docx_extract)
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_docx", mock_docx_extract)


result = document_extractor_node._run() result = document_extractor_node._run()






@patch("docx.Document")
def test_extract_text_from_docx(mock_document):
    """Paragraph texts of the patched ``docx.Document`` are joined with newlines."""
    # Mock keyword arguments set attributes, so each paragraph exposes ``.text``.
    paragraphs = [Mock(text=value) for value in ("Paragraph 1", "Paragraph 2")]
    mock_document.return_value.paragraphs = paragraphs

    extracted = _extract_text_from_docx(b"PK\x03\x04")

    assert extracted == "Paragraph 1\nParagraph 2"





+ 1
- 1
web/app/components/base/prompt-editor/constants.tsx Прегледај датотеку



// Upper-case file extensions keyed by upload file type.
export const FILE_EXTS: Record<string, string[]> = {
  [SupportUploadFileTypes.image]: [
    'JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG',
  ],
  [SupportUploadFileTypes.document]: [
    'TXT', 'MD', 'MDX', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS',
    'DOC', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB',
  ],
  [SupportUploadFileTypes.audio]: [
    'MP3', 'M4A', 'WAV', 'WEBM', 'AMR', 'MPGA',
  ],
  [SupportUploadFileTypes.video]: [
    'MP4', 'MOV', 'MPEG', 'MPGA',
  ],
}

Loading…
Откажи
Сачувај