소스 검색

feat: add VTT data transform to Document extractor (#18936)

tags/1.3.1
crazywoola 6 달 전
부모
커밋
2c2af1d117
No account linked to committer's email address
4개의 변경된 파일2208개의 추가작업 그리고 2152개의 파일을 삭제
  1. 2
    2
      api/constants/__init__.py
  2. 45
    1
      api/core/workflow/nodes/document_extractor/node.py
  3. 1
    0
      api/pyproject.toml
  4. 2160
    2149
      api/uv.lock

+ 2
- 2
api/constants/__init__.py 파일 보기

@@ -16,11 +16,11 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])


if dify_config.ETL_TYPE == "Unstructured":
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls"]
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "vtt"]
DOCUMENT_EXTENSIONS.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
if dify_config.UNSTRUCTURED_API_URL:
DOCUMENT_EXTENSIONS.append("ppt")
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
else:
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv", "vtt"]
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])

+ 45
- 1
api/core/workflow/nodes/document_extractor/node.py 파일 보기

@@ -11,6 +11,7 @@ import docx
import pandas as pd
import pypandoc # type: ignore
import pypdfium2 # type: ignore
import webvtt # type: ignore
import yaml # type: ignore
from docx.document import Document
from docx.oxml.table import CT_Tbl
@@ -132,6 +133,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
return _extract_text_from_json(file_content)
case "application/x-yaml" | "text/yaml":
return _extract_text_from_yaml(file_content)
case "text/vtt":
return _extract_text_from_vtt(file_content)
case _:
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")

@@ -139,7 +142,7 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) -> str:
"""Extract text from a file based on its file extension."""
match file_extension:
case ".txt" | ".markdown" | ".md" | ".html" | ".htm" | ".xml" | ".vtt":
case ".txt" | ".markdown" | ".md" | ".html" | ".htm" | ".xml":
return _extract_text_from_plain_text(file_content)
case ".json":
return _extract_text_from_json(file_content)
@@ -165,6 +168,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
return _extract_text_from_eml(file_content)
case ".msg":
return _extract_text_from_msg(file_content)
case ".vtt":
return _extract_text_from_vtt(file_content)
case _:
raise UnsupportedFileTypeError(f"Unsupported Extension Type: {file_extension}")

@@ -462,3 +467,42 @@ def _extract_text_from_msg(file_content: bytes) -> str:
return "\n".join([str(element) for element in elements])
except Exception as e:
raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e


def _extract_text_from_vtt(vtt_bytes: bytes) -> str:
text = _extract_text_from_plain_text(vtt_bytes)

# remove bom
text = text.lstrip("\ufeff")

raw_results = []
for caption in webvtt.from_string(text):
raw_results.append((caption.voice, caption.text))

# Merge consecutive utterances by the same speaker
merged_results = []
if raw_results:
current_speaker, current_text = raw_results[0]

for i in range(1, len(raw_results)):
spk, txt = raw_results[i]
if spk == None:
merged_results.append((None, current_text))
continue

if spk == current_speaker:
# If it is the same speaker, merge the utterances (joined by space)
current_text += " " + txt
else:
# If the speaker changes, register the utterance so far and move on
merged_results.append((current_speaker, current_text))
current_speaker, current_text = spk, txt

# Add the last element
merged_results.append((current_speaker, current_text))
else:
merged_results = raw_results

# Return the result in the specified format: Speaker "text" style
formatted = [f'{spk or ""} "{txt}"' for spk, txt in merged_results]
return "\n".join(formatted)

+ 1
- 0
api/pyproject.toml 파일 보기

@@ -84,6 +84,7 @@ dependencies = [
"validators==0.21.0",
"weave~=0.51.34",
"yarl~=1.18.3",
"webvtt-py~=0.5.1",
]
# Before adding new dependency, consider place it in
# alphabet order (a-z) and suitable group.

+ 2160
- 2149
api/uv.lock
파일 크기가 너무 크기때문에 변경 상태를 표시하지 않습니다.
파일 보기


Loading…
취소
저장