Signed-off-by: yihong0618 <zouzou0208@gmail.com>
tags/0.14.0
| if dify_config.ETL_TYPE == "Unstructured": | if dify_config.ETL_TYPE == "Unstructured": | ||||
| DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"] | |||||
| DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls"] | |||||
| DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub")) | DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub")) | ||||
| if dify_config.UNSTRUCTURED_API_URL: | if dify_config.UNSTRUCTURED_API_URL: | ||||
| DOCUMENT_EXTENSIONS.append("ppt") | DOCUMENT_EXTENSIONS.append("ppt") | ||||
| DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) | DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) | ||||
| else: | else: | ||||
| DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"] | |||||
| DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"] | |||||
| DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) | DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS]) |
| extractor = ExcelExtractor(file_path) | extractor = ExcelExtractor(file_path) | ||||
| elif file_extension == ".pdf": | elif file_extension == ".pdf": | ||||
| extractor = PdfExtractor(file_path) | extractor = PdfExtractor(file_path) | ||||
| elif file_extension in {".md", ".markdown"}: | |||||
| elif file_extension in {".md", ".markdown", ".mdx"}: | |||||
| extractor = ( | extractor = ( | ||||
| UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key) | UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key) | ||||
| if is_automatic | if is_automatic | ||||
| extractor = ExcelExtractor(file_path) | extractor = ExcelExtractor(file_path) | ||||
| elif file_extension == ".pdf": | elif file_extension == ".pdf": | ||||
| extractor = PdfExtractor(file_path) | extractor = PdfExtractor(file_path) | ||||
| elif file_extension in {".md", ".markdown"}: | |||||
| elif file_extension in {".md", ".markdown", ".mdx"}: | |||||
| extractor = MarkdownExtractor(file_path, autodetect_encoding=True) | extractor = MarkdownExtractor(file_path, autodetect_encoding=True) | ||||
| elif file_extension in {".htm", ".html"}: | elif file_extension in {".htm", ".html"}: | ||||
| extractor = HtmlExtractor(file_path) | extractor = HtmlExtractor(file_path) |
| return <Json className={className} /> | return <Json className={className} /> | ||||
| case 'md': | case 'md': | ||||
| case 'markdown': | case 'markdown': | ||||
| case 'mdx': | |||||
| return <Md className={className} /> | return <Md className={className} /> | ||||
| case 'pdf': | case 'pdf': | ||||
| return <Pdf className={className} /> | return <Pdf className={className} /> |
| if (extension === 'pdf') | if (extension === 'pdf') | ||||
| return FileAppearanceTypeEnum.pdf | return FileAppearanceTypeEnum.pdf | ||||
| if (extension === 'md' || extension === 'markdown') | |||||
| if (extension === 'md' || extension === 'markdown' || extension === 'mdx') | |||||
| return FileAppearanceTypeEnum.markdown | return FileAppearanceTypeEnum.markdown | ||||
| if (extension === 'xlsx' || extension === 'xls') | if (extension === 'xlsx' || extension === 'xls') |
| export const FILE_EXTS: Record<string, string[]> = { | export const FILE_EXTS: Record<string, string[]> = { | ||||
| [SupportUploadFileTypes.image]: ['JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG'], | [SupportUploadFileTypes.image]: ['JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG'], | ||||
| [SupportUploadFileTypes.document]: ['TXT', 'MD', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB'], | |||||
| [SupportUploadFileTypes.document]: ['TXT', 'MD', 'MDX', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB'], | |||||
| [SupportUploadFileTypes.audio]: ['MP3', 'M4A', 'WAV', 'WEBM', 'AMR', 'MPGA'], | [SupportUploadFileTypes.audio]: ['MP3', 'M4A', 'WAV', 'WEBM', 'AMR', 'MPGA'], | ||||
| [SupportUploadFileTypes.video]: ['MP4', 'MOV', 'MPEG', 'MPGA'], | [SupportUploadFileTypes.video]: ['MP4', 'MOV', 'MPEG', 'MPGA'], | ||||
| } | } |