Co-authored-by: jyong <jyong@dify.ai>
tags/0.3.34
@@ -117,3 +117,6 @@ HOSTED_ANTHROPIC_API_BASE=
 HOSTED_ANTHROPIC_API_KEY=
 HOSTED_ANTHROPIC_QUOTA_LIMIT=600000
 HOSTED_ANTHROPIC_PAID_ENABLED=false
+ETL_TYPE=dify
+UNSTRUCTURED_API_URL=
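For reference, a hedged example of how these two settings might look when opting into the Unstructured pipeline; the endpoint below is only a placeholder, not part of this change:

# illustrative values – point UNSTRUCTURED_API_URL at your own Unstructured API endpoint
ETL_TYPE=Unstructured
UNSTRUCTURED_API_URL=http://127.0.0.1:8000/general/v0/general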
@@ -54,7 +54,8 @@ DEFAULTS = {
     'UPLOAD_IMAGE_FILE_SIZE_LIMIT': 10,
     'OUTPUT_MODERATION_BUFFER_SIZE': 300,
     'MULTIMODAL_SEND_IMAGE_FORMAT': 'base64',
-    'INVITE_EXPIRY_HOURS': 72
+    'INVITE_EXPIRY_HOURS': 72,
+    'ETL_TYPE': 'dify',
 }
@@ -276,6 +277,9 @@ class Config:
         self.HOSTED_MODERATION_ENABLED = get_bool_env('HOSTED_MODERATION_ENABLED')
         self.HOSTED_MODERATION_PROVIDERS = get_env('HOSTED_MODERATION_PROVIDERS')
+        self.ETL_TYPE = get_env('ETL_TYPE')
+        self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')
 class CloudEditionConfig(Config):
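For context, get_env in this module is assumed to fall back to the DEFAULTS table when a variable is unset, which is why 'ETL_TYPE': 'dify' is added above; a minimal sketch of that assumption, not the verbatim implementation:

import os

def get_env(key):
    # hedged sketch: unset environment variables fall back to DEFAULTS,
    # so current_app.config['ETL_TYPE'] resolves to 'dify' out of the box
    return os.environ.get(key, DEFAULTS.get(key))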
@@ -69,5 +69,20 @@ class FilePreviewApi(Resource):
         return {'content': text}
+class FileeSupportTypApi(Resource):
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self):
+        etl_type = current_app.config['ETL_TYPE']
+        if etl_type == 'Unstructured':
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
+        else:
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
+        return {'allowed_extensions': allowed_extensions}
 api.add_resource(FileApi, '/files/upload')
 api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
+api.add_resource(FileeSupportTypApi, '/files/support-type')
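A minimal sketch of how a client might call the new support-type endpoint; the base URL and authentication are assumptions, not part of this diff:

import requests

# hypothetical console-API base URL; auth (session cookie or bearer token) depends on the deployment
resp = requests.get('http://localhost:5001/console/api/files/support-type')
print(resp.json())  # e.g. {'allowed_extensions': ['txt', 'markdown', 'md', ...]}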
@@ -3,7 +3,8 @@ from pathlib import Path
 from typing import List, Union, Optional
 import requests
-from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
+from flask import current_app
+from langchain.document_loaders import TextLoader, Docx2txtLoader
 from langchain.schema import Document
 from core.data_loader.loader.csv_loader import CSVLoader
@@ -11,6 +12,13 @@ from core.data_loader.loader.excel import ExcelLoader
 from core.data_loader.loader.html import HTMLLoader
 from core.data_loader.loader.markdown import MarkdownLoader
 from core.data_loader.loader.pdf import PdfLoader
+from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
+from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
+from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
+from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
+from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
+from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
+from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
 from extensions.ext_storage import storage
 from models.model import UploadFile
@@ -49,14 +57,34 @@ class FileExtractor:
         input_file = Path(file_path)
         delimiter = '\n'
         file_extension = input_file.suffix.lower()
-        if is_automatic:
-            loader = UnstructuredFileLoader(
-                file_path, strategy="hi_res", mode="elements"
-            )
-            # loader = UnstructuredAPIFileLoader(
-            #     file_path=filenames[0],
-            #     api_key="FAKE_API_KEY",
-            # )
+        etl_type = current_app.config['ETL_TYPE']
+        unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
+        if etl_type == 'Unstructured':
+            if file_extension == '.xlsx':
+                loader = ExcelLoader(file_path)
+            elif file_extension == '.pdf':
+                loader = PdfLoader(file_path, upload_file=upload_file)
+            elif file_extension in ['.md', '.markdown']:
+                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
+            elif file_extension in ['.htm', '.html']:
+                loader = HTMLLoader(file_path)
+            elif file_extension == '.docx':
+                loader = Docx2txtLoader(file_path)
+            elif file_extension == '.csv':
+                loader = CSVLoader(file_path, autodetect_encoding=True)
+            elif file_extension == '.msg':
+                loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
+            elif file_extension == '.eml':
+                loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
+            elif file_extension == '.ppt':
+                loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
+            elif file_extension == '.pptx':
+                loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
+            elif file_extension == '.xml':
+                loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
+            else:
+                # txt
+                loader = UnstructuredTextLoader(file_path, unstructured_api_url)
         else:
             if file_extension == '.xlsx':
                 loader = ExcelLoader(file_path)
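For orientation, this extractor is invoked elsewhere in the codebase roughly as below (mirroring the IndexingRunner call further down; a Flask application context is required so that current_app.config resolves):

# `upload_file` is an UploadFile row; run inside an active Flask app/request context
text_docs = FileExtractor.load(upload_file, is_automatic=True)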
@@ -0,0 +1,41 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredEmailLoader(BaseLoader):
+    """Load eml files.
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str,
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.email import partition_email
+
+        elements = partition_email(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        # group the partitioned elements into title-delimited chunks of at most 2000 characters
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
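The seven new loaders in this change all share this shape; a hedged usage sketch for the one above (the path and API URL are placeholders):

loader = UnstructuredEmailLoader('/tmp/example.eml', api_url='http://127.0.0.1:8000/general/v0/general')
docs = loader.load()
for doc in docs:
    print(len(doc.page_content))  # each chunk is capped at 2000 characters by chunk_by_title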
@@ -0,0 +1,48 @@
+import logging
+from typing import List
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredMarkdownLoader(BaseLoader):
+    """Load md files.
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str,
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.md import partition_md
+
+        elements = partition_md(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredMsgLoader(BaseLoader):
+    """Load msg files.
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.msg import partition_msg
+
+        elements = partition_msg(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredPPTLoader(BaseLoader):
+    """Load ppt files.
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.ppt import partition_ppt
+
+        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredPPTXLoader(BaseLoader):
+    """Load pptx files.
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.pptx import partition_pptx
+
+        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredTextLoader(BaseLoader):
+    """Load txt files.
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.text import partition_text
+
+        elements = partition_text(filename=self._file_path, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
@@ -0,0 +1,40 @@
+import logging
+import re
+from typing import Optional, List, Tuple, cast
+
+from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.helpers import detect_file_encodings
+from langchain.schema import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredXmlLoader(BaseLoader):
+    """Load xml files.
+
+    Args:
+        file_path: Path to the file to load.
+    """
+
+    def __init__(
+        self,
+        file_path: str,
+        api_url: str
+    ):
+        """Initialize with file path."""
+        self._file_path = file_path
+        self._api_url = api_url
+
+    def load(self) -> List[Document]:
+        from unstructured.partition.xml import partition_xml
+
+        elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
+        from unstructured.chunking.title import chunk_by_title
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
@@ -397,7 +397,7 @@ class IndexingRunner:
             one_or_none()
         if file_detail:
-            text_docs = FileExtractor.load(file_detail, is_automatic=False)
+            text_docs = FileExtractor.load(file_detail, is_automatic=True)
         elif dataset_document.data_source_type == 'notion_import':
             loader = NotionLoader.from_document(dataset_document)
             text_docs = loader.load()
@@ -135,7 +135,7 @@ class DatasetProcessRule(db.Model):
         ],
         'segmentation': {
             'delimiter': '\n',
-            'max_tokens': 512
+            'max_tokens': 1000
         }
     }
@@ -53,4 +53,6 @@ zhipuai==1.0.7
 werkzeug==2.3.7
 pymilvus==2.3.0
 qdrant-client==1.6.4
-cohere~=4.32
+cohere~=4.32
+unstructured~=0.10.27
+unstructured[docx,pptx]~=0.10.27
@@ -27,7 +27,13 @@ class FileService:
     @staticmethod
     def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile:
         extension = file.filename.split('.')[-1]
-        if extension.lower() not in ALLOWED_EXTENSIONS:
+        etl_type = current_app.config['ETL_TYPE']
+        if etl_type == 'Unstructured':
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
+        else:
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
+        if extension.lower() not in allowed_extensions:
             raise UnsupportedFileTypeError()
         elif only_image and extension.lower() not in IMAGE_EXTENSIONS:
             raise UnsupportedFileTypeError()
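A hedged sketch of the resulting upload behaviour; the FileStorage construction and the user object are illustrative, not part of the diff:

from werkzeug.datastructures import FileStorage

# with ETL_TYPE left at its default of 'dify', a .pptx upload is rejected
upload = FileStorage(stream=open('slides.pptx', 'rb'), filename='slides.pptx')
try:
    FileService.upload_file(upload, user=current_user)
except UnsupportedFileTypeError:
    print('set ETL_TYPE=Unstructured to allow pptx/ppt/eml/msg/xml uploads')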