Co-authored-by: jyong <jyong@dify.ai>
 HOSTED_ANTHROPIC_API_KEY=
 HOSTED_ANTHROPIC_QUOTA_LIMIT=600000
 HOSTED_ANTHROPIC_PAID_ENABLED=false
+ETL_TYPE=dify
+UNSTRUCTURED_API_URL=
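For context, a minimal sketch of how these two new settings steer extraction (plain os.environ lookups here; the real code goes through Dify's get_env helper and the Config class below):

import os

# ETL_TYPE selects the extraction backend: 'dify' keeps the built-in loaders,
# 'Unstructured' routes parsing through the unstructured library / API.
etl_type = os.environ.get('ETL_TYPE', 'dify')
# UNSTRUCTURED_API_URL points at an Unstructured API endpoint when one is used.
unstructured_api_url = os.environ.get('UNSTRUCTURED_API_URL', '')

use_unstructured = (etl_type == 'Unstructured')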
     'UPLOAD_IMAGE_FILE_SIZE_LIMIT': 10,
     'OUTPUT_MODERATION_BUFFER_SIZE': 300,
     'MULTIMODAL_SEND_IMAGE_FORMAT': 'base64',
-    'INVITE_EXPIRY_HOURS': 72
+    'INVITE_EXPIRY_HOURS': 72,
+    'ETL_TYPE': 'dify',
 }

         self.HOSTED_MODERATION_ENABLED = get_bool_env('HOSTED_MODERATION_ENABLED')
         self.HOSTED_MODERATION_PROVIDERS = get_env('HOSTED_MODERATION_PROVIDERS')
+        self.ETL_TYPE = get_env('ETL_TYPE')
+        self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')


 class CloudEditionConfig(Config):
         return {'content': text}


+class FileeSupportTypApi(Resource):
+    @setup_required
+    @login_required
+    @account_initialization_required
+    def get(self):
+        etl_type = current_app.config['ETL_TYPE']
+        if etl_type == 'Unstructured':
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
+        else:
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
+        return {'allowed_extensions': allowed_extensions}
+
+
 api.add_resource(FileApi, '/files/upload')
 api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
+api.add_resource(FileeSupportTypApi, '/files/support-type')
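A quick usage sketch for the new endpoint (URL prefix and port are placeholders, and the route requires a logged-in console session, which is omitted here):

import requests

# Hypothetical local instance; '/console/api' is assumed as the mount prefix.
resp = requests.get('http://localhost:5001/console/api/files/support-type')
print(resp.json())
# With ETL_TYPE left at 'dify', the handler above returns:
# {'allowed_extensions': ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']}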
 from typing import List, Union, Optional

 import requests
-from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
+from flask import current_app
+from langchain.document_loaders import TextLoader, Docx2txtLoader
 from langchain.schema import Document

 from core.data_loader.loader.csv_loader import CSVLoader
 from core.data_loader.loader.html import HTMLLoader
 from core.data_loader.loader.markdown import MarkdownLoader
 from core.data_loader.loader.pdf import PdfLoader
+from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
+from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
+from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
+from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
+from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
+from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
+from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
 from extensions.ext_storage import storage
 from models.model import UploadFile

         input_file = Path(file_path)
         delimiter = '\n'
         file_extension = input_file.suffix.lower()
-        if is_automatic:
-            loader = UnstructuredFileLoader(
-                file_path, strategy="hi_res", mode="elements"
-            )
-            # loader = UnstructuredAPIFileLoader(
-            #     file_path=filenames[0],
-            #     api_key="FAKE_API_KEY",
-            # )
+        etl_type = current_app.config['ETL_TYPE']
+        unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
+        if etl_type == 'Unstructured':
+            if file_extension == '.xlsx':
+                loader = ExcelLoader(file_path)
+            elif file_extension == '.pdf':
+                loader = PdfLoader(file_path, upload_file=upload_file)
+            elif file_extension in ['.md', '.markdown']:
+                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
+            elif file_extension in ['.htm', '.html']:
+                loader = HTMLLoader(file_path)
+            elif file_extension == '.docx':
+                loader = Docx2txtLoader(file_path)
+            elif file_extension == '.csv':
+                loader = CSVLoader(file_path, autodetect_encoding=True)
+            elif file_extension == '.msg':
+                loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
+            elif file_extension == '.eml':
+                loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
+            elif file_extension == '.ppt':
+                loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
+            elif file_extension == '.pptx':
+                loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
+            elif file_extension == '.xml':
+                loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
+            else:
+                # txt
+                loader = UnstructuredTextLoader(file_path, unstructured_api_url)
 else:
     if file_extension == '.xlsx':
         loader = ExcelLoader(file_path)
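Stepping outside the diff: the elif chain above dispatches purely on file extension, so a table-driven variant is one possible refactor. The pick_loader helper below is illustrative only (it assumes the unstructured loader imports shown above) and is not part of this change:

from functools import partial


def pick_loader(file_path: str, file_extension: str, unstructured_api_url: str):
    # Hypothetical dispatch table; only a few of the supported extensions are
    # shown, and anything unmatched falls back to the plain-text loader.
    table = {
        '.md': partial(UnstructuredMarkdownLoader, file_path, unstructured_api_url),
        '.markdown': partial(UnstructuredMarkdownLoader, file_path, unstructured_api_url),
        '.msg': partial(UnstructuredMsgLoader, file_path, unstructured_api_url),
        '.eml': partial(UnstructuredEmailLoader, file_path, unstructured_api_url),
        '.xml': partial(UnstructuredXmlLoader, file_path, unstructured_api_url),
    }
    factory = table.get(file_extension,
                        partial(UnstructuredTextLoader, file_path, unstructured_api_url))
    return factory()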
import logging
from typing import List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredEmailLoader(BaseLoader):
    """Load eml (email) files via the unstructured library.

    Args:
        file_path: Path to the file to load.
        api_url: URL of the Unstructured API endpoint, if one is configured.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        from unstructured.partition.email import partition_email

        elements = partition_email(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
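A minimal usage sketch for the loader above (the path is a placeholder; in the PR the loader is constructed by FileExtractor with the configured UNSTRUCTURED_API_URL):

# Hypothetical standalone use; partitioning .eml locally also needs the
# corresponding unstructured dependencies installed.
loader = UnstructuredEmailLoader('/tmp/example.eml', api_url='')
docs = loader.load()
for doc in docs:
    print(len(doc.page_content), doc.page_content[:80])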
import logging
from typing import List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredMarkdownLoader(BaseLoader):
    """Load md files via the unstructured library.

    Args:
        file_path: Path to the file to load.
        api_url: URL of the Unstructured API endpoint, if one is configured.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str,
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        from unstructured.partition.md import partition_md

        elements = partition_md(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
import logging
from typing import List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredMsgLoader(BaseLoader):
    """Load msg (Outlook email) files via the unstructured library.

    Args:
        file_path: Path to the file to load.
        api_url: URL of the Unstructured API endpoint, if one is configured.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        from unstructured.partition.msg import partition_msg

        elements = partition_msg(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
import logging
from typing import List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredPPTLoader(BaseLoader):
    """Load ppt (legacy PowerPoint) files via the unstructured library.

    Args:
        file_path: Path to the file to load.
        api_url: URL of the Unstructured API endpoint, if one is configured.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        from unstructured.partition.ppt import partition_ppt

        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
import logging
from typing import List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredPPTXLoader(BaseLoader):
    """Load pptx (PowerPoint) files via the unstructured library.

    Args:
        file_path: Path to the file to load.
        api_url: URL of the Unstructured API endpoint, if one is configured.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        from unstructured.partition.pptx import partition_pptx

        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
import logging
from typing import List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredTextLoader(BaseLoader):
    """Load plain text (txt) files via the unstructured library.

    Args:
        file_path: Path to the file to load.
        api_url: URL of the Unstructured API endpoint, if one is configured.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        from unstructured.partition.text import partition_text

        elements = partition_text(filename=self._file_path, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
import logging
from typing import List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document

logger = logging.getLogger(__name__)


class UnstructuredXmlLoader(BaseLoader):
    """Load xml files via the unstructured library.

    Args:
        file_path: Path to the file to load.
        api_url: URL of the Unstructured API endpoint, if one is configured.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str
    ):
        """Initialize with file path."""
        self._file_path = file_path
        self._api_url = api_url

    def load(self) -> List[Document]:
        from unstructured.partition.xml import partition_xml

        elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)

        from unstructured.chunking.title import chunk_by_title
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        documents = []
        for chunk in chunks:
            text = chunk.text.strip()
            documents.append(Document(page_content=text))

        return documents
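The loaders above differ mainly in which partition function they call; a hedged sketch of one way the shared load() logic could be factored out (this base class is illustrative, not part of the change):

from typing import Callable, List

from langchain.document_loaders.base import BaseLoader
from langchain.schema import Document


class ChunkedUnstructuredLoader(BaseLoader):
    """Hypothetical shared base: partition a file, then chunk elements by title."""

    def __init__(self, file_path: str, api_url: str, partition: Callable):
        self._file_path = file_path
        self._api_url = api_url
        self._partition = partition

    def load(self) -> List[Document]:
        from unstructured.chunking.title import chunk_by_title

        elements = self._partition(filename=self._file_path, api_url=self._api_url)
        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
        return [Document(page_content=chunk.text.strip()) for chunk in chunks]

Each concrete loader would then only supply its partition callable, e.g. partition_msg, or functools.partial(partition_xml, xml_keep_tags=True) for the XML case.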
                 one_or_none()
             if file_detail:
-                text_docs = FileExtractor.load(file_detail, is_automatic=False)
+                text_docs = FileExtractor.load(file_detail, is_automatic=True)
         elif dataset_document.data_source_type == 'notion_import':
             loader = NotionLoader.from_document(dataset_document)
             text_docs = loader.load()
             ],
             'segmentation': {
                 'delimiter': '\n',
-                'max_tokens': 512
+                'max_tokens': 1000
             }
         }
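For illustration, a rough sketch of how a segmentation rule like the one above maps onto chunking parameters, using a LangChain splitter rather than Dify's actual segmentation code, and counting characters instead of model tokens:

from langchain.text_splitter import RecursiveCharacterTextSplitter

rule = {'segmentation': {'delimiter': '\n', 'max_tokens': 1000}}

splitter = RecursiveCharacterTextSplitter(
    separators=[rule['segmentation']['delimiter']],
    chunk_size=rule['segmentation']['max_tokens'],  # approximation: characters, not tokens
    chunk_overlap=0,
)
chunks = splitter.split_text('line one\nline two\n' * 400)
print(len(chunks))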
 werkzeug==2.3.7
 pymilvus==2.3.0
 qdrant-client==1.6.4
 cohere~=4.32
-unstructured~=0.10.27
+unstructured[docx,pptx]~=0.10.27
     @staticmethod
     def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile:
         extension = file.filename.split('.')[-1]
-        if extension.lower() not in ALLOWED_EXTENSIONS:
+        etl_type = current_app.config['ETL_TYPE']
+        if etl_type == 'Unstructured':
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
+        else:
+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
+        if extension.lower() not in allowed_extensions:
             raise UnsupportedFileTypeError()
         elif only_image and extension.lower() not in IMAGE_EXTENSIONS:
             raise UnsupportedFileTypeError()
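The allow-list above now exists both here and in FileeSupportTypApi; a hedged sketch of a shared helper that both call sites could use (the names below are hypothetical, not part of this change):

from flask import current_app

UNSTRUCTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
                                   'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
DEFAULT_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']


def get_allowed_extensions() -> list:
    """Hypothetical helper so the upload check and /files/support-type share one list."""
    if current_app.config['ETL_TYPE'] == 'Unstructured':
        return UNSTRUCTURED_ALLOWED_EXTENSIONS
    return DEFAULT_ALLOWED_EXTENSIONS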