| UPLOAD_AUDIO_FILE_SIZE_LIMIT=50 | UPLOAD_AUDIO_FILE_SIZE_LIMIT=50 | ||||
| # Model configuration | # Model configuration | ||||
| MULTIMODAL_SEND_IMAGE_FORMAT=base64 | |||||
| MULTIMODAL_SEND_VIDEO_FORMAT=base64 | |||||
| MULTIMODAL_SEND_FORMAT=base64 | |||||
| PROMPT_GENERATION_MAX_TOKENS=512 | PROMPT_GENERATION_MAX_TOKENS=512 | ||||
| CODE_GENERATION_MAX_TOKENS=1024 | CODE_GENERATION_MAX_TOKENS=1024 | ||||
| ) | ) | ||||
| class VisionFormatConfig(BaseSettings): | |||||
| MULTIMODAL_SEND_IMAGE_FORMAT: Literal["base64", "url"] = Field( | |||||
| description="Format for sending images in multimodal contexts ('base64' or 'url'), default is base64", | |||||
| default="base64", | |||||
| ) | |||||
| MULTIMODAL_SEND_VIDEO_FORMAT: Literal["base64", "url"] = Field( | |||||
| description="Format for sending videos in multimodal contexts ('base64' or 'url'), default is base64", | |||||
| class MultiModalTransferConfig(BaseSettings): | |||||
| MULTIMODAL_SEND_FORMAT: Literal["base64", "url"] = Field( | |||||
| description="Format for sending files in multimodal contexts ('base64' or 'url'), default is base64", | |||||
| default="base64", | default="base64", | ||||
| ) | ) | ||||
| FileAccessConfig, | FileAccessConfig, | ||||
| FileUploadConfig, | FileUploadConfig, | ||||
| HttpConfig, | HttpConfig, | ||||
| VisionFormatConfig, | |||||
| InnerAPIConfig, | InnerAPIConfig, | ||||
| IndexingConfig, | IndexingConfig, | ||||
| LoggingConfig, | LoggingConfig, | ||||
| MailConfig, | MailConfig, | ||||
| ModelLoadBalanceConfig, | ModelLoadBalanceConfig, | ||||
| ModerationConfig, | ModerationConfig, | ||||
| MultiModalTransferConfig, | |||||
| PositionConfig, | PositionConfig, | ||||
| RagEtlConfig, | RagEtlConfig, | ||||
| SecurityConfig, | SecurityConfig, |
| *, | *, | ||||
| image_detail_config: ImagePromptMessageContent.DETAIL | None = None, | image_detail_config: ImagePromptMessageContent.DETAIL | None = None, | ||||
| ): | ): | ||||
| match f.type: | |||||
| case FileType.IMAGE: | |||||
| image_detail_config = image_detail_config or ImagePromptMessageContent.DETAIL.LOW | |||||
| if dify_config.MULTIMODAL_SEND_IMAGE_FORMAT == "url": | |||||
| data = _to_url(f) | |||||
| else: | |||||
| data = _to_base64_data_string(f) | |||||
| return ImagePromptMessageContent(data=data, detail=image_detail_config, format=f.extension.lstrip(".")) | |||||
| case FileType.AUDIO: | |||||
| data = _to_base64_data_string(f) | |||||
| if f.extension is None: | |||||
| raise ValueError("Missing file extension") | |||||
| return AudioPromptMessageContent(data=data, format=f.extension.lstrip(".")) | |||||
| case FileType.VIDEO: | |||||
| if dify_config.MULTIMODAL_SEND_VIDEO_FORMAT == "url": | |||||
| data = _to_url(f) | |||||
| else: | |||||
| data = _to_base64_data_string(f) | |||||
| if f.extension is None: | |||||
| raise ValueError("Missing file extension") | |||||
| return VideoPromptMessageContent(data=data, format=f.extension.lstrip(".")) | |||||
| case FileType.DOCUMENT: | |||||
| data = _to_base64_data_string(f) | |||||
| return DocumentPromptMessageContent(encode_format="base64", data=data, format=f.extension.lstrip(".")) | |||||
| case _: | |||||
| raise ValueError(f"file type {f.type} is not supported") | |||||
| if f.extension is None: | |||||
| raise ValueError("Missing file extension") | |||||
| if f.mime_type is None: | |||||
| raise ValueError("Missing file mime_type") | |||||
| params = { | |||||
| "base64_data": _get_encoded_string(f) if dify_config.MULTIMODAL_SEND_FORMAT == "base64" else "", | |||||
| "url": _to_url(f) if dify_config.MULTIMODAL_SEND_FORMAT == "url" else "", | |||||
| "format": f.extension.removeprefix("."), | |||||
| "mime_type": f.mime_type, | |||||
| } | |||||
| if f.type == FileType.IMAGE: | |||||
| params["detail"] = image_detail_config or ImagePromptMessageContent.DETAIL.LOW | |||||
| prompt_class_map = { | |||||
| FileType.IMAGE: ImagePromptMessageContent, | |||||
| FileType.AUDIO: AudioPromptMessageContent, | |||||
| FileType.VIDEO: VideoPromptMessageContent, | |||||
| FileType.DOCUMENT: DocumentPromptMessageContent, | |||||
| } | |||||
| try: | |||||
| return prompt_class_map[f.type](**params) | |||||
| except KeyError: | |||||
| raise ValueError(f"file type {f.type} is not supported") | |||||
| def download(f: File, /): | def download(f: File, /): | ||||
| return encoded_string | return encoded_string | ||||
| def _to_base64_data_string(f: File, /): | |||||
| encoded_string = _get_encoded_string(f) | |||||
| return f"data:{f.mime_type};base64,{encoded_string}" | |||||
| def _to_url(f: File, /): | def _to_url(f: File, /): | ||||
| if f.transfer_method == FileTransferMethod.REMOTE_URL: | if f.transfer_method == FileTransferMethod.REMOTE_URL: | ||||
| if f.remote_url is None: | if f.remote_url is None: |
| from abc import ABC | from abc import ABC | ||||
| from collections.abc import Sequence | from collections.abc import Sequence | ||||
| from enum import Enum, StrEnum | from enum import Enum, StrEnum | ||||
| from typing import Literal, Optional | |||||
| from typing import Optional | |||||
| from pydantic import BaseModel, Field, field_validator | |||||
| from pydantic import BaseModel, Field, computed_field, field_validator | |||||
| class PromptMessageRole(Enum): | class PromptMessageRole(Enum): | ||||
| """ | """ | ||||
| type: PromptMessageContentType | type: PromptMessageContentType | ||||
| data: str | |||||
| class TextPromptMessageContent(PromptMessageContent): | class TextPromptMessageContent(PromptMessageContent): | ||||
| """ | """ | ||||
| type: PromptMessageContentType = PromptMessageContentType.TEXT | type: PromptMessageContentType = PromptMessageContentType.TEXT | ||||
| data: str | |||||
| class MultiModalPromptMessageContent(PromptMessageContent): | |||||
| """ | |||||
| Model class for multi-modal prompt message content. | |||||
| """ | |||||
| type: PromptMessageContentType | |||||
| format: str = Field(..., description="the format of multi-modal file") | |||||
| base64_data: str = Field("", description="the base64 data of multi-modal file") | |||||
| url: str = Field("", description="the url of multi-modal file") | |||||
| mime_type: str = Field(..., description="the mime type of multi-modal file") | |||||
| @computed_field(return_type=str) | |||||
| @property | |||||
| def data(self): | |||||
| return self.url or f"data:{self.mime_type};base64,{self.base64_data}" | |||||
| class VideoPromptMessageContent(PromptMessageContent): | |||||
| class VideoPromptMessageContent(MultiModalPromptMessageContent): | |||||
| type: PromptMessageContentType = PromptMessageContentType.VIDEO | type: PromptMessageContentType = PromptMessageContentType.VIDEO | ||||
| data: str = Field(..., description="Base64 encoded video data") | |||||
| format: str = Field(..., description="Video format") | |||||
| class AudioPromptMessageContent(PromptMessageContent): | |||||
| class AudioPromptMessageContent(MultiModalPromptMessageContent): | |||||
| type: PromptMessageContentType = PromptMessageContentType.AUDIO | type: PromptMessageContentType = PromptMessageContentType.AUDIO | ||||
| data: str = Field(..., description="Base64 encoded audio data") | |||||
| format: str = Field(..., description="Audio format") | |||||
| class ImagePromptMessageContent(PromptMessageContent): | |||||
| class ImagePromptMessageContent(MultiModalPromptMessageContent): | |||||
| """ | """ | ||||
| Model class for image prompt message content. | Model class for image prompt message content. | ||||
| """ | """ | ||||
| type: PromptMessageContentType = PromptMessageContentType.IMAGE | type: PromptMessageContentType = PromptMessageContentType.IMAGE | ||||
| detail: DETAIL = DETAIL.LOW | detail: DETAIL = DETAIL.LOW | ||||
| format: str = Field("jpg", description="Image format") | |||||
| class DocumentPromptMessageContent(PromptMessageContent): | |||||
| class DocumentPromptMessageContent(MultiModalPromptMessageContent): | |||||
| type: PromptMessageContentType = PromptMessageContentType.DOCUMENT | type: PromptMessageContentType = PromptMessageContentType.DOCUMENT | ||||
| encode_format: Literal["base64"] | |||||
| data: str | |||||
| format: str = Field(..., description="Document format") | |||||
| class PromptMessage(ABC, BaseModel): | class PromptMessage(ABC, BaseModel): |
| import base64 | import base64 | ||||
| import io | |||||
| import json | import json | ||||
| from collections.abc import Generator, Sequence | from collections.abc import Generator, Sequence | ||||
| from typing import Optional, Union, cast | from typing import Optional, Union, cast | ||||
| ) | ) | ||||
| from anthropic.types.beta.tools import ToolsBetaMessage | from anthropic.types.beta.tools import ToolsBetaMessage | ||||
| from httpx import Timeout | from httpx import Timeout | ||||
| from PIL import Image | |||||
| from core.model_runtime.callbacks.base_callback import Callback | from core.model_runtime.callbacks.base_callback import Callback | ||||
| from core.model_runtime.entities import ( | from core.model_runtime.entities import ( | ||||
| sub_messages.append(sub_message_dict) | sub_messages.append(sub_message_dict) | ||||
| elif message_content.type == PromptMessageContentType.IMAGE: | elif message_content.type == PromptMessageContentType.IMAGE: | ||||
| message_content = cast(ImagePromptMessageContent, message_content) | message_content = cast(ImagePromptMessageContent, message_content) | ||||
| if not message_content.data.startswith("data:"): | |||||
| if not message_content.base64_data: | |||||
| # fetch image data from url | # fetch image data from url | ||||
| try: | try: | ||||
| image_content = requests.get(message_content.data).content | |||||
| with Image.open(io.BytesIO(image_content)) as img: | |||||
| mime_type = f"image/{img.format.lower()}" | |||||
| image_content = requests.get(message_content.url).content | |||||
| base64_data = base64.b64encode(image_content).decode("utf-8") | base64_data = base64.b64encode(image_content).decode("utf-8") | ||||
| except Exception as ex: | except Exception as ex: | ||||
| raise ValueError( | raise ValueError( | ||||
| f"Failed to fetch image data from url {message_content.data}, {ex}" | f"Failed to fetch image data from url {message_content.data}, {ex}" | ||||
| ) | ) | ||||
| else: | else: | ||||
| data_split = message_content.data.split(";base64,") | |||||
| mime_type = data_split[0].replace("data:", "") | |||||
| base64_data = data_split[1] | |||||
| base64_data = message_content.base64_data | |||||
| mime_type = message_content.mime_type | |||||
| if mime_type not in {"image/jpeg", "image/png", "image/gif", "image/webp"}: | if mime_type not in {"image/jpeg", "image/png", "image/gif", "image/webp"}: | ||||
| raise ValueError( | raise ValueError( | ||||
| f"Unsupported image type {mime_type}, " | f"Unsupported image type {mime_type}, " | ||||
| } | } | ||||
| sub_messages.append(sub_message_dict) | sub_messages.append(sub_message_dict) | ||||
| elif isinstance(message_content, DocumentPromptMessageContent): | elif isinstance(message_content, DocumentPromptMessageContent): | ||||
| data_split = message_content.data.split(";base64,") | |||||
| mime_type = data_split[0].replace("data:", "") | |||||
| base64_data = data_split[1] | |||||
| if mime_type != "application/pdf": | |||||
| if message_content.mime_type != "application/pdf": | |||||
| raise ValueError( | raise ValueError( | ||||
| f"Unsupported document type {mime_type}, " "only support application/pdf" | |||||
| f"Unsupported document type {message_content.mime_type}, " | |||||
| "only support application/pdf" | |||||
| ) | ) | ||||
| sub_message_dict = { | sub_message_dict = { | ||||
| "type": "document", | "type": "document", | ||||
| "source": { | "source": { | ||||
| "type": message_content.encode_format, | |||||
| "media_type": mime_type, | |||||
| "data": base64_data, | |||||
| "type": "base64", | |||||
| "media_type": message_content.mime_type, | |||||
| "data": message_content.data, | |||||
| }, | }, | ||||
| } | } | ||||
| sub_messages.append(sub_message_dict) | sub_messages.append(sub_message_dict) |
| sub_messages.append(sub_message_dict) | sub_messages.append(sub_message_dict) | ||||
| elif message_content.type == PromptMessageContentType.VIDEO: | elif message_content.type == PromptMessageContentType.VIDEO: | ||||
| message_content = cast(VideoPromptMessageContent, message_content) | message_content = cast(VideoPromptMessageContent, message_content) | ||||
| video_url = message_content.data | |||||
| if message_content.data.startswith("data:"): | |||||
| raise InvokeError("not support base64, please set MULTIMODAL_SEND_VIDEO_FORMAT to url") | |||||
| video_url = message_content.url | |||||
| if not video_url: | |||||
| raise InvokeError("not support base64, please set MULTIMODAL_SEND_FORMAT to url") | |||||
| sub_message_dict = {"video": video_url} | sub_message_dict = {"video": video_url} | ||||
| sub_messages.append(sub_message_dict) | sub_messages.append(sub_message_dict) |
| import pytest | import pytest | ||||
| from configs import dify_config | |||||
| from core.app.app_config.entities import ModelConfigEntity | from core.app.app_config.entities import ModelConfigEntity | ||||
| from core.file import File, FileTransferMethod, FileType, FileUploadConfig, ImageConfig | from core.file import File, FileTransferMethod, FileType, FileUploadConfig, ImageConfig | ||||
| from core.memory.token_buffer_memory import TokenBufferMemory | from core.memory.token_buffer_memory import TokenBufferMemory | ||||
| def test__get_chat_model_prompt_messages_with_files_no_memory(get_chat_model_args): | def test__get_chat_model_prompt_messages_with_files_no_memory(get_chat_model_args): | ||||
| model_config_mock, _, messages, inputs, context = get_chat_model_args | model_config_mock, _, messages, inputs, context = get_chat_model_args | ||||
| dify_config.MULTIMODAL_SEND_FORMAT = "url" | |||||
| files = [ | files = [ | ||||
| File( | File( | ||||
| prompt_transform = AdvancedPromptTransform() | prompt_transform = AdvancedPromptTransform() | ||||
| prompt_transform._calculate_rest_token = MagicMock(return_value=2000) | prompt_transform._calculate_rest_token = MagicMock(return_value=2000) | ||||
| with patch("core.file.file_manager.to_prompt_message_content") as mock_get_encoded_string: | with patch("core.file.file_manager.to_prompt_message_content") as mock_get_encoded_string: | ||||
| mock_get_encoded_string.return_value = ImagePromptMessageContent(data=str(files[0].remote_url)) | |||||
| mock_get_encoded_string.return_value = ImagePromptMessageContent( | |||||
| url=str(files[0].remote_url), format="jpg", mime_type="image/jpg" | |||||
| ) | |||||
| prompt_messages = prompt_transform._get_chat_model_prompt_messages( | prompt_messages = prompt_transform._get_chat_model_prompt_messages( | ||||
| prompt_template=messages, | prompt_template=messages, | ||||
| inputs=inputs, | inputs=inputs, |
| TextPromptMessageContent, | TextPromptMessageContent, | ||||
| UserPromptMessage, | UserPromptMessage, | ||||
| ) | ) | ||||
| from core.model_runtime.entities.model_entities import AIModelEntity, FetchFrom, ModelFeature, ModelType, ProviderModel | |||||
| from core.model_runtime.entities.provider_entities import ConfigurateMethod, ProviderEntity | |||||
| from core.model_runtime.entities.model_entities import AIModelEntity, FetchFrom, ModelFeature, ModelType | |||||
| from core.model_runtime.model_providers.model_provider_factory import ModelProviderFactory | from core.model_runtime.model_providers.model_provider_factory import ModelProviderFactory | ||||
| from core.prompt.entities.advanced_prompt_entities import MemoryConfig | from core.prompt.entities.advanced_prompt_entities import MemoryConfig | ||||
| from core.variables import ArrayAnySegment, ArrayFileSegment, NoneSegment | from core.variables import ArrayAnySegment, ArrayFileSegment, NoneSegment | ||||
| def test_fetch_prompt_messages__basic(faker, llm_node, model_config): | def test_fetch_prompt_messages__basic(faker, llm_node, model_config): | ||||
| # Setup dify config | # Setup dify config | ||||
| dify_config.MULTIMODAL_SEND_IMAGE_FORMAT = "url" | |||||
| dify_config.MULTIMODAL_SEND_VIDEO_FORMAT = "url" | |||||
| dify_config.MULTIMODAL_SEND_FORMAT = "url" | |||||
| # Generate fake values for prompt template | # Generate fake values for prompt template | ||||
| fake_assistant_prompt = faker.sentence() | fake_assistant_prompt = faker.sentence() | ||||
| tenant_id="test", | tenant_id="test", | ||||
| type=FileType.IMAGE, | type=FileType.IMAGE, | ||||
| filename="test1.jpg", | filename="test1.jpg", | ||||
| extension=".jpg", | |||||
| transfer_method=FileTransferMethod.REMOTE_URL, | transfer_method=FileTransferMethod.REMOTE_URL, | ||||
| remote_url=fake_remote_url, | remote_url=fake_remote_url, | ||||
| extension=".jpg", | |||||
| mime_type="image/jpg", | |||||
| ) | ) | ||||
| ], | ], | ||||
| vision_enabled=True, | vision_enabled=True, | ||||
| UserPromptMessage( | UserPromptMessage( | ||||
| content=[ | content=[ | ||||
| TextPromptMessageContent(data=fake_query), | TextPromptMessageContent(data=fake_query), | ||||
| ImagePromptMessageContent(data=fake_remote_url, detail=fake_vision_detail), | |||||
| ImagePromptMessageContent( | |||||
| url=fake_remote_url, mime_type="image/jpg", format="jpg", detail=fake_vision_detail | |||||
| ), | |||||
| ] | ] | ||||
| ), | ), | ||||
| ], | ], | ||||
| expected_messages=[ | expected_messages=[ | ||||
| UserPromptMessage( | UserPromptMessage( | ||||
| content=[ | content=[ | ||||
| ImagePromptMessageContent(data=fake_remote_url, detail=fake_vision_detail), | |||||
| ImagePromptMessageContent( | |||||
| url=fake_remote_url, mime_type="image/jpg", format="jpg", detail=fake_vision_detail | |||||
| ), | |||||
| ] | ] | ||||
| ), | ), | ||||
| ] | ] | ||||
| tenant_id="test", | tenant_id="test", | ||||
| type=FileType.IMAGE, | type=FileType.IMAGE, | ||||
| filename="test1.jpg", | filename="test1.jpg", | ||||
| extension=".jpg", | |||||
| transfer_method=FileTransferMethod.REMOTE_URL, | transfer_method=FileTransferMethod.REMOTE_URL, | ||||
| remote_url=fake_remote_url, | remote_url=fake_remote_url, | ||||
| extension=".jpg", | |||||
| mime_type="image/jpg", | |||||
| ) | ) | ||||
| }, | }, | ||||
| ), | ), |
| # Multi-modal Configuration | # Multi-modal Configuration | ||||
| # ------------------------------ | # ------------------------------ | ||||
| # The format of the image/video sent when the multi-modal model is input, | |||||
| # The format of the image/video/audio/document sent when the multi-modal model is input, | |||||
| # the default is base64, optional url. | # the default is base64, optional url. | ||||
| # The delay of the call in url mode will be lower than that in base64 mode. | # The delay of the call in url mode will be lower than that in base64 mode. | ||||
| # It is generally recommended to use the more compatible base64 mode. | # It is generally recommended to use the more compatible base64 mode. | ||||
| # If configured as url, you need to configure FILES_URL as an externally accessible address so that the multi-modal model can access the image/video. | |||||
| MULTIMODAL_SEND_IMAGE_FORMAT=base64 | |||||
| MULTIMODAL_SEND_VIDEO_FORMAT=base64 | |||||
| # If configured as url, you need to configure FILES_URL as an externally accessible address so that the multi-modal model can access the image/video/audio/document. | |||||
| MULTIMODAL_SEND_FORMAT=base64 | |||||
| # Upload image file size limit, default 10M. | # Upload image file size limit, default 10M. | ||||
| UPLOAD_IMAGE_FILE_SIZE_LIMIT=10 | UPLOAD_IMAGE_FILE_SIZE_LIMIT=10 |
| UNSTRUCTURED_API_KEY: ${UNSTRUCTURED_API_KEY:-} | UNSTRUCTURED_API_KEY: ${UNSTRUCTURED_API_KEY:-} | ||||
| PROMPT_GENERATION_MAX_TOKENS: ${PROMPT_GENERATION_MAX_TOKENS:-512} | PROMPT_GENERATION_MAX_TOKENS: ${PROMPT_GENERATION_MAX_TOKENS:-512} | ||||
| CODE_GENERATION_MAX_TOKENS: ${CODE_GENERATION_MAX_TOKENS:-1024} | CODE_GENERATION_MAX_TOKENS: ${CODE_GENERATION_MAX_TOKENS:-1024} | ||||
| MULTIMODAL_SEND_IMAGE_FORMAT: ${MULTIMODAL_SEND_IMAGE_FORMAT:-base64} | |||||
| MULTIMODAL_SEND_VIDEO_FORMAT: ${MULTIMODAL_SEND_VIDEO_FORMAT:-base64} | |||||
| MULTIMODAL_SEND_FORMAT: ${MULTIMODAL_SEND_FORMAT:-base64} | |||||
| UPLOAD_IMAGE_FILE_SIZE_LIMIT: ${UPLOAD_IMAGE_FILE_SIZE_LIMIT:-10} | UPLOAD_IMAGE_FILE_SIZE_LIMIT: ${UPLOAD_IMAGE_FILE_SIZE_LIMIT:-10} | ||||
| UPLOAD_VIDEO_FILE_SIZE_LIMIT: ${UPLOAD_VIDEO_FILE_SIZE_LIMIT:-100} | UPLOAD_VIDEO_FILE_SIZE_LIMIT: ${UPLOAD_VIDEO_FILE_SIZE_LIMIT:-100} | ||||
| UPLOAD_AUDIO_FILE_SIZE_LIMIT: ${UPLOAD_AUDIO_FILE_SIZE_LIMIT:-50} | UPLOAD_AUDIO_FILE_SIZE_LIMIT: ${UPLOAD_AUDIO_FILE_SIZE_LIMIT:-50} |