@@ -1,3 +1,7 @@
+- deepseek-r1
+- deepseek-r1-distill-qwen-14b
+- deepseek-r1-distill-qwen-32b
+- deepseek-v3
 - qwen-vl-max-0809
 - qwen-vl-max-0201
 - qwen-vl-max
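
The list above appears to be the provider's `_position.yaml`, which controls the display order of model cards; the four DeepSeek entries are prepended ahead of the existing Qwen-VL models. A minimal sketch of how such an ordering file could be consumed, assuming a plain YAML list of model names (the loader below is hypothetical, not Dify's actual API):

```python
# Hypothetical loader for a _position.yaml ordering file; not Dify's API.
import yaml

with open("_position.yaml") as f:
    position: list[str] = yaml.safe_load(f)  # e.g. ["deepseek-r1", ...]

def sort_models(model_names: list[str]) -> list[str]:
    # Listed models come first, in file order; unlisted ones go to the end.
    rank = {name: i for i, name in enumerate(position)}
    return sorted(model_names, key=lambda name: rank.get(name, len(rank)))
```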
@@ -0,0 +1,21 @@
+model: deepseek-r1-distill-qwen-14b
+label:
+  zh_Hans: DeepSeek-R1-Distill-Qwen-14B
+  en_US: DeepSeek-R1-Distill-Qwen-14B
+model_type: llm
+features:
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 32000
+parameter_rules:
+  - name: max_tokens
+    use_template: max_tokens
+    min: 1
+    max: 8192
+    default: 4096
+pricing:
+  input: "0.001"
+  output: "0.003"
+  unit: "0.001"
+  currency: RMB
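
The `max_tokens` rule above declares a 1–8192 range with a default of 4096. A minimal sketch of the clamping behavior such a rule implies, with the bounds copied from the YAML (the helper itself is illustrative, not part of Dify's runtime):

```python
# Illustrative clamp for the max_tokens rule above (min 1, max 8192, default 4096).
def resolve_max_tokens(value: int | None, *, minimum: int = 1,
                       maximum: int = 8192, default: int = 4096) -> int:
    if value is None:
        return default                        # rule default when unset
    return max(minimum, min(value, maximum))  # clamp into [min, max]

assert resolve_max_tokens(None) == 4096
assert resolve_max_tokens(20_000) == 8192     # out-of-range values are clamped
```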
@@ -0,0 +1,21 @@
+model: deepseek-r1-distill-qwen-32b
+label:
+  zh_Hans: DeepSeek-R1-Distill-Qwen-32B
+  en_US: DeepSeek-R1-Distill-Qwen-32B
+model_type: llm
+features:
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 32000
+parameter_rules:
+  - name: max_tokens
+    use_template: max_tokens
+    min: 1
+    max: 8192
+    default: 4096
+pricing:
+  input: "0.002"
+  output: "0.006"
+  unit: "0.001"
+  currency: RMB
@@ -0,0 +1,21 @@
+model: deepseek-r1
+label:
+  zh_Hans: DeepSeek-R1
+  en_US: DeepSeek-R1
+model_type: llm
+features:
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 64000
+parameter_rules:
+  - name: max_tokens
+    use_template: max_tokens
+    min: 1
+    max: 8192
+    default: 4096
+pricing:
+  input: "0.004"
+  output: "0.016"
+  unit: "0.001"
+  currency: RMB
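
A worked example of how the pricing block is commonly interpreted, assuming `unit: "0.001"` means the quoted price applies per 1/0.001 = 1,000 tokens, i.e. the per-token price is `price * unit` (a sketch of the arithmetic, not Dify's billing code):

```python
from decimal import Decimal

def cost(tokens: int, price: str, unit: str = "0.001") -> Decimal:
    # per-token price = quoted price * unit, so total = tokens * price * unit
    return Decimal(tokens) * Decimal(price) * Decimal(unit)

# deepseek-r1: 10,000 prompt tokens and 2,000 completion tokens
total = cost(10_000, "0.004") + cost(2_000, "0.016")
print(total, "RMB")  # 0.040 + 0.032 = 0.072 RMB
```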
@@ -0,0 +1,52 @@
+model: deepseek-v3
+label:
+  zh_Hans: DeepSeek-V3
+  en_US: DeepSeek-V3
+model_type: llm
+features:
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 64000
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 512
+    min: 1
+    max: 4096
+    help:
+      zh_Hans: 指定生成结果长度的上限。如果生成结果截断，可以调大该参数。
+      en_US: Specifies the upper limit on the length of generated results. If the generated results are truncated, you can increase this parameter.
+  - name: top_p
+    use_template: top_p
+  - name: top_k
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    type: int
+    help:
+      zh_Hans: 仅从每个后续标记的前 K 个选项中采样。
+      en_US: Only sample from the top K options for each subsequent token.
+    required: false
+  - name: frequency_penalty
+    use_template: frequency_penalty
+  - name: response_format
+    label:
+      zh_Hans: 回复格式
+      en_US: Response Format
+    type: string
+    help:
+      zh_Hans: 指定模型必须输出的格式
+      en_US: Specify the format that the model must output.
+    required: false
+    options:
+      - text
+      - json_object
+pricing:
+  input: "0.002"
+  output: "0.008"
+  unit: "0.001"
+  currency: RMB
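
Taken together, deepseek-v3's rules map directly onto call-time parameters, including the `response_format` rule restricted to `text` or `json_object`. A hedged sketch of a parameter dict shaped by the rules above; the names mirror the YAML, while the plumbing through the provider runtime is omitted:

```python
# Parameter dict shaped by the deepseek-v3 rules above (values are examples).
ALLOWED_RESPONSE_FORMATS = {"text", "json_object"}  # from the rule's options

def build_params(response_format: str = "text") -> dict:
    if response_format not in ALLOWED_RESPONSE_FORMATS:
        raise ValueError(f"response_format must be one of {sorted(ALLOWED_RESPONSE_FORMATS)}")
    return {
        "model": "deepseek-v3",
        "temperature": 0.7,
        "max_tokens": 512,   # rule default; raise it if output is truncated
        "top_p": 0.9,
        "top_k": 40,         # sample only from the top K candidate tokens
        "response_format": response_format,
    }
```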
@@ -197,8 +197,7 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
         else:
             # nothing different between chat model and completion model in tongyi
             params["messages"] = self._convert_prompt_messages_to_tongyi_messages(prompt_messages)
-        response = Generation.call(**params, result_format="message", stream=stream)
+        response = Generation.call(**params, result_format="message", stream=stream, incremental_output=True)
         if stream:
             return self._handle_generate_stream_response(model, credentials, response, prompt_messages)
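
With `incremental_output=True`, DashScope streams each chunk as a delta rather than repeating the full text generated so far, so the stream handler below can accumulate by simple concatenation. A toy illustration of the difference (the chunk payloads are made up):

```python
# Incremental chunks carry only the new text; concatenation reconstructs it.
incremental = ["Hel", "lo", " world"]
full_text = ""
for delta in incremental:
    full_text += delta
assert full_text == "Hello world"

# Without incremental output, every chunk repeats the full text so far,
# so the accumulator would have to replace rather than append.
cumulative = ["Hel", "Hello", "Hello world"]
assert cumulative[-1] == "Hello world"
```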
@@ -258,6 +257,8 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
         """
         full_text = ""
         tool_calls = []
+        is_reasoning_started = False
+        index = 0  # keep `index` bound even if `responses` yields nothing
         for index, response in enumerate(responses):
             if response.status_code not in {200, HTTPStatus.OK}:
                 raise ServiceUnavailableError(
@@ -311,7 +312,11 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
                         ),
                     )
                 else:
-                    resp_content = response.output.choices[0].message.content
+                    message = response.output.choices[0].message
+                    resp_content, is_reasoning_started = self._wrap_thinking_by_reasoning_content(
+                        message, is_reasoning_started
+                    )
                     if not resp_content:
                         if "tool_calls" in response.output.choices[0].message:
                             tool_calls = response.output.choices[0].message["tool_calls"]
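
`_wrap_thinking_by_reasoning_content` is defined elsewhere in the class; the sketch below shows one plausible shape for it, assuming DeepSeek-R1 stream deltas expose a `reasoning_content` field that gets fenced in `<think>` tags so the chain-of-thought can be rendered apart from the final answer (the field name and tag convention are assumptions, not confirmed by this diff):

```python
# Plausible sketch, not the actual Dify helper.
def wrap_thinking_by_reasoning_content(message: dict, is_reasoning_started: bool) -> tuple[str, bool]:
    content = message.get("content") or ""
    reasoning = message.get("reasoning_content") or ""
    if reasoning:
        # First reasoning delta opens the <think> block; later ones append.
        prefix = "" if is_reasoning_started else "<think>\n"
        return prefix + reasoning, True
    if is_reasoning_started and content:
        # First answer delta after reasoning closes the block.
        return "\n</think>\n" + content, False
    return content, is_reasoning_started
```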