@@ -322,8 +322,11 @@ class AzureOpenAILargeLanguageModel(_CommonAzureOpenAI, LargeLanguageModel):
                                               response: Stream[ChatCompletionChunk],
                                               prompt_messages: list[PromptMessage],
                                               tools: Optional[list[PromptMessageTool]] = None) -> Generator:
         index = 0
         full_assistant_content = ''
+        real_model = model
+        system_fingerprint = None
+        completion = ''
         for chunk in response:
             if len(chunk.choices) == 0:
                 continue
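The three lines added above seed accumulators that outlive the loop: `real_model` and `system_fingerprint` record metadata from the incoming chunks, and `completion` collects the streamed text so token usage can be computed once, after the stream ends (see the second hunk below).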
@@ -349,40 +352,44 @@ class AzureOpenAILargeLanguageModel(_CommonAzureOpenAI, LargeLanguageModel):
 
             full_assistant_content += delta.delta.content if delta.delta.content else ''
 
-            if delta.finish_reason is not None:
-                # calculate num tokens
-                prompt_tokens = self._num_tokens_from_messages(credentials, prompt_messages, tools)
+            real_model = chunk.model
+            system_fingerprint = chunk.system_fingerprint
+            completion += delta.delta.content if delta.delta.content else ''
 
-                full_assistant_prompt_message = AssistantPromptMessage(
-                    content=full_assistant_content,
-                    tool_calls=tool_calls
+            yield LLMResultChunk(
+                model=real_model,
+                prompt_messages=prompt_messages,
+                system_fingerprint=system_fingerprint,
+                delta=LLMResultChunkDelta(
+                    index=index,
+                    message=assistant_prompt_message,
                 )
-                completion_tokens = self._num_tokens_from_messages(credentials, [full_assistant_prompt_message])
+            )
 
-                # transform usage
-                usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
+            index += 1
 
-                yield LLMResultChunk(
-                    model=chunk.model,
-                    prompt_messages=prompt_messages,
-                    system_fingerprint=chunk.system_fingerprint,
-                    delta=LLMResultChunkDelta(
-                        index=delta.index,
-                        message=assistant_prompt_message,
-                        finish_reason=delta.finish_reason,
-                        usage=usage
-                    )
-                )
-            else:
-                yield LLMResultChunk(
-                    model=chunk.model,
-                    prompt_messages=prompt_messages,
-                    system_fingerprint=chunk.system_fingerprint,
-                    delta=LLMResultChunkDelta(
-                        index=delta.index,
-                        message=assistant_prompt_message,
-                    )
-                )
+        # calculate num tokens
+        prompt_tokens = self._num_tokens_from_messages(credentials, prompt_messages, tools)
+
+        full_assistant_prompt_message = AssistantPromptMessage(
+            content=completion
+        )
+        completion_tokens = self._num_tokens_from_messages(credentials, [full_assistant_prompt_message])
+
+        # transform usage
+        usage = self._calc_response_usage(model, credentials, prompt_tokens, completion_tokens)
+
+        yield LLMResultChunk(
+            model=real_model,
+            prompt_messages=prompt_messages,
+            system_fingerprint=system_fingerprint,
+            delta=LLMResultChunkDelta(
+                index=index,
+                message=AssistantPromptMessage(content=''),
+                finish_reason='stop',
+                usage=usage
+            )
+        )
 
     @staticmethod
     def _extract_response_tool_calls(response_tool_calls: list[ChatCompletionMessageToolCall | ChoiceDeltaToolCall]) \
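Taken together, the change moves token accounting out of the per-chunk `finish_reason` branch: every content delta is now relayed immediately with a running `index`, and once the provider stream is exhausted the handler computes usage from the accumulated `completion` and emits one final empty chunk with `finish_reason='stop'`, unconditionally. Below is a minimal self-contained sketch of this pattern; the `Chunk` dataclass and `count_tokens` helper are illustrative stand-ins for `LLMResultChunk` and `_num_tokens_from_messages`, not part of the diff:

```python
from dataclasses import dataclass
from typing import Iterable, Iterator, Optional


@dataclass
class Chunk:
    index: int
    content: str
    finish_reason: Optional[str] = None
    usage: Optional[dict] = None


def count_tokens(text: str) -> int:
    # stand-in for the tokenizer-backed _num_tokens_from_messages()
    return len(text.split())


def stream_with_final_usage(deltas: Iterable[str], prompt: str) -> Iterator[Chunk]:
    index = 0
    completion = ''
    for delta in deltas:
        completion += delta                      # accumulate for later accounting
        yield Chunk(index=index, content=delta)  # relay content immediately
        index += 1                               # advance per emitted chunk
    # usage is computed exactly once, after the stream ends, so it no longer
    # depends on the provider attaching finish_reason to a content chunk
    usage = {'prompt_tokens': count_tokens(prompt),
             'completion_tokens': count_tokens(completion)}
    yield Chunk(index=index, content='', finish_reason='stop', usage=usage)


for chunk in stream_with_final_usage(['Hello', ' world'], 'Say hello'):
    print(chunk)
```

The design point mirrored from the diff: consumers always receive exactly one terminal chunk carrying usage, even when the provider stream ends without a well-formed final delta, which is why the old `if delta.finish_reason is not None: ... else: ...` split inside the loop could be deleted.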