
Commit 145ad9b

nmoeller and moonbox3 authored

Python: Upgrade Onnx Connector to use 0.9.0 (#13162)
### Motivation and Context

Fixes: #13001

### Description

Version 0.9.0 of the onnxruntime-genai package introduced very helpful methods, such as applying chat templates automatically from the tokenizer, as well as support for multiple audio and image files. This PR adds the following functionality:

- [x] Add inference with multiple images
- [x] Add inference with multiple audios
- [x] Chat templates for non-multimodal models are read via ONNX

### Samples

Text sample with ONNX:

<img width="1728" height="90" alt="image" src="https://github.com/user-attachments/assets/b93d9fc1-4e38-4fa9-b535-cc316e0900ed" />

Image sample with ONNX:

<img width="1721" height="91" alt="image" src="https://github.com/user-attachments/assets/47761beb-b728-4936-a6b6-fd253e041689" />

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄

---------

Co-authored-by: Evan Mattson <[email protected]>
1 parent b1ecee2 · commit 145ad9b
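For orientation, a minimal usage sketch of the upgraded connector with a multi-image request. This is not part of the commit: the model folder and image paths are placeholders, `max_length` is assumed to be a valid ONNX GenAI search option, and the call goes through the connector's public `get_chat_message_content` base-class API.

```python
import asyncio

from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion, OnnxGenAIPromptExecutionSettings
from semantic_kernel.contents import ChatHistory, ChatMessageContent, ImageContent, TextContent


async def main() -> None:
    # "phi4mm" is the multimodal template added by this PR; the model folder
    # below is a placeholder for a local Phi-4-multimodal ONNX export.
    service = OnnxGenAIChatCompletion(template="phi4mm", ai_model_path="path/to/phi4-mm-onnx")
    settings = OnnxGenAIPromptExecutionSettings(max_length=3072)

    history = ChatHistory()
    history.add_message(
        ChatMessageContent(
            role="user",
            items=[
                TextContent(text="Compare these two photos."),
                # ONNX GenAI loads media from file paths, so each URI must
                # point to a local file (raw bytes are not supported).
                ImageContent(uri="photo_1.png"),
                ImageContent(uri="photo_2.png"),
            ],
        )
    )

    result = await service.get_chat_message_content(history, settings)
    print(result)


asyncio.run(main())
```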

File tree

7 files changed (+4300, −4034 lines)


python/samples/concepts/setup/chat_completion_services.py

Lines changed: 2 additions & 6 deletions

```diff
@@ -332,13 +332,9 @@ def get_onnx_chat_completion_service_and_request_settings() -> tuple[
     Please refer to the Semantic Kernel Python documentation for more information:
     https://learn.microsoft.com/en-us/python/api/semantic-kernel/semantic_kernel?view=semantic-kernel
     """
-    from semantic_kernel.connectors.ai.onnx import (
-        OnnxGenAIChatCompletion,
-        OnnxGenAIPromptExecutionSettings,
-        ONNXTemplate,
-    )
+    from semantic_kernel.connectors.ai.onnx import OnnxGenAIChatCompletion, OnnxGenAIPromptExecutionSettings

-    chat_service = OnnxGenAIChatCompletion(ONNXTemplate.PHI3, service_id=service_id)
+    chat_service = OnnxGenAIChatCompletion(template="phi4mm", service_id=service_id)
     request_settings = OnnxGenAIPromptExecutionSettings(service_id=service_id)

     return chat_service, request_settings
```
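A side note on the sample change above: `ONNXTemplate` is a `str` enum, so the string literal the sample now passes and the enum member are equivalent values. A tiny runnable sketch:

```python
from semantic_kernel.connectors.ai.onnx.utils import ONNXTemplate

# ONNXTemplate subclasses str, so the sample's template="phi4mm" argument
# and the enum member compare (and validate) as equal.
assert ONNXTemplate("phi4mm") is ONNXTemplate.PHI4MM
assert ONNXTemplate.PHI4MM == "phi4mm"
```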

python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai_chat_completion.py

Lines changed: 44 additions & 12 deletions

```diff
@@ -1,5 +1,6 @@
 # Copyright (c) Microsoft. All rights reserved.

+import json
 import logging
 import sys
 from collections.abc import AsyncGenerator
@@ -10,7 +11,6 @@
 else:
     from typing_extensions import override  # pragma: no cover

-
 from pydantic import ValidationError

 from semantic_kernel.connectors.ai.chat_completion_client_base import ChatCompletionClientBase
@@ -20,6 +20,7 @@
 from semantic_kernel.connectors.ai.onnx.utils import ONNXTemplate, apply_template
 from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
 from semantic_kernel.contents import (
+    AudioContent,
     ChatHistory,
     ChatMessageContent,
     ImageContent,
@@ -37,12 +38,12 @@
 class OnnxGenAIChatCompletion(ChatCompletionClientBase, OnnxGenAICompletionBase):
     """OnnxGenAI text completion service."""

-    template: ONNXTemplate
+    template: ONNXTemplate | None
     SUPPORTS_FUNCTION_CALLING: ClassVar[bool] = False

     def __init__(
         self,
-        template: ONNXTemplate,
+        template: ONNXTemplate | None = None,
         ai_model_path: str | None = None,
         ai_model_id: str | None = None,
         env_file_path: str | None = None,
@@ -80,6 +81,12 @@ def __init__(

         super().__init__(ai_model_id=ai_model_id, ai_model_path=settings.chat_model_folder, template=template, **kwargs)

+        if self.enable_multi_modality and template is None:
+            raise ServiceInitializationError(
+                "When using a multi-modal model, a template must be specified."
+                " Please provide a ONNXTemplate in the constructor."
+            )
+
     @override
     async def _inner_get_chat_message_contents(
         self,
@@ -101,7 +108,8 @@ async def _inner_get_chat_message_contents(
         assert isinstance(settings, OnnxGenAIPromptExecutionSettings)  # nosec
         prompt = self._prepare_chat_history_for_request(chat_history)
         images = self._get_images_from_history(chat_history)
-        choices = await self._generate_next_token(prompt, settings, images)
+        audios = self._get_audios_from_history(chat_history)
+        choices = await self._generate_next_token(prompt, settings, images=images, audios=audios)
         return [self._create_chat_message_content(choice) for choice in choices]

     @override
@@ -127,7 +135,8 @@ async def _inner_get_streaming_chat_message_contents(
         assert isinstance(settings, OnnxGenAIPromptExecutionSettings)  # nosec
         prompt = self._prepare_chat_history_for_request(chat_history)
         images = self._get_images_from_history(chat_history)
-        async for chunk in self._generate_next_token_async(prompt, settings, images):
+        audios = self._get_audios_from_history(chat_history)
+        async for chunk in self._generate_next_token_async(prompt, settings, images=images, audios=audios):
             yield [
                 self._create_streaming_chat_message_content(choice_index, new_token, function_invoke_attempt)
                 for choice_index, new_token in enumerate(chunk)
@@ -159,9 +168,21 @@ def _create_streaming_chat_message_content(
     def _prepare_chat_history_for_request(
         self, chat_history: ChatHistory, role_key: str = "role", content_key: str = "content"
     ) -> Any:
-        return apply_template(chat_history, self.template)
+        if self.template:
+            return apply_template(chat_history, self.template)
+        return self.tokenizer.apply_chat_template(
+            json.dumps(self._chat_messages_to_dicts(chat_history)),
+            add_generation_prompt=True,
+        )
+
+    def _chat_messages_to_dicts(self, chat_history: "ChatHistory") -> list[dict[str, Any]]:
+        return [
+            message.to_dict(role_key="role", content_key="content")
+            for message in chat_history.messages
+            if isinstance(message, ChatMessageContent)
+        ]

-    def _get_images_from_history(self, chat_history: "ChatHistory") -> ImageContent | None:
+    def _get_images_from_history(self, chat_history: "ChatHistory") -> list[ImageContent] | None:
         images = []
         for message in chat_history.messages:
             for image in message.items:
@@ -174,11 +195,22 @@ def _get_images_from_history(self, chat_history: "ChatHistory") -> ImageContent
                         raise ServiceInvalidExecutionSettingsError(
                             "Image Content URI needs to be set, because onnx can only work with file paths"
                         )
-        # Currently Onnx Runtime only supports one image
-        # Later we will add support for multiple images
-        if len(images) > 1:
-            raise ServiceInvalidExecutionSettingsError("The model does not support more than one image")
-        return images[-1] if images else None
+        return images if len(images) else None
+
+    def _get_audios_from_history(self, chat_history: "ChatHistory") -> list[AudioContent] | None:
+        audios = []
+        for message in chat_history.messages:
+            for audio in message.items:
+                if isinstance(audio, AudioContent):
+                    if not self.enable_multi_modality:
+                        raise ServiceInvalidExecutionSettingsError("The model does not support multi-modality")
+                    if audio.uri:
+                        audios.append(audio)
+                    else:
+                        raise ServiceInvalidExecutionSettingsError(
+                            "Audio Content URI needs to be set, because onnx can only work with file paths"
+                        )
+        return audios if len(audios) else None

     @override
     def get_prompt_execution_settings_class(self) -> type["PromptExecutionSettings"]:
```
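The new fallback in `_prepare_chat_history_for_request` is the headline 0.9.0 feature: when no `ONNXTemplate` is given (text-only models), the history is serialized to role/content dicts and the tokenizer applies the chat template shipped with the model. A standalone sketch of that path against the raw onnxruntime-genai API, mirroring the connector's call above; the model path is a placeholder:

```python
import json

import onnxruntime_genai as og

model = og.Model("path/to/text-model-onnx")  # placeholder path
tokenizer = og.Tokenizer(model)

# The same shape _chat_messages_to_dicts produces from a ChatHistory.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is ONNX Runtime GenAI?"},
]

# New in 0.9.0: the tokenizer reads the model's own chat template, so the
# connector no longer needs a hand-written template for text-only models.
prompt = tokenizer.apply_chat_template(json.dumps(messages), add_generation_prompt=True)
print(prompt)
```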

python/semantic_kernel/connectors/ai/onnx/services/onnx_gen_ai_completion_base.py

Lines changed: 17 additions & 14 deletions

```diff
@@ -6,7 +6,7 @@
 from typing import Any

 from semantic_kernel.connectors.ai.onnx.onnx_gen_ai_prompt_execution_settings import OnnxGenAIPromptExecutionSettings
-from semantic_kernel.contents import ImageContent
+from semantic_kernel.contents import AudioContent, ImageContent
 from semantic_kernel.exceptions import ServiceInitializationError, ServiceInvalidResponseError
 from semantic_kernel.kernel_pydantic import KernelBaseModel

@@ -50,7 +50,7 @@ def __init__(self, ai_model_path: str, **kwargs) -> None:
             tokenizer = OnnxRuntimeGenAi.Tokenizer(model)
             tokenizer_stream = tokenizer.create_stream()
         except Exception as ex:
-            raise ServiceInitializationError("Failed to initialize OnnxTextCompletion service", ex) from ex
+            raise ServiceInitializationError("Failed to initialize OnnxCompletion service", ex) from ex

         super().__init__(
             model=model,
@@ -64,25 +64,27 @@ async def _generate_next_token_async(
         self,
         prompt: str,
         settings: OnnxGenAIPromptExecutionSettings,
-        image: ImageContent | None = None,
+        images: list[ImageContent] | None = None,
+        audios: list[AudioContent] | None = None,
     ) -> AsyncGenerator[list[str], Any]:
         try:
             params = OnnxRuntimeGenAi.GeneratorParams(self.model)
             params.set_search_options(**settings.prepare_settings_dict())
+            generator = OnnxRuntimeGenAi.Generator(self.model, params)
             if not self.enable_multi_modality:
                 input_tokens = self.tokenizer.encode(prompt)
-                params.input_ids = input_tokens
+                generator.append_tokens(input_tokens)
             else:
-                if image is not None:
-                    # With the use of Pybind there is currently no way to load images from bytes
-                    # We can only open images from a file path currently
-                    image = OnnxRuntimeGenAi.Images.open(str(image.uri))
-                input_tokens = self.tokenizer(prompt, images=image)
-                params.set_inputs(input_tokens)
-            generator = OnnxRuntimeGenAi.Generator(self.model, params)
+                # With the use of Pybind in ONNX there is currently no way to load images from bytes
+                # We can only open images & audios from a file path currently
+                if images is not None:
+                    images = OnnxRuntimeGenAi.Images.open(*[str(image.uri) for image in images])
+                if audios is not None:
+                    audios = OnnxRuntimeGenAi.Audios.open(*[str(audio.uri) for audio in audios])
+                input_tokens = self.tokenizer(prompt, images=images, audios=audios)
+                generator.set_inputs(input_tokens)

             while not generator.is_done():
-                generator.compute_logits()
                 generator.generate_next_token()
                 new_token_choices = [self.tokenizer_stream.decode(token) for token in generator.get_next_tokens()]
                 yield new_token_choices
@@ -94,10 +96,11 @@ async def _generate_next_token(
         self,
         prompt: str,
         settings: OnnxGenAIPromptExecutionSettings,
-        image: ImageContent | None = None,
+        images: list[ImageContent] | None = None,
+        audios: list[AudioContent] | None = None,
     ):
         token_choices: list[str] = []
-        async for new_token_choice in self._generate_next_token_async(prompt, settings, image):
+        async for new_token_choice in self._generate_next_token_async(prompt, settings, images, audios=audios):
             # zip only works if the lists are the same length
             if len(token_choices) == 0:
                 token_choices = new_token_choice
```
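The base-class changes track the 0.9.0 generation API: prompt tokens are now appended to the `Generator` instead of being set on `GeneratorParams`, and the separate `compute_logits()` step is gone. A minimal end-to-end sketch of that loop with raw onnxruntime-genai, using only calls that appear in this diff; the model path is a placeholder and the phi-style prompt is for illustration:

```python
import onnxruntime_genai as og

model = og.Model("path/to/text-model-onnx")  # placeholder path
tokenizer = og.Tokenizer(model)
stream = tokenizer.create_stream()

params = og.GeneratorParams(model)
params.set_search_options(max_length=256)

# 0.9.0 style: create the generator first, then feed it the prompt tokens.
generator = og.Generator(model, params)
generator.append_tokens(tokenizer.encode("<|user|>\nHello!<|end|>\n<|assistant|>\n"))

while not generator.is_done():
    # generate_next_token() now runs the forward pass itself; the old
    # explicit generator.compute_logits() call has been removed.
    generator.generate_next_token()
    for token in generator.get_next_tokens():
        print(stream.decode(token), end="", flush=True)
```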

python/semantic_kernel/connectors/ai/onnx/utils.py

Lines changed: 57 additions & 2 deletions

```diff
@@ -2,6 +2,7 @@
 from enum import Enum

 from semantic_kernel.contents import AuthorRole, ChatHistory, ImageContent, TextContent
+from semantic_kernel.contents.audio_content import AudioContent
 from semantic_kernel.exceptions import ServiceException, ServiceInvalidRequestError


@@ -19,6 +20,8 @@ class ONNXTemplate(str, Enum):

     PHI3 = "phi3"
     PHI3V = "phi3v"
+    PHI4 = "phi4"
+    PHI4MM = "phi4mm"
     GEMMA = "gemma"
     LLAMA = "llama"
     NONE = "none"
@@ -39,9 +42,11 @@ def apply_template(history: ChatHistory, template: ONNXTemplate) -> str:
     """
     template_functions = {
         ONNXTemplate.PHI3: phi3_template,
+        ONNXTemplate.PHI4: phi4_template,
         ONNXTemplate.GEMMA: gemma_template,
         ONNXTemplate.LLAMA: llama_template,
         ONNXTemplate.PHI3V: phi3v_template,
+        ONNXTemplate.PHI4MM: phi4mm_template,
         ONNXTemplate.NONE: lambda text: text,
     }

@@ -67,6 +72,22 @@ def phi3_template(history: ChatHistory) -> str:
     return phi3_input


+def phi4_template(history: ChatHistory) -> str:
+    """Generates a formatted string from the chat history for use with the phi4 model.
+
+    Args:
+        history (ChatHistory): An object containing the chat history with a list of messages.
+
+    Returns:
+        str: A formatted string where each message is prefixed with the role and suffixed with an end marker.
+    """
+    phi4_input = ""
+    for message in history.messages:
+        phi4_input += f"<|{message.role.value}|>\n{message.content}<|end|>\n"
+    phi4_input += "<|assistant|>\n"
+    return phi4_input
+
+
 def phi3v_template(history: ChatHistory) -> str:
     """Generates a formatted string from a given chat history for use with the phi3v model.

@@ -78,22 +99,56 @@ def phi3v_template(history: ChatHistory) -> str:
         the role of each message (system, user, assistant) and the type of content (text, image).
     """
     phi3v_input = ""
+    image_count = 0
     for message in history.messages:
         if message.role == AuthorRole.SYSTEM:
             phi3v_input += f"<|system|>\n{message.content}<|end|>\n"
         if message.role == AuthorRole.USER:
             for item in message.items:
                 if isinstance(item, TextContent):
                     phi3v_input += f"<|user|>\n{item.text}<|end|>\n"
-                # Currently only one image is supported in Onnx
                 if isinstance(item, ImageContent):
-                    phi3v_input += "<|image_1|>\n"
+                    phi3v_input += f"<|image_{image_count + 1}|>\n"
+                    image_count += 1
         if message.role == AuthorRole.ASSISTANT:
             phi3v_input += f"<|assistant|>\n{message.content}<|end|>\n"
     phi3v_input += "<|assistant|>\n"
     return phi3v_input


+def phi4mm_template(history: ChatHistory) -> str:
+    """Generates a formatted string from a given chat history for use with the phi4mm model.
+
+    Args:
+        history (ChatHistory): An object containing the chat history with messages.
+
+    Returns:
+        str: A formatted string representing the chat history, with special tokens indicating
+            the role of each message (system, user, assistant) and the type of content (text, image).
+    """
+    phi4mm_input = ""
+    image_count = 0
+    audio_count = 0
+    for message in history.messages:
+        if message.role == AuthorRole.SYSTEM:
+            phi4mm_input += f"<|system|>\n{message.content}<|end|>\n"
+        if message.role == AuthorRole.USER:
+            for item in message.items:
+                if isinstance(item, TextContent):
+                    phi4mm_input += f"<|user|>\n{item.text}<|end|>\n"
+                if isinstance(item, ImageContent):
+                    phi4mm_input += f"<|image_{image_count + 1}|>\n"
+                    image_count += 1
+                if isinstance(item, AudioContent):
+                    phi4mm_input += f"<|audio_{audio_count + 1}|>\n"
+                    audio_count += 1
+        if message.role == AuthorRole.ASSISTANT:
+            phi4mm_input += f"<|assistant|>\n{message.content}<|end|>\n"
+    phi4mm_input += "<|assistant|>\n"
+    return phi4mm_input
+
+
 def gemma_template(history: ChatHistory) -> str:
     """Generates a formatted string for the Gemma model based on the provided chat history.
```

python/tests/unit/connectors/ai/onnx/services/test_onnx_chat_completion.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -52,8 +52,9 @@ def test_onnx_chat_completion_with_invalid_model():
     )


-def test_onnx_chat_completion_without_prompt_template():
-    with pytest.raises(TypeError):
+@patch("builtins.open", new_callable=mock_open, read_data=json.dumps(gen_ai_config_vision))
+def test_onnx_chat_completion_with_multimodality_without_prompt_template(gen_ai_config_vision):
+    with pytest.raises(ServiceInitializationError):
         OnnxGenAIChatCompletion()


@@ -147,7 +148,7 @@ def patch_open(*args, **kwargs):
     )

     last_image = chat_completion._get_images_from_history(history)
-    assert last_image == image_content
+    assert last_image == [image_content]


 @patch("onnxruntime_genai.Model")
```
