Commit b8de595

feat: integrate openai frontend into triton cli (#106)
1 parent 3a3d1c5 commit b8de595

File tree: 12 files changed, +516 −85 lines


README.md

Lines changed: 64 additions & 1 deletion
@@ -38,7 +38,7 @@ Server.

 ## Table of Contents

-| [Pre-requisites](#pre-requisites) | [Installation](#installation) | [Quickstart](#quickstart) | [Serving LLM Models](#serving-llm-models) | [Serving a vLLM Model](#serving-a-vllm-model) | [Serving a TRT-LLM Model](#serving-a-trt-llm-model) | [Additional Dependencies for Custom Environments](#additional-dependencies-for-custom-environments) | [Known Limitations](#known-limitations) |
+| [Pre-requisites](#pre-requisites) | [Installation](#installation) | [Quickstart](#quickstart) | [Serving LLM Models](#serving-llm-models) | [Serving a vLLM Model](#serving-a-vllm-model) | [Serving a TRT-LLM Model](#serving-a-trt-llm-model) | [Serving a LLM model with OpenAI API](#serving-a-llm-model-with-openai-api) | [Additional Dependencies for Custom Environments](#additional-dependencies-for-custom-environments) | [Known Limitations](#known-limitations) |

 ## Pre-requisites

@@ -295,6 +295,69 @@ triton infer -m llama-3.1-8b-instruct --prompt "machine learning is"
 # Profile model with GenAI-Perf
 triton profile -m llama-3.1-8b-instruct --backend tensorrtllm
 ```
+## Serving a LLM model with OpenAI API
+
+The Triton CLI can also start the Triton server with an [OpenAI RESTful API Frontend](https://github.com/triton-inference-server/server/tree/main/python/openai).
+
+Triton Server's OpenAI Frontend supports the following API endpoints:
+
+- [POST /v1/chat/completions](https://platform.openai.com/docs/api-reference/chat/create)
+- [POST /v1/completions](https://platform.openai.com/docs/api-reference/completions/create)
+- [GET /v1/models](https://platform.openai.com/docs/api-reference/models/list)
+- [GET /v1/models/{model_name}](https://platform.openai.com/docs/api-reference/models/retrieve)
+- GET /metrics
+
+To start the Triton server with an OpenAI RESTful API Frontend, add `--frontend openai` to the `triton start` command:
+```bash
+triton start --frontend openai
+```
+By default, the server and its OpenAI API can be accessed at `http://localhost:9000`.
+
+> [!NOTE]
+> There may be more than one LLM model in the model repository, and each model may have its own tokenizer_config.json.
+> OpenAI's `/v1/chat/completions` API requires a chat template from a tokenizer. By default, Triton CLI will
+> automatically search the model repository for a tokenizer to use for the chat template. If you'd like to use
+> a specific tokenizer's chat template, specify the tokenizer with `--openai-chat-template-tokenizer {huggingface id or path to the tokenizer directory}`.
+>
+> e.g. `triton start --frontend openai --openai-chat-template-tokenizer meta-llama/Meta-Llama-3.1-8B-Instruct`
+
+#### Example
+
+```bash
+docker run -ti \
+  --gpus all \
+  --network=host \
+  --shm-size=1g --ulimit memlock=-1 \
+  -v /tmp:/tmp \
+  -v ${HOME}/models:/root/models \
+  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
+  nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3

+# Install the Triton CLI
+pip install git+https://github.com/triton-inference-server/triton_cli.git@main

+# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
+huggingface-cli login

+# Build TRT LLM engine and generate a Triton model repository pointing at it
+triton remove -m all
+triton import -m llama-3.1-8b-instruct --backend tensorrtllm
+# For vllm backend:
+# triton import -m llama-3.1-8b-instruct --backend vllm

+# Start Triton with an OpenAI RESTful API Frontend
+triton start --frontend openai

+# Interact with model at http://localhost:9000
+curl -s http://localhost:9000/v1/chat/completions -H 'Content-Type: application/json' -d '{
+  "model": "llama-3.1-8b-instruct",
+  "messages": [{"role": "user", "content": "What is machine learning?"}]
+}'

+# Profile model with GenAI-Perf
+triton profile -m llama-3.1-8b-instruct --service-kind openai --endpoint-type chat --url localhost:9000 --streaming
+```
+
 ## Additional Dependencies for Custom Environments

 When using Triton CLI outside of official Triton NGC containers, you may
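Because the new frontend speaks the OpenAI protocol, the same endpoints can also be exercised from Python with the `openai` client library instead of `curl`. The snippet below is a minimal sketch, assuming `pip install openai`, a server started with `triton start --frontend openai` on the default port 9000, and the `llama-3.1-8b-instruct` model imported as in the README example above:

```python
# Minimal sketch: talk to the OpenAI-compatible frontend with the `openai` client.
# Assumes the server from the README example above is running on localhost:9000.
from openai import OpenAI

# The client requires an api_key argument; a local frontend like this one
# typically does not check it.
client = OpenAI(base_url="http://localhost:9000/v1", api_key="unused")

# GET /v1/models
for model in client.models.list():
    print(model.id)

# POST /v1/chat/completions
response = client.chat.completions.create(
    model="llama-3.1-8b-instruct",
    messages=[{"role": "user", "content": "What is machine learning?"}],
)
print(response.choices[0].message.content)
```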

src/triton_cli/common.py

Lines changed: 7 additions & 0 deletions
@@ -38,12 +38,19 @@ class TritonCLIException(Exception):

 # Server
 DEFAULT_TRITONSERVER_PATH: str = "tritonserver"
+DEFAULT_TRITONSERVER_OPENAI_FRONTEND_PATH: str = (
+    "/opt/tritonserver/python/openai/openai_frontend/main.py"
+)
+
 ## Server Docker
 DEFAULT_SHM_SIZE: str = "1G"
 # A custom image containing both vLLM and TRT-LLM dependencies,
 # defined in triton_cli/docker/Dockerfile.
 DEFAULT_TRITONSERVER_IMAGE: str = "triton_llm"

+# Serving Frontend
+SUPPORTED_FRONTEND: set = {"kserve", "openai"}
+
 # Model Repository
 DEFAULT_MODEL_REPO: Path = Path.home() / "models"
 DEFAULT_HF_CACHE: Path = Path.home() / ".cache" / "huggingface"
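For orientation, a small sketch of how these new constants can be inspected (assuming `triton_cli` is installed; the frontend script path only exists inside the Triton server container images):

```python
# Sketch: inspect the constants added above. Assumes triton_cli is installed.
from pathlib import Path

from triton_cli.common import (
    DEFAULT_TRITONSERVER_OPENAI_FRONTEND_PATH,
    SUPPORTED_FRONTEND,
)

print(SUPPORTED_FRONTEND)  # {'kserve', 'openai'}

# The OpenAI frontend script ships inside the Triton server container image;
# outside a container this check will usually print False.
print(Path(DEFAULT_TRITONSERVER_OPENAI_FRONTEND_PATH).is_file())
```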

src/triton_cli/parser.py

Lines changed: 18 additions & 1 deletion
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -40,6 +40,7 @@
     DEFAULT_MODEL_REPO,
     DEFAULT_TRITONSERVER_IMAGE,
     LOGGER_NAME,
+    SUPPORTED_FRONTEND,
     TritonCLIException,
 )
 from triton_cli.client.client import InferenceServerException, TritonClient
@@ -156,6 +157,22 @@ def add_server_start_args(subcommands):
         default=300,
         help="Maximum number of seconds to wait for server startup. (Default: 300)",
     )
+    subcommand.add_argument(
+        "--frontend",
+        choices=SUPPORTED_FRONTEND,
+        type=str,
+        required=False,
+        default="kserve",
+        help=f"The inference API frontend to use when starting the triton server. Default is the KServe api frontend. Choices: '{SUPPORTED_FRONTEND}'.",
+    )
+    subcommand.add_argument(
+        "--openai-chat-template-tokenizer",
+        type=str,
+        required=False,
+        # TODO: Should probably set a default tokenizer, like 'hf-internal-testing/llama-tokenizer', since not all tokenizers have a chat template
+        default=None,
+        help="HuggingFace ID or local folder path of the tokenizer to use for chat templates with the OpenAI API frontend. If no tokenizer is specified, it searches for and selects an LLM model's tokenizer from the model repository.",
+    )


 def add_model_args(subcommands):
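As an illustration of how the new flags behave, here is a self-contained argparse sketch that mirrors the choices and defaults added above (standalone, not the actual CLI wiring):

```python
# Standalone sketch mirroring the new `triton start` options added above.
import argparse

SUPPORTED_FRONTEND = {"kserve", "openai"}

parser = argparse.ArgumentParser(prog="triton start")
parser.add_argument("--frontend", choices=SUPPORTED_FRONTEND, type=str, default="kserve")
parser.add_argument("--openai-chat-template-tokenizer", type=str, default=None)

args = parser.parse_args(
    ["--frontend", "openai",
     "--openai-chat-template-tokenizer", "meta-llama/Meta-Llama-3.1-8B-Instruct"]
)
print(args.frontend)                        # openai
print(args.openai_chat_template_tokenizer)  # meta-llama/Meta-Llama-3.1-8B-Instruct

# Anything outside SUPPORTED_FRONTEND is rejected by argparse with
# "invalid choice", so unsupported frontends fail at parse time.
```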

src/triton_cli/server/server_config.py

Lines changed: 65 additions & 4 deletions
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,6 +14,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from triton_cli.common import (
+    DEFAULT_TRITONSERVER_PATH,
+    DEFAULT_TRITONSERVER_OPENAI_FRONTEND_PATH,
+)
+

 class TritonServerConfig:
     """
@@ -73,12 +78,19 @@ class TritonServerConfig:
         "tensorflow-version",
     ]

-    def __init__(self):
+    def __init__(self, server_path=None):
         """
         Construct TritonServerConfig
+
+        Parameters
+        ----------
+        server_path: string
+            path to the triton server binary. Default is "tritonserver" if unset.
         """

         self._server_args = {k: None for k in self.server_arg_keys}
+        self._server_path = server_path if server_path else DEFAULT_TRITONSERVER_PATH
+        self._server_name = "Triton Inference Server"

     @classmethod
     def allowed_keys(cls):
@@ -172,6 +184,16 @@ def server_args(self):

         return self._server_args

+    def server_path(self) -> str:
+        """
+        Returns
+        -------
+        str
+            A path to the triton server binary or script
+        """
+
+        return self._server_path
+
     # TODO: Investigate what parameters are supported with TRT LLM's launching style.
     # For example, explicit launch mode is not. See the TRTLLMUtils class for a list of
     # supported args.
@@ -231,6 +253,45 @@ def __setitem__(self, key, value):
             self._server_args[kebab_cased_key] = value
         else:
             raise Exception(
-                f"The argument '{key}' to the Triton Inference "
-                "Server is not currently supported."
+                f"The argument '{key}' to the {self._server_name}"
+                " is not currently supported."
             )
+
+
+class TritonOpenAIServerConfig(TritonServerConfig):
+    """
+    A config class to set arguments to the Triton Inference
+    Server with OpenAI RESTful API. An argument set to None will use the server default.
+    """
+
+    server_arg_keys = [
+        # triton server args
+        "tritonserver-log-verbose-level",
+        "host",
+        "backend",
+        "tokenizer",
+        "model-repository",
+        # uvicorn args
+        "openai-port",
+        "uvicorn-log-level",
+        # kserve frontend args
+        "enable-kserve-frontends",
+        "kserve-http-port",
+        "kserve-grpc-port",
+    ]
+
+    def __init__(self, server_path=None):
+        """
+        Construct TritonOpenAIServerConfig
+
+        Parameters
+        ----------
+        server_path: string
+            path to the Triton OpenAI Server python script. Default is "/opt/tritonserver/python/openai/openai_frontend/main.py" if unset.
+        """
+
+        self._server_args = {k: None for k in self.server_arg_keys}
+        self._server_path = (
+            server_path if server_path else DEFAULT_TRITONSERVER_OPENAI_FRONTEND_PATH
+        )
+        self._server_name = "Triton Inference Server with OpenAI RESTful API"
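A short usage sketch of the new config class (assuming `triton_cli` is installed and importable); it exercises only the key validation and the `server_path()` accessor introduced above:

```python
# Sketch: build an OpenAI-frontend server config and read back its launch path.
# Assumes triton_cli is installed so the import below resolves.
from triton_cli.server.server_config import TritonOpenAIServerConfig

config = TritonOpenAIServerConfig()
config["model-repository"] = "/root/models"
config["tokenizer"] = "meta-llama/Meta-Llama-3.1-8B-Instruct"
config["openai-port"] = "9000"

# With no server_path argument, the config points at the bundled frontend script.
print(config.server_path())
# /opt/tritonserver/python/openai/openai_frontend/main.py

# Keys outside server_arg_keys are rejected by __setitem__:
# config["not-a-real-flag"] = "1"  # raises "... is not currently supported."
```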

src/triton_cli/server/server_docker.py

Lines changed: 1 addition & 3 deletions
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@
 from .server_utils import TritonServerUtils
 from triton_cli.common import (
     HF_CACHE,
-    DEFAULT_TRITONSERVER_PATH,
     DEFAULT_TRITONSERVER_IMAGE,
     LOGGER_NAME,
 )
@@ -164,7 +163,6 @@ def start(self, env=None):
         }
         # Construct run command
         command = self._server_utils.get_launch_command(
-            tritonserver_path=DEFAULT_TRITONSERVER_PATH,
             server_config=self._server_config,
             cmd_as_list=False,
             env_cmds=env_cmds,

src/triton_cli/server/server_factory.py

Lines changed: 56 additions & 18 deletions
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3

-# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2020-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,19 +15,17 @@
 # limitations under the License.

 import logging
-import os
 import shutil

 from .server_local import TritonServerLocal
 from .server_docker import TritonServerDocker
-from .server_config import TritonServerConfig
+from .server_config import TritonServerConfig, TritonOpenAIServerConfig
 from triton_cli.common import (
     DEFAULT_SHM_SIZE,
-    DEFAULT_TRITONSERVER_PATH,
     LOGGER_NAME,
     TritonCLIException,
 )
-
+from .server_utils import TRTLLMUtils, VLLMUtils

 logger = logging.getLogger(LOGGER_NAME)

@@ -82,7 +80,7 @@ def create_server_docker(
         )

     @staticmethod
-    def create_server_local(path, config, gpus=None):
+    def create_server_local(config, gpus=None):
         """
         Parameters
         ----------
@@ -99,7 +97,7 @@ def create_server_local(path, config, gpus=None):
         TritonServerLocal
         """

-        return TritonServerLocal(path=path, config=config, gpus=gpus)
+        return TritonServerLocal(config=config, gpus=gpus)

     @staticmethod
     def get_server_handle(config, gpus=None):
@@ -130,15 +128,10 @@ def get_server_handle(config, gpus=None):

     @staticmethod
     def _get_local_server_handle(config, gpus):
-        tritonserver_path = DEFAULT_TRITONSERVER_PATH
-        TritonServerFactory._validate_triton_server_path(tritonserver_path)
+        triton_config = TritonServerFactory._get_triton_server_config(config)
+        TritonServerFactory._validate_triton_server_path(triton_config.server_path())

-        triton_config = TritonServerConfig()
-        triton_config["model-repository"] = config.model_repository
-        if config.verbose:
-            triton_config["log-verbose"] = "1"
         server = TritonServerFactory.create_server_local(
-            path=tritonserver_path,
             config=triton_config,
             gpus=gpus,
         )
@@ -147,10 +140,7 @@

     @staticmethod
     def _get_docker_server_handle(config, gpus):
-        triton_config = TritonServerConfig()
-        triton_config["model-repository"] = os.path.abspath(config.model_repository)
-        if config.verbose:
-            triton_config["log-verbose"] = "1"
+        triton_config = TritonServerFactory._get_triton_server_config(config)

         server = TritonServerFactory.create_server_docker(
             image=config.image,
@@ -174,3 +164,51 @@ def _validate_triton_server_path(tritonserver_path):
             raise TritonCLIException(
                 f"Either the binary {tritonserver_path} is invalid, not on the PATH, or does not have the correct permissions."
             )
+
+    @staticmethod
+    def _get_triton_server_config(config):
+        if config.frontend == "openai":
+            triton_config = TritonOpenAIServerConfig()
+            triton_config["model-repository"] = config.model_repository
+
+            triton_config["tokenizer"] = (
+                TritonServerFactory._get_openai_chat_template_tokenizer(config)
+            )
+
+            if config.verbose:
+                triton_config["tritonserver-log-verbose-level"] = "1"
+        else:
+            triton_config = TritonServerConfig()
+            triton_config["model-repository"] = config.model_repository
+            if config.verbose:
+                triton_config["log-verbose"] = "1"
+
+        return triton_config
+
+    @staticmethod
+    def _get_openai_chat_template_tokenizer(config):
+        """
+        Raises an exception if a tokenizer is not specified and cannot be found for the OpenAI frontend
+        """
+        if config.openai_chat_template_tokenizer:
+            return config.openai_chat_template_tokenizer
+
+        logger.info(
+            "OpenAI frontend's tokenizer for chat template is not specified, searching for an available tokenizer in the model repository."
+        )
+        trtllm_utils = TRTLLMUtils(config.model_repository)
+        vllm_utils = VLLMUtils(config.model_repository)

+        if trtllm_utils.has_trtllm_model():
+            tokenizer_path = trtllm_utils.get_engine_path()
+        elif vllm_utils.has_vllm_model():
+            tokenizer_path = vllm_utils.get_vllm_model_huggingface_id_or_path()
+        else:
+            raise TritonCLIException(
+                "Unable to find a tokenizer to start the Triton OpenAI RESTful API, please use '--openai-chat-template-tokenizer' to specify a tokenizer."
+            )

+        logger.info(
+            f"Found tokenizer in '{tokenizer_path}' after searching for the tokenizer in the model repository"
+        )
+        return tokenizer_path
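To make the frontend branching concrete, here is a hedged sketch of driving `_get_triton_server_config` with a stand-in for the parsed CLI arguments (`SimpleNamespace` mimics the attributes the factory reads; passing an explicit tokenizer avoids the model-repository search):

```python
# Sketch: exercise the new frontend branching with a stand-in args object.
# Assumes triton_cli is installed so the import below resolves.
from types import SimpleNamespace

from triton_cli.server.server_factory import TritonServerFactory

args = SimpleNamespace(
    frontend="openai",
    model_repository="/root/models",
    openai_chat_template_tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct",
    verbose=True,
)

config = TritonServerFactory._get_triton_server_config(args)
print(type(config).__name__)  # TritonOpenAIServerConfig
print(config.server_path())   # /opt/tritonserver/python/openai/openai_frontend/main.py

# With frontend="kserve" (the default), a plain TritonServerConfig is returned
# and the chat-template tokenizer lookup is skipped entirely.
```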

0 commit comments
