Commit 2e055fa

feat: Switch to TRT-LLM LLM (High Level) API over trtllm-build CLI workflow (#87)
Co-authored-by: KrishnanPrash <[email protected]>
1 parent: 6603dd7

14 files changed (+231, -1533 lines)

README.md

Lines changed: 49 additions & 34 deletions

@@ -22,8 +22,8 @@ and running the CLI from within the latest corresponding `tritonserver`
 container image, which should have all necessary system dependencies installed.

 For vLLM and TRT-LLM, you can use their respective images:
-- `nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3`
-- `nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3`
+- `nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3`
+- `nvcr.io/nvidia/tritonserver:24.09-trtllm-python-py3`

 If you decide to run the CLI on the host or in a custom image, please
 see this list of [additional dependencies](#additional-dependencies-for-custom-environments)

@@ -38,13 +38,14 @@ matrix below:

 | Triton CLI Version | TRT-LLM Version | Triton Container Tag |
 |:------------------:|:---------------:|:--------------------:|
+| 0.1.0              | v0.13.0         | 24.09                |
 | 0.0.11             | v0.12.0         | 24.08                |
 | 0.0.10             | v0.11.0         | 24.07                |
-| 0.0.9              | v0.10.0         | 24.06                |
-| 0.0.8              | v0.9.0          | 24.05                |
-| 0.0.7              | v0.9.0          | 24.04                |
-| 0.0.6              | v0.8.0          | 24.02, 24.03         |
-| 0.0.5              | v0.7.1          | 24.01                |
+| 0.0.9              | v0.10.0         | 24.06                |
+| 0.0.8              | v0.9.0          | 24.05                |
+| 0.0.7              | v0.9.0          | 24.04                |
+| 0.0.6              | v0.8.0          | 24.02, 24.03         |
+| 0.0.5              | v0.7.1          | 24.01                |

 ### Install from GitHub

@@ -58,7 +59,7 @@ It is also possible to install from a specific branch name, a commit hash
 or a tag name. For example to install `triton_cli` with a specific tag:

 ```bash
-GIT_REF="0.0.11"
+GIT_REF="0.1.0"
 pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
 ```

@@ -93,7 +94,7 @@ triton -h
 triton import -m gpt2

 # Start server pointing at the default model repository
-triton start --image nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3
+triton start --image nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3

 # Infer with CLI
 triton infer -m gpt2 --prompt "machine learning is"

@@ -119,26 +120,50 @@ minutes.
 > in Huggingface through either `huggingface-cli login` or setting the `HF_TOKEN`
 > environment variable.

+### Model Sources

-### Serving a vLLM Model
+<!-- TODO: Add more docs on commands, such as a doc on `import` behavior/args -->

-vLLM models will be downloaded at runtime when starting the server if not found
-locally in the HuggingFace cache. No offline engine building step is required,
-but you can pre-download the model in advance to avoid downloading at server
-startup time.
+The `triton import` command helps automate the process of creating a model repository
+to serve with Triton Inference Server. When preparing models, a `--source` is required
+to point at the location containing a model/weights. This argument is overloaded to support
+a few types of locations:
+- HuggingFace (`--source hf:<HUGGINGFACE_ID>`)
+- Local Filesystem (`--source local:</path/to/model>`)
+
+#### Model Source Aliases

-The following models have currently been tested for vLLM through the CLI:
+<!-- TODO: Put known model sources into a JSON file or something separate from the code -->
+
+For convenience, the Triton CLI supports short aliases for a handful
+of models which will automatically set the correct `--source` for you.
+A full list of aliases can be found from `KNOWN_MODEL_SOURCES` within `parser.py`,
+but some examples can be found below:
 - `gpt2`
 - `opt125m`
 - `mistral-7b`
-- `falcon-7b`
-- `llama-2-7b`
 - `llama-2-7b-chat`
-- `llama-3-8b`
 - `llama-3-8b-instruct`
-- `llama-3.1-8b`
 - `llama-3.1-8b-instruct`

+For example, this command will go get Llama 3.1 8B Instruct from HuggingFace:
+```bash
+triton import -m llama-3.1-8b-instruct
+
+# Equivalent command without alias:
+# triton import --model llama-3.1-8b-instruct --source "hf:meta-llama/Llama-3.1-8B-Instruct"
+```
+
+For full control and flexibility, you can always manually specify the `--source`.
+
+### Serving a vLLM Model
+
+vLLM models will be downloaded at runtime when starting the server if not found
+locally in the HuggingFace cache. No offline engine building step is required,
+but you can pre-download the model in advance to avoid downloading at server
+startup time.
+
+The following models are supported by vLLM: https://docs.vllm.ai/en/latest/models/supported_models.html

 #### Example

@@ -149,10 +174,10 @@ docker run -ti \
 --shm-size=1g --ulimit memlock=-1 \
 -v ${HOME}/models:/root/models \
 -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-nvcr.io/nvidia/tritonserver:24.08-vllm-python-py3
+nvcr.io/nvidia/tritonserver:24.09-vllm-python-py3

 # Install the Triton CLI
-pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.11
+pip install git+https://github.com/triton-inference-server/triton_cli.git@0.1.0

 # Authenticate with huggingface for restricted models like Llama-2 and Llama-3
 huggingface-cli login

@@ -189,15 +214,7 @@ triton profile -m llama-3-8b-instruct --backend vllm
 > see [here](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/model_configuration.html#instance-groups).

 The following models are currently supported for automating TRT-LLM
-engine builds through the CLI:
-- `gpt2`
-- `opt125m`
-- `llama-2-7b`
-- `llama-2-7b-chat`
-- `llama-3-8b`
-- `llama-3-8b-instruct`
-- `llama-3.1-8b`
-- `llama-3.1-8b-instruct`
+engine builds through the CLI: https://nvidia.github.io/TensorRT-LLM/llm-api-examples/index.html#supported-models

 > [!NOTE]
 > 1. Building a TRT-LLM engine for Llama-2-7B, Llama-3-8B, or Llama-3.1-8B

@@ -222,10 +239,10 @@ docker run -ti \
 -v /tmp:/tmp \
 -v ${HOME}/models:/root/models \
 -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
+nvcr.io/nvidia/tritonserver:24.09-trtllm-python-py3

 # Install the Triton CLI
-pip install git+https://github.com/triton-inference-server/triton_cli.git@0.0.11
+pip install git+https://github.com/triton-inference-server/triton_cli.git@0.1.0

 # Authenticate with huggingface for restricted models like Llama-2 and Llama-3
 huggingface-cli login

@@ -282,5 +299,3 @@ and may not be as optimized as possible for your system or use case.
 - Triton CLI currently uses the TRT-LLM dependencies installed in its environment
   to build TRT-LLM engines, so you must take care to match the build-time and
   run-time versions of TRT-LLM.
-- Triton CLI currently does not support launching the server as a background
-  process.

pyproject.toml

Lines changed: 6 additions & 4 deletions

@@ -47,19 +47,21 @@ keywords = []
 requires-python = ">=3.10,<4"
 # TODO: Add [gpu] set of dependencies for trtllm once it's available on pypi
 dependencies = [
-    "grpcio>=1.65.5",
+    # Client deps - generally versioned together
+    "grpcio>=1.66.1",
+    # Use explicit client version matching genai-perf version for tagged release
+    "tritonclient[all] == 2.50",
+    "genai-perf @ git+https://github.com/triton-inference-server/[email protected]#subdirectory=genai-perf",
+    # Misc deps
     "directory-tree == 0.0.4", # may remove in future
     "docker == 6.1.3",
-    "genai-perf @ git+https://github.com/triton-inference-server/[email protected]#subdirectory=genai-perf",
     # TODO: rely on tritonclient to pull in protobuf and numpy dependencies?
     "numpy >=1.21,<2",
     "protobuf>=3.7.0",
     "prometheus-client == 0.19.0",
     "psutil >= 5.9.5", # may remove later
     "rich == 13.5.2",
     # TODO: Test on cpu-only machine if [cuda] dependency is an issue,
-    # Use explicit client version matching genai-perf version for tagged release
-    "tritonclient[all] == 2.49",
     "huggingface-hub >= 0.19.4",
     # Testing
     "pytest >= 8.1.1", # may remove later

src/triton_cli/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -24,4 +24,4 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-__version__ = "0.0.11"
+__version__ = "0.1.0"

src/triton_cli/docker/Dockerfile

Lines changed: 2 additions & 2 deletions

@@ -1,9 +1,9 @@
 # TRT-LLM image contains engine building and runtime dependencies
-FROM nvcr.io/nvidia/tritonserver:24.08-trtllm-python-py3
+FROM nvcr.io/nvidia/tritonserver:24.09-trtllm-python-py3

 # Setup vLLM Triton backend
 RUN mkdir -p /opt/tritonserver/backends/vllm && \
-    git clone -b r24.08 https://github.com/triton-inference-server/vllm_backend.git /tmp/vllm_backend && \
+    git clone -b r24.09 https://github.com/triton-inference-server/vllm_backend.git /tmp/vllm_backend && \
     cp -r /tmp/vllm_backend/src/* /opt/tritonserver/backends/vllm && \
     rm -r /tmp/vllm_backend
src/triton_cli/parser.py

Lines changed: 7 additions & 2 deletions

@@ -228,8 +228,7 @@ def parse_args_repo(parser):
         "--source",
         type=str,
         required=False,
-        help="Local model path or model identifier. Use prefix 'hf:' to specify a HuggingFace model ID. "
-        "NOTE: HuggingFace model support is currently limited to Transformer models through the vLLM backend.",
+        help="Local model path or model identifier. Use prefix 'hf:' to specify a HuggingFace model ID, or 'local:' prefix to specify a file path to a model.",
     )

     repo_remove = parser.add_parser("remove", help="Remove model from model repository")

@@ -305,7 +304,13 @@ def start_server_with_fallback(args: argparse.Namespace, blocking=True):
         try:
             args.mode = mode
             server = start_server(args, blocking=blocking)
+        # TODO: Clean up re-entrant print error
+        except RuntimeError as e:
+            print(e)
+            break
         except Exception as e:
+            print(e)
+            print(type(e))
             msg = f"Failed to start server in '{mode}' mode. {e}"
             logger.debug(msg)
             errors.append(msg)
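The hunk above changes how the fallback loop reacts to failures: a `RuntimeError` now stops the retry loop immediately, while any other exception is recorded and the next launch mode is attempted. A minimal standalone sketch of that control flow follows; the function and mode names here are illustrative placeholders, not the CLI's actual API.

```python
# Simplified sketch of a try-each-mode fallback loop with an early-exit branch.
def start_with_fallback(modes=("local", "docker")):
    errors = []
    for mode in modes:
        try:
            return start_server(mode)  # hypothetical launcher that may raise
        except RuntimeError as e:
            # Unrecoverable condition: report it and stop trying other modes.
            print(e)
            break
        except Exception as e:
            # Recoverable failure: record it and fall back to the next mode.
            errors.append(f"Failed to start server in '{mode}' mode. {e}")
    raise RuntimeError("; ".join(errors) or "Server start aborted")


def start_server(mode):
    # Stand-in for the real launcher; always fails to demonstrate fallback.
    raise Exception(f"mode '{mode}' unavailable in this sketch")
```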

src/triton_cli/repository.py

Lines changed: 34 additions & 62 deletions

@@ -29,8 +29,8 @@
 import shutil
 import logging
 import subprocess
+import multiprocessing
 from pathlib import Path
-from rich.console import Console

 from directory_tree import display_tree

@@ -41,7 +41,6 @@
     TritonCLIException,
 )
 from triton_cli.trt_llm.engine_config_parser import parse_and_substitute
-from triton_cli.trt_llm.builder import TRTLLMBuilder

 from huggingface_hub import snapshot_download
 from huggingface_hub import utils as hf_utils

@@ -66,6 +65,7 @@

 SOURCE_PREFIX_HUGGINGFACE = "hf:"
 SOURCE_PREFIX_NGC = "ngc:"
+SOURCE_PREFIX_LOCAL = "local:"

 TRT_TEMPLATES_PATH = Path(__file__).parent / "templates" / "trt_llm"

@@ -75,35 +75,6 @@

 HF_TOKEN_PATH = Path.home() / ".cache" / "huggingface" / "token"

-# TODO: Improve this flow and reduce hard-coded model check locations
-SUPPORTED_TRT_LLM_BUILDERS = {
-    "facebook/opt-125m": {
-        "hf_allow_patterns": ["*.bin", "*.json", "*.txt"],
-    },
-    "meta-llama/Llama-2-7b-hf": {
-        "hf_allow_patterns": ["*.safetensors", "*.json"],
-    },
-    "meta-llama/Llama-2-7b-chat-hf": {
-        "hf_allow_patterns": ["*.safetensors", "*.json"],
-    },
-    "meta-llama/Meta-Llama-3-8B": {
-        "hf_allow_patterns": ["*.safetensors", "*.json"],
-    },
-    "meta-llama/Meta-Llama-3-8B-Instruct": {
-        "hf_allow_patterns": ["*.safetensors", "*.json"],
-    },
-    "meta-llama/Meta-Llama-3.1-8B": {
-        "hf_allow_patterns": ["*.safetensors", "*.json"],
-    },
-    "meta-llama/Meta-Llama-3.1-8B-Instruct": {
-        "hf_allow_patterns": ["*.safetensors", "*.json"],
-    },
-    "gpt2": {
-        "hf_allow_patterns": ["*.safetensors", "*.json"],
-        "hf_ignore_patterns": ["onnx/*"],
-    },
-}
-

 # NOTE: Thin wrapper around NGC CLI is a WAR for now.
 # TODO: Move out to generic files/interface for remote model stores

@@ -206,11 +177,19 @@ def add(
             backend = "tensorrtllm"
         # Local model path
         else:
-            logger.debug("No supported prefix detected, assuming local path")
+            if source.startswith(SOURCE_PREFIX_LOCAL):
+                logger.debug("Local prefix detected, parsing local file path")
+            else:
+                logger.info(
+                    "No supported --source prefix detected, assuming local path"
+                )
+
             source_type = "local"
             model_path = Path(source)
             if not model_path.exists():
-                raise TritonCLIException(f"{model_path} does not exist")
+                raise TritonCLIException(
+                    f"Local file path '{model_path}' provided by --source does not exist"
+                )

         model_dir, version_dir = self.__create_model_repository(name, version, backend)

@@ -349,23 +328,21 @@ def __generate_ngc_model(self, name: str, source: str):
             str(self.repo), name, engines_path, engines_path, "auto", dry_run=False
         )

-    def __generate_trtllm_model(self, name, huggingface_id):
-        builder_info = SUPPORTED_TRT_LLM_BUILDERS.get(huggingface_id)
-        if not builder_info:
-            raise TritonCLIException(
-                f"Building a TRT LLM engine for {huggingface_id} is not currently supported."
-            )
-
+    def __generate_trtllm_model(self, name: str, huggingface_id: str):
         engines_path = ENGINE_DEST_PATH + "/" + name
-        hf_download_path = ENGINE_DEST_PATH + "/" + name + "/hf_download"
-
         engines = [engine for engine in Path(engines_path).glob("*.engine")]
         if engines:
             logger.warning(
                 f"Found existing engine(s) at {engines_path}, skipping build."
             )
         else:
-            self.__build_trtllm_engine(huggingface_id, hf_download_path, engines_path)
+            # Run TRT-LLM build in a separate process to make sure it definitely
+            # cleans up any GPU memory used when done.
+            p = multiprocessing.Process(
+                target=self.__build_trtllm_engine, args=(huggingface_id, engines_path)
+            )
+            p.start()
+            p.join()

         # NOTE: In every case, the TRT LLM template should be filled in with values.
         # If the model exists, the CLI will raise an exception when creating the model repo.
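The child-process pattern in the hunk above is deliberate: the engine build allocates GPU memory inside TensorRT-LLM, and running it in a `multiprocessing.Process` ensures that memory is returned to the OS when the child exits, instead of lingering in the long-lived CLI process. A minimal, self-contained sketch of the same pattern, with a placeholder worker rather than the CLI's private method:

```python
import multiprocessing


def build_engine(model_id: str, output_dir: str) -> None:
    # Placeholder for a GPU-heavy build step; in the real code this is the
    # repository's __build_trtllm_engine, which imports tensorrt_llm lazily.
    print(f"building {model_id} into {output_dir}")


if __name__ == "__main__":
    # Run the build in a child process so all resources it acquires (notably
    # GPU memory) are reclaimed as soon as the process exits.
    p = multiprocessing.Process(target=build_engine, args=("gpt2", "/tmp/engines/gpt2"))
    p.start()
    p.join()
    if p.exitcode != 0:
        raise RuntimeError(f"engine build failed with exit code {p.exitcode}")
```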
@@ -375,30 +352,25 @@ def __generate_trtllm_model(self, name, huggingface_id):
             triton_model_dir=str(self.repo),
             bls_model_name=name,
             engine_dir=engines_path,
-            token_dir=hf_download_path,
+            token_dir=engines_path,
             token_type="auto",
             dry_run=False,
         )

-    def __build_trtllm_engine(self, huggingface_id, hf_download_path, engines_path):
-        builder_info = SUPPORTED_TRT_LLM_BUILDERS.get(huggingface_id)
-        hf_allow_patterns = builder_info["hf_allow_patterns"]
-        hf_ignore_patterns = builder_info.get("hf_ignore_patterns", None)
-        self.__download_hf_model(
-            huggingface_id,
-            hf_download_path,
-            allow_patterns=hf_allow_patterns,
-            ignore_patterns=hf_ignore_patterns,
-        )
+    def __build_trtllm_engine(self, huggingface_id: str, engines_path: Path):
+        from tensorrt_llm import LLM, BuildConfig

-        builder = TRTLLMBuilder(
-            huggingface_id=huggingface_id,
-            hf_download_path=hf_download_path,
-            engine_output_path=engines_path,
-        )
-        console = Console()
-        with console.status(f"Building TRT-LLM engine for {huggingface_id}..."):
-            builder.build()
+        # NOTE: Given config.json, can read from 'build_config' section and from_dict
+        config = BuildConfig()
+        # TODO: Expose more build args to user
+        # TODO: Discuss LLM API BuildConfig defaults
+        # config.max_input_len = 1024
+        # config.max_seq_len = 8192
+        # config.max_batch_size = 256
+
+        engine = LLM(huggingface_id, build_config=config)
+        # TODO: Investigate if LLM is internally saving a copy to a temp dir
+        engine.save(str(engines_path))

     def __create_model_repository(
         self, name: str, version: int = 1, backend: str = None
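This last hunk is the heart of the commit: instead of downloading checkpoints and driving a `trtllm-build`-style builder, the CLI now hands the HuggingFace model ID directly to TensorRT-LLM's high-level `LLM` API and saves the resulting engine. Below is a standalone sketch of that flow based only on the calls visible in the diff (`BuildConfig`, `LLM(...)`, `save(...)`); the model ID, output path, and commented-out limits are illustrative, and running it requires a GPU machine with the `tensorrt_llm` package installed (e.g. the 24.09 TRT-LLM container).

```python
# Minimal sketch of building and saving a TRT-LLM engine via the high-level
# LLM API, mirroring the new __build_trtllm_engine.
from pathlib import Path

from tensorrt_llm import LLM, BuildConfig


def build_engine(huggingface_id: str, engines_path: Path) -> None:
    # Build-time knobs live on BuildConfig; the commit keeps the defaults and
    # leaves tuning them as a TODO, so these overrides stay commented out.
    config = BuildConfig()
    # config.max_input_len = 1024
    # config.max_seq_len = 8192
    # config.max_batch_size = 256

    # Constructing LLM with a HuggingFace model ID pulls the checkpoint
    # (if needed) and builds the TensorRT engine in one step.
    engine = LLM(huggingface_id, build_config=config)

    # Persist the built engine so Triton's TRT-LLM backend can load it later.
    engine.save(str(engines_path))


if __name__ == "__main__":
    build_engine("gpt2", Path("/tmp/engines/gpt2"))
```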
