Commit 8f577d3

build: Update CLI version references to 0.0.8 and Triton references to 24.05 (#72)

1 parent: db70fcb

File tree: 5 files changed, +17 −28 lines

.github/workflows/python-package.yaml
Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ jobs:
   build:
     runs-on: ${{ matrix.os }}
     container:
-      image: nvcr.io/nvidia/tritonserver:24.04-py3
+      image: nvcr.io/nvidia/tritonserver:24.05-py3
     strategy:
       fail-fast: false
       matrix:
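
The CI build job runs inside the Triton base container, so this image bump is what moves CI onto the 24.05 release. A quick local sanity check of the new tag (a sketch; assumes Docker is installed and NGC is reachable):

```bash
# Pull the updated Triton base image referenced by the CI workflow
docker pull nvcr.io/nvidia/tritonserver:24.05-py3
```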

README.md
Lines changed: 10 additions & 11 deletions

@@ -22,8 +22,8 @@ and running the CLI from within the latest corresponding `tritonserver`
 container image, which should have all necessary system dependencies installed.
 
 For vLLM and TRT-LLM, you can use their respective images:
-- `nvcr.io/nvidia/tritonserver:24.04-vllm-python-py3`
-- `nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3`
+- `nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3`
+- `nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3`
 
 If you decide to run the CLI on the host or in a custom image, please
 see this list of [additional dependencies](#additional-dependencies-for-custom-environments)

@@ -38,6 +38,7 @@ matrix below:
 
 | Triton CLI Version | TRT-LLM Version | Triton Container Tag |
 |:------------------:|:---------------:|:--------------------:|
+| 0.0.8              | v0.9.0          | 24.05                |
 | 0.0.7              | v0.9.0          | 24.04                |
 | 0.0.6              | v0.8.0          | 24.02, 24.03         |
 | 0.0.5              | v0.7.1          | 24.01                |

@@ -51,10 +52,10 @@ pip install git+https://github.com/triton-inference-server/triton_cli.git
 ```
 
 It is also possible to install from a specific branch name, a commit hash
-or a tag name. For example to install `triton_cli` with tag 0.0.7:
+or a tag name. For example to install `triton_cli` with a specific tag:
 
 ```bash
-GIT_REF="0.0.7"
+GIT_REF="0.0.8"
 pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
 ```
 

@@ -89,7 +90,7 @@ triton -h
 triton import -m gpt2
 
 # Start server pointing at the default model repository
-triton start --image nvcr.io/nvidia/tritonserver:24.04-vllm-python-py3
+triton start --image nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3
 
 # Infer with CLI
 triton infer -m gpt2 --prompt "machine learning is"

@@ -143,11 +144,10 @@ docker run -ti \
   --shm-size=1g --ulimit memlock=-1 \
   -v ${HOME}/models:/root/models \
   -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-  nvcr.io/nvidia/tritonserver:24.04-vllm-python-py3
+  nvcr.io/nvidia/tritonserver:24.05-vllm-python-py3
 
 # Install the Triton CLI
-GIT_REF="0.0.7"
-pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
+pip install git+https://github.com/triton-inference-server/[email protected]
 
 # Authenticate with huggingface for restricted models like Llama-2 and Llama-3
 huggingface-cli login

@@ -213,11 +213,10 @@ docker run -ti \
   -v /tmp:/tmp \
   -v ${HOME}/models:/root/models \
   -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
-  nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+  nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3
 
 # Install the Triton CLI
-GIT_REF="0.0.7"
-pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
+pip install git+https://github.com/triton-inference-server/[email protected]
 
 # Authenticate with huggingface for restricted models like Llama-2 and Llama-3
 huggingface-cli login
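
Once `triton start` brings the server up, readiness can be checked over Triton's standard HTTP health endpoint. A minimal sketch, assuming the server's default HTTP port 8000 is reachable from where you run it:

```bash
# Prints HTTP 200 once the server is up and ready to serve inference requests
curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/health/ready
```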

pyproject.toml
Lines changed: 2 additions & 1 deletion

@@ -58,7 +58,8 @@ dependencies = [
     "psutil >= 5.9.5", # may remove later
     "rich == 13.5.2",
     # TODO: Test on cpu-only machine if [cuda] dependency is an issue,
-    "tritonclient[all] >= 2.46",
+    # Use explicit client version matching genai-perf version for tagged release
+    "tritonclient[all] == 2.46",
     "huggingface-hub >= 0.19.4",
     # Testing
     "pytest >= 8.1.1", # may remove later

src/triton_cli/__init__.py
Lines changed: 1 addition & 1 deletion

@@ -24,4 +24,4 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-__version__ = "0.0.8dev"
+__version__ = "0.0.8"
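
With the `dev` suffix dropped, an installed package reports the tagged release directly. For example, reading the `__version__` attribute set above:

```bash
# Prints "0.0.8" for this tagged release
python -c "import triton_cli; print(triton_cli.__version__)"
```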

src/triton_cli/docker/Dockerfile
Lines changed: 3 additions & 14 deletions

@@ -1,20 +1,9 @@
-FROM nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+# TRT-LLM image contains engine building and runtime dependencies
+FROM nvcr.io/nvidia/tritonserver:24.05-trtllm-python-py3
 
 # Setup vLLM Triton backend
 RUN mkdir -p /opt/tritonserver/backends/vllm && \
     wget -P /opt/tritonserver/backends/vllm https://raw.githubusercontent.com/triton-inference-server/vllm_backend/main/src/model.py
 
-# TRT-LLM engine build dependencies
-# NOTE: torch 2.2.0 has a symbol conflict, so WAR is to install 2.1.2
-RUN pip install \
-    "psutil" \
-    "pynvml>=11.5.0" \
-    --extra-index-url https://pypi.nvidia.com/ "tensorrt-llm==0.9.0"
-
 # vLLM runtime dependencies
-RUN pip install \
-    # Triton 24.04 vLLM containers comes with "vllm==0.4.0.post1", but this has
-    # incompatible dependencies with trtllm==0.9.0 around torch and transformers.
-    "vllm==0.4.1"
-
-# TODO: Install Triton CLI in this image
+RUN pip install "vllm==0.4.3"
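
The simplified Dockerfile now relies on the 24.05 TRT-LLM base image for the engine-build dependencies it previously installed by hand, and only adds the vLLM backend on top. A minimal build sketch from the repository root; the `triton-cli` image tag is just an illustration:

```bash
# Build the combined TRT-LLM + vLLM image (tag name is illustrative)
docker build -t triton-cli -f src/triton_cli/docker/Dockerfile .
```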
