Changes from all commits (39 commits):

- 820d389 Merge pull request #1 from anyscale/dwagner/ir-gleaming-splendor (akshay-anyscale, Jul 3, 2025)
- 208970f Add llama serving example (robertnishihara, Aug 7, 2025)
- 0d78714 Fixes (robertnishihara, Aug 7, 2025)
- 1f9a2ac Small fix for querying service (robertnishihara, Aug 7, 2025)
- 736f200 Merge pull request #3 from anyscale/minor (robertnishihara, Aug 7, 2025)
- 564b7eb Add readme (robertnishihara, Aug 7, 2025)
- ce08035 don't specify head node type (robertnishihara, Aug 7, 2025)
- 2c154ec Merge pull request #2 from anyscale/llm (robertnishihara, Aug 7, 2025)
- 4a45a63 Use L4 instead of A10G (robertnishihara, Aug 7, 2025)
- 4d47beb Merge pull request #4 from anyscale/llm2 (robertnishihara, Aug 7, 2025)
- 644e5fb nitpick: make base_url more pretty (#5) (Aydin-ab, Aug 8, 2025)
- 8c8bcb7 Example 3 upgrade vllm v1 (#10) (Aydin-ab, Aug 20, 2025)
- 00adce7 Initial FastVideo example (#9) (robertnishihara, Aug 20, 2025)
- 8bd77c5 minor (#11) (robertnishihara, Aug 23, 2025)
- f44c8c7 Tutorial 3 upgrade ray 2.49.0 (#12) (Aydin-ab, Sep 12, 2025)
- 1ab6397 Tensor parallel serving example with DeepSpeed and Transformers. (#15) (robertnishihara, Sep 20, 2025)
- 48c1f0f Add tutorial 4 Llama 3.1 70b (#13) (Aydin-ab, Oct 13, 2025)
- 7e59ccb Add SkyRL GRPO example. (#16) (robertnishihara, Oct 20, 2025)
- b6e9b13 Image processing example (#21) (robertnishihara, Nov 22, 2025)
- 4e334c6 dedup working (avigyabb, Nov 27, 2025)
- f75ed94 working job submit version (avigyabb, Nov 30, 2025)
- 5250318 remove unnecessary file (robertnishihara, Nov 30, 2025)
- 917840c changed custom compute config (avigyabb, Dec 1, 2025)
- ce0dd95 Merge branch 'nemo-curator-dedup' of https://github.com/avigyabb/exam… (avigyabb, Dec 1, 2025)
- 56c6a0b working version (avigyabb, Dec 9, 2025)
- 8a94f36 minimal working version (avigyabb, Dec 9, 2025)
- e79c91e working (avigyabb, Dec 27, 2025)
- 76ba5cb working (avigyabb, Dec 30, 2025)
- dacb6a9 working w/ laion (avigyabb, Dec 30, 2025)
- cf50787 working version at scale (avigyabb, Dec 31, 2025)
- 64d4b55 job yaml cleanup (avigyabb, Dec 31, 2025)
- 14b1a0c removed unnecessary functions in helper.py (avigyabb, Jan 2, 2026)
- 867627a changes to dockerfile (avigyabb, Jan 2, 2026)
- 6f0a72d working with ray data (avigyabb, Jan 12, 2026)
- 4e7b0ad working with image_processing (avigyabb, Jan 13, 2026)
- e56f18f working (avigyabb, Feb 10, 2026)
- d209f5f 10m example working faster (avigyabb, Mar 21, 2026)
- 5b795f1 working with ray 2.54 (avigyabb, Mar 21, 2026)
- 8657deb working (avigyabb, Mar 24, 2026)
4 changes: 3 additions & 1 deletion 02_service_hello_world/query.py
@@ -1,3 +1,5 @@
import os
from urllib.parse import urljoin
import requests

# The "anyscale service deploy" script outputs a line that looks like
@@ -9,7 +11,7 @@
base_url = <BASE_URL> # Fill this in.

resp = requests.get(
f"{base_url}/hello",
urljoin(base_url, "hello"),
params={"name": "Theodore"},
headers={"Authorization": f"Bearer {token}"})

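The switch from an f-string to `urljoin` in this hunk avoids a double slash when `base_url` ends with `/`. A quick sketch (the hostname is a made-up placeholder):

```python
from urllib.parse import urljoin

base_with_slash = "https://example-service.anyscale.com/"
base_no_slash = "https://example-service.anyscale.com"

# Naive f-string concatenation doubles the slash when base_url ends with "/".
print(f"{base_with_slash}/hello")          # https://example-service.anyscale.com//hello

# urljoin normalizes both cases to a single slash.
print(urljoin(base_with_slash, "hello"))   # https://example-service.anyscale.com/hello
print(urljoin(base_no_slash, "hello"))     # https://example-service.anyscale.com/hello
```

Since the service base URL is pasted in by hand, this makes the script robust to either form.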
8 changes: 8 additions & 0 deletions 03_deploy_llama_3_8b/Dockerfile
@@ -0,0 +1,8 @@
FROM anyscale/ray:2.49.0-slim-py312-cu128

# C compiler for Triton’s runtime build step (vLLM V1 engine)
# https://github.com/vllm-project/vllm/issues/2997
RUN sudo apt-get update && \
sudo apt-get install -y --no-install-recommends build-essential

RUN pip install vllm==0.10.0
64 changes: 64 additions & 0 deletions 03_deploy_llama_3_8b/README.md
@@ -0,0 +1,64 @@
# Deploy Llama 3.1 8b

This example uses Ray Serve along with vLLM to deploy a Llama 3.1 8b model as an Anyscale service.

## Install the Anyscale CLI

```bash
pip install -U anyscale
anyscale login
```

## Deploy the service

Clone the example from GitHub.

```bash
git clone https://github.com/anyscale/examples.git
cd examples/03_deploy_llama_3_8b
```

Deploy the service. Use `--env` to forward your Hugging Face token if you need authentication for gated models like Llama 3.

```bash
export HF_TOKEN=<INSERT HUGGING FACE TOKEN HERE>
anyscale service deploy -f service.yaml --env HF_TOKEN=$HF_TOKEN
```

If you’re using an ungated model, go to your `LLMConfig` (in `serve_llama_3_1_8b.py`), and set `model_source` to that model. Then, you can omit the Hugging Face token from both the config and the `anyscale service deploy` command.

## Understanding the example

- The [application code](https://github.com/anyscale/examples/blob/main/03_deploy_llama_3_8b/serve_llama_3_1_8b.py) sets the required accelerator type with `accelerator_type="L4"`. To use a different accelerator, replace `"L4"` with the desired name. See the [list of supported accelerators](https://docs.ray.io/en/latest/ray-core/accelerator-types.html#accelerator-types) for available options.
- Ray Serve automatically autoscales the number of model replicas between `min_replicas` and `max_replicas`. Ray Serve adapts the number of replicas by monitoring queue sizes. For more information on configuring autoscaling, see the [AutoscalingConfig documentation](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.config.AutoscalingConfig.html).
- This example uses vLLM, and the [Dockerfile](https://github.com/anyscale/examples/blob/main/03_deploy_llama_3_8b/Dockerfile) defines the service’s dependencies. When you run `anyscale service deploy`, the build process adds these dependencies on top of an Anyscale-provided base image.
- To configure vLLM, modify the `engine_kwargs` dictionary. See [Ray documentation for the `LLMConfig` object](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig).
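The autoscaling bullet above maps to the `deployment_config` passed into `LLMConfig`. A minimal sketch of the dictionary shape, with `target_ongoing_requests` added as an illustrative extra knob (key names follow the Ray Serve `AutoscalingConfig` docs; the values here are assumptions, not recommendations):

```python
# Hypothetical tuning of the example's autoscaling settings; keys mirror
# ray.serve.config.AutoscalingConfig, values are illustrative only.
deployment_config = dict(
    autoscaling_config=dict(
        min_replicas=1,              # never scale below one replica
        max_replicas=2,              # cap GPU spend at two replicas
        target_ongoing_requests=16,  # add replicas when per-replica load exceeds this
    )
)

cfg = deployment_config["autoscaling_config"]
assert cfg["min_replicas"] <= cfg["max_replicas"]
```

Passing this dict as `deployment_config=deployment_config` in `serve_llama_3_1_8b.py` would replace the example's defaults.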


## Query the service

The `anyscale service deploy` command outputs a line that looks like
```text
curl -H "Authorization: Bearer <SERVICE_TOKEN>" <BASE_URL>
```

From the output, you can extract the service token and base URL. Open [query.py](https://github.com/anyscale/examples/blob/main/03_deploy_llama_3_8b/query.py) and add them to the appropriate fields.
```python
token = <SERVICE_TOKEN>
base_url = <BASE_URL>
```

Query the model:
```bash
pip install openai
python query.py
```

View the service in the [services tab](https://console.anyscale.com/services) of the Anyscale console.

## Shutdown

Shut down your Anyscale service:
```bash
anyscale service terminate -n deploy-llama-3-1-8b
```
26 changes: 26 additions & 0 deletions 03_deploy_llama_3_8b/query.py
@@ -0,0 +1,26 @@
from urllib.parse import urljoin
from openai import OpenAI

# The "anyscale service deploy" script outputs a line that looks like
#
# curl -H "Authorization: Bearer <SERVICE_TOKEN>" <BASE_URL>
#
# From this, you can parse out the service token and base URL.
token = <SERVICE_TOKEN> # Fill this in. If deploying and querying locally, use token = "FAKE_KEY"
base_url = <BASE_URL> # Fill this in. If deploying and querying locally, use base_url = "http://localhost:8000"

client = OpenAI(base_url=urljoin(base_url, "v1"), api_key=token)

response = client.chat.completions.create(
model="my-llama-3.1-8B",
messages=[
{"role": "user", "content": "What's the capital of France?"}
],
stream=True
)

# Stream and print JSON
for chunk in response:
data = chunk.choices[0].delta.content
if data:
print(data, end="", flush=True)
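The streaming loop above prints each chunk's `delta.content` as it arrives. The same pattern can be exercised without a server, using `SimpleNamespace` objects as stand-ins for the OpenAI streaming chunk shape (`chunk.choices[0].delta.content`):

```python
from types import SimpleNamespace

def make_chunk(text):
    # Mimics the shape of an OpenAI streaming chunk.
    delta = SimpleNamespace(content=text)
    return SimpleNamespace(choices=[SimpleNamespace(delta=delta)])

# The final chunk often carries content=None, which the `if data:` guard skips.
stream = [make_chunk("The capital "), make_chunk("is Paris."), make_chunk(None)]

parts = []
for chunk in stream:
    data = chunk.choices[0].delta.content
    if data:
        parts.append(data)

print("".join(parts))  # The capital is Paris.
```

Accumulating into a list instead of printing is handy when you need the full completion as a string.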
32 changes: 32 additions & 0 deletions 03_deploy_llama_3_8b/serve_llama_3_1_8b.py
@@ -0,0 +1,32 @@
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
import os

llm_config = LLMConfig(
model_loading_config=dict(
model_id="my-llama-3.1-8B",
# Or unsloth/Meta-Llama-3.1-8B-Instruct for an ungated version
model_source="meta-llama/Llama-3.1-8B-Instruct",
),
accelerator_type="L4",
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1, max_replicas=2,
)
),
# We need to share our Hugging Face token with the workers so they can access the gated model.
# If your model is not gated, you can skip this.
runtime_env=dict(
env_vars={
"HF_TOKEN": os.environ["HF_TOKEN"]
}
),
engine_kwargs=dict(
max_model_len=8192,
)
)

app = build_openai_app({"llm_configs": [llm_config]})

# Uncomment the line below to run the service locally with Python.
# serve.run(app, blocking=True)
41 changes: 41 additions & 0 deletions 03_deploy_llama_3_8b/service.yaml
@@ -0,0 +1,41 @@
# View the docs https://docs.anyscale.com/reference/service-api#serviceconfig.

name: deploy-llama-3-1-8b

# When empty, use the default image. This can be an Anyscale-provided base image
# like anyscale/ray:2.43.0-slim-py312-cu125, a user-provided base image (provided
# that it meets certain specs), or you can build new images using the Anyscale
# image builder at https://console.anyscale.com/v2/container-images.

containerfile: ./Dockerfile

# When empty, Anyscale will auto-select the instance types. You can also specify
# minimum and maximum resources.
compute_config:
# head_node:
# instance_type: m5.2xlarge
# worker_nodes:
# - instance_type: m5.16xlarge
# min_nodes: 0
# max_nodes: 100
# - instance_type: m7a.24xlarge
# min_nodes: 0
# max_nodes: 100
# market_type: PREFER_SPOT # Defaults to ON_DEMAND
# - instance_type: g4dn.2xlarge
# min_nodes: 0
# max_nodes: 100
# market_type: PREFER_SPOT # Defaults to ON_DEMAND
auto_select_worker_config: true

# Path to a local directory or a remote URI to a .zip file (S3, GS, HTTP) that
# will be the working directory for the job. The files in the directory will be
# automatically uploaded to the job environment in Anyscale.
working_dir: .

# When empty, this uses the default Anyscale Cloud in your organization.
cloud:

# Specify the Ray Serve app to deploy.
applications:
- import_path: serve_llama_3_1_8b:app
8 changes: 8 additions & 0 deletions deploy_llama_3_1_70b/Dockerfile
@@ -0,0 +1,8 @@
FROM anyscale/ray:2.50.0-slim-py312-cu128

# C compiler for Triton’s runtime build step (vLLM V1 engine)
# https://github.com/vllm-project/vllm/issues/2997
RUN sudo apt-get update && \
sudo apt-get install -y --no-install-recommends build-essential

RUN pip install vllm==0.11.0
63 changes: 63 additions & 0 deletions deploy_llama_3_1_70b/README.md
@@ -0,0 +1,63 @@
# Deploy Llama 3.1 70b

This example uses Ray Serve along with vLLM to deploy a Llama 3.1 70b model as an Anyscale service. The same code can be used for similarly sized models.

## Install the Anyscale CLI

```bash
pip install -U anyscale
anyscale login
```

## Deploy the service

Clone the example from GitHub.

```bash
git clone https://github.com/anyscale/examples.git
cd examples/deploy_llama_3_1_70b
```

Deploy the service. Use `--env` to forward your Hugging Face token if you need authentication for gated models like Llama 3.

```bash
anyscale service deploy -f service.yaml --env HF_TOKEN=${HF_TOKEN:?HF_TOKEN is not set}
```

The `${HF_TOKEN:?HF_TOKEN is not set}` expansion raises an error if no Hugging Face token is present. If you don't have a Hugging Face token, you can use one of the ungated models (change `model_name` in [serve.py](https://github.com/anyscale/examples/blob/main/deploy_llama_3_1_70b/serve.py)). The Llama models not only require a Hugging Face token; you also need to request access to them ([here for 3.1](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct) and [here for 3.3](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)).
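The `${VAR:?message}` form is standard POSIX parameter expansion, not Anyscale-specific. A quick demonstration with a stand-in variable (names here are made up for illustration):

```shell
#!/bin/sh
# With the variable set, the expansion yields its value.
DEMO_TOKEN="hf_example"
echo "${DEMO_TOKEN:?DEMO_TOKEN is not set}"

# With the variable unset, the expansion prints the message and aborts.
# Running it in a subshell lets this script continue past the failure.
unset DEMO_TOKEN
(echo "${DEMO_TOKEN:?DEMO_TOKEN is not set}") 2>/dev/null || echo "aborted as expected"
```

This is why the deploy command fails fast with a clear message instead of shipping an empty token to the service.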

## Understanding the example

- The [application code](https://github.com/anyscale/examples/blob/main/deploy_llama_3_1_70b/serve.py) sets the required accelerator type with `accelerator_type="L40S"`. This accelerator type is available on AWS. On other clouds, use an accelerator type like `"A100"` or `"H100"`. See the [list of supported accelerators](https://docs.ray.io/en/latest/ray-core/accelerator-types.html#accelerator-types) for available options. Depending on the accelerator type you choose, you will also need to select the appropriate instance types in [service.yaml](https://github.com/anyscale/examples/blob/main/deploy_llama_3_1_70b/service.yaml).
- Ray Serve automatically autoscales the number of model replicas between `min_replicas` and `max_replicas`. Ray Serve adapts the number of replicas by monitoring queue sizes. For more information on configuring autoscaling, see the [AutoscalingConfig documentation](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.config.AutoscalingConfig.html).
- This example uses vLLM, and the [Dockerfile](https://github.com/anyscale/examples/blob/main/deploy_llama_3_1_70b/Dockerfile) defines the service’s dependencies. When you run `anyscale service deploy`, the build process adds these dependencies on top of an Anyscale-provided base image.
- To configure vLLM, modify the `engine_kwargs` dictionary. See [Ray documentation for the `LLMConfig` object](https://docs.ray.io/en/latest/serve/api/doc/ray.serve.llm.LLMConfig.html#ray.serve.llm.LLMConfig).


## Query the service

The `anyscale service deploy` command outputs a line that looks like
```text
curl -H "Authorization: Bearer <SERVICE_TOKEN>" <BASE_URL>
```

From the output, you can extract the service token and base URL. Open [query.py](https://github.com/anyscale/examples/blob/main/deploy_llama_3_1_70b/query.py) and add them to the appropriate fields.
```python
token = <SERVICE_TOKEN>
base_url = <BASE_URL>
```

Query the model:
```bash
pip install openai
python query.py
```

View the service in the [services tab](https://console.anyscale.com/services) of the Anyscale console.

## Shutdown

Shut down your Anyscale service:
```bash
anyscale service terminate -n deploy-70b
```
26 changes: 26 additions & 0 deletions deploy_llama_3_1_70b/query.py
@@ -0,0 +1,26 @@
from urllib.parse import urljoin
from openai import OpenAI

# The "anyscale service deploy" script outputs a line that looks like
#
# curl -H "Authorization: Bearer <SERVICE_TOKEN>" <BASE_URL>
#
# From this, you can parse out the service token and base URL.
token = <SERVICE_TOKEN> # Fill this in. If deploying and querying locally, use token = "FAKE_KEY"
base_url = <BASE_URL> # Fill this in. If deploying and querying locally, use base_url = "http://localhost:8000"

client = OpenAI(base_url=urljoin(base_url, "v1"), api_key=token)

response = client.chat.completions.create(
model="my-70b-model",
messages=[
{"role": "user", "content": "What's the capital of France?"}
],
stream=True
)

# Stream and print the response.
for chunk in response:
data = chunk.choices[0].delta.content
if data:
print(data, end="", flush=True)
37 changes: 37 additions & 0 deletions deploy_llama_3_1_70b/serve.py
@@ -0,0 +1,37 @@
from ray.serve.llm import LLMConfig, build_openai_app
import os

# model_name = "meta-llama/Llama-3.1-70B-Instruct"
# model_name = "meta-llama/Llama-3.3-70B-Instruct"
# model_name = "unsloth/Meta-Llama-3.1-70B-Instruct" # Ungated, no token required
model_name = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" # Ungated, no token required

llm_config = LLMConfig(
model_loading_config=dict(
model_id="my-70b-model",
model_source=model_name,
),
# Valid types (depending on what GPUs are available on the cloud) include "L40S", "A100", and "H100".
# If you use a cloud other than AWS, in addition to changing the accelerator type, you also need to
# change the compute_config in service.yaml.
accelerator_type="L40S",
deployment_config=dict(
autoscaling_config=dict(
min_replicas=1,
max_replicas=4,
)
),
    # If your model is gated (e.g., the Meta Llama checkpoints above), share your
    # Hugging Face token with the workers by adding
    #     runtime_env=dict(env_vars={"HF_TOKEN": os.environ["HF_TOKEN"]}),
    # here and running `export HF_TOKEN=<YOUR-HUGGINGFACE-TOKEN>` in a terminal.
    # The default model is ungated, so no token is required.
engine_kwargs=dict(
max_model_len=32768,
# Split weights among 8 GPUs in the node
tensor_parallel_size=8,
),
)

app = build_openai_app({"llm_configs": [llm_config]})

# Uncomment the lines below to run the service locally with Python.
# from ray import serve
# serve.run(app, blocking=True)
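With `tensor_parallel_size=8` and autoscaling between 1 and 4 replicas, the cluster's GPU footprint is easy to compute, assuming each engine replica spans exactly `tensor_parallel_size` GPUs (how vLLM tensor parallelism shards the weights):

```python
# Back-of-envelope GPU math for the config above: each replica shards the
# 70B weights across tensor_parallel_size GPUs, so total demand scales
# linearly with the replica count.
tensor_parallel_size = 8
min_replicas, max_replicas = 1, 4

min_gpus = min_replicas * tensor_parallel_size
max_gpus = max_replicas * tensor_parallel_size
print(min_gpus, max_gpus)  # 8 32
```

This is why the commented-out instance types in `service.yaml` are all 8-GPU nodes: one replica fills one node, and scaling out adds whole nodes.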
57 changes: 57 additions & 0 deletions deploy_llama_3_1_70b/service.yaml
@@ -0,0 +1,57 @@
# View the docs https://docs.anyscale.com/reference/service-api#serviceconfig.

name: deploy-70b

# When empty, use the default image. This can be an Anyscale-provided base image
# like anyscale/ray:2.49.2-slim-py312-cu128, a user-provided base image (provided
# that it meets certain specs), or you can build new images using the Anyscale
# image builder at https://console.anyscale.com/v2/container-images.

containerfile: ./Dockerfile

# Anyscale will auto-select the instance types, but you can also specify the instance
# types manually. Different GPU types are available on different clouds.
compute_config:
# head_node:
# instance_type: m5.2xlarge
# worker_nodes:
# # These instances are only available in AWS.
# - instance_type: p4d.24xlarge
# min_nodes: 0
# max_nodes: 1
# market_type: PREFER_SPOT # Defaults to ON_DEMAND
# - instance_type: p4de.24xlarge
# min_nodes: 0
# max_nodes: 1
# market_type: PREFER_SPOT # Defaults to ON_DEMAND
# - instance_type: p5.48xlarge
# min_nodes: 0
# max_nodes: 1
# market_type: PREFER_SPOT # Defaults to ON_DEMAND
#
# # These instances are only available in GCP.
# - instance_type: a2-highgpu-8g-nvidia-a100-40gb-8
# market_type: PREFER_SPOT
# - instance_type: a2-ultragpu-8g-nvidia-a100-80gb-8
# market_type: PREFER_SPOT
# - instance_type: a2-megagpu-16g-nvidia-a100-40gb-16
# market_type: PREFER_SPOT
# - instance_type: a3-highgpu-8g-nvidia-h100-80gb-8
# market_type: PREFER_SPOT
# - instance_type: a3-megagpu-8g-nvidia-h100-mega-80gb-8
# market_type: PREFER_SPOT
# - instance_type: a3-ultragpu-8g-nvidia-h200-141gb-8
# market_type: PREFER_SPOT
auto_select_worker_config: true

# Path to a local directory or a remote URI to a .zip file (S3, GS, HTTP) that
# will be the working directory for the job. The files in the directory will be
# automatically uploaded to the job environment in Anyscale.
working_dir: .

# When empty, this uses the default Anyscale Cloud in your organization.
cloud:

# Specify the Ray Serve app to deploy.
applications:
- import_path: serve:app