kubeflow · juliusvonkohout · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025 · Nov 24, 2025
diff --git a/README.md b/README.md
@@ -57,7 +57,7 @@ This repository periodically synchronizes all official Kubeflow components from
 | Component | Local Manifests Path | Upstream Revision | CPU (millicores) | Memory (Mi) |  PVC Storage (GB) |
 | - | - | - | - | - | - |
 | Training Operator | applications/training-operator/upstream | [v1.9.2](https://github.com/kubeflow/training-operator/tree/v1.9.2/manifests) | 3m | 25Mi | 0GB |
-| Trainer | applications/trainer/upstream | [f12a6d3](https://github.com/kubeflow/trainer/tree/f12a6d399a3dbb84d8829a5e7603ab310c45df6a/manifests) | 8m | 143Mi | 0GB |
+| Trainer | applications/trainer/upstream | [v2.1.0](https://github.com/kubeflow/trainer/tree/v2.1.0/manifests) | 8m | 143Mi | 0GB |
 | Notebook Controller | applications/jupyter/notebook-controller/upstream | [v1.10.0](https://github.com/kubeflow/kubeflow/tree/v1.10.0/components/notebook-controller/config) | 5m | 93Mi | 0GB |
 | PVC Viewer Controller | applications/pvcviewer-controller/upstream | [v1.10.0](https://github.com/kubeflow/kubeflow/tree/v1.10.0/components/pvcviewer-controller/config) | 15m | 128Mi | 0GB |
 | Tensorboard Controller | applications/tensorboard/tensorboard-controller/upstream | [v1.10.0](https://github.com/kubeflow/kubeflow/tree/v1.10.0/components/tensorboard-controller/config) | 15m | 128Mi | 0GB |

diff --git a/applications/trainer/upstream/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml b/applications/trainer/upstream/base/crds/trainer.kubeflow.org_clustertrainingruntimes.yaml
diff --git a/applications/trainer/upstream/base/crds/trainer.kubeflow.org_trainingruntimes.yaml b/applications/trainer/upstream/base/crds/trainer.kubeflow.org_trainingruntimes.yaml
diff --git a/applications/trainer/upstream/base/crds/trainer.kubeflow.org_trainjobs.yaml b/applications/trainer/upstream/base/crds/trainer.kubeflow.org_trainjobs.yaml
diff --git a/applications/trainer/upstream/base/manager/controller_manager_config.yaml b/applications/trainer/upstream/base/manager/controller_manager_config.yaml
@@ -0,0 +1,44 @@
+apiVersion: config.trainer.kubeflow.org/v1alpha1
+kind: Configuration  # manager settings file; handed to the manager via --config
+# Health probes: listen address and the readiness/liveness endpoint names
+health:
+  healthProbeBindAddress: :8081
+  readinessEndpointName: readyz
+  livenessEndpointName: healthz
+
+# Metrics endpoint: listen address, with secure serving switched on
+metrics:
+  bindAddress: :8443
+  secureServing: true
+
+# Webhook server: listen port and host
+webhook:
+  port: 9443
+  host: ""  # left empty; presumably means "all interfaces" - TODO confirm
+
+# Leader election: on/off switch, lock resource name/namespace, and timings
+leaderElection:
+  leaderElect: true
+  resourceName: trainer.kubeflow.org
+  resourceNamespace: ""  # left empty; presumably falls back to the manager's namespace - TODO confirm
+  leaseDuration: 15s
+  renewDeadline: 10s
+  retryPeriod: 2s
+
+# Controller: one concurrency value per resource, keyed as <Kind>.<API group>
+controller:
+  groupKindConcurrency:
+    TrainJob.trainer.kubeflow.org: 5
+    TrainingRuntime.trainer.kubeflow.org: 1
+    ClusterTrainingRuntime.trainer.kubeflow.org: 1
+
+# Certificate management: enabled, plus the webhook Service and Secret names
+certManagement:
+  enable: true
+  webhookServiceName: kubeflow-trainer-controller-manager
+  webhookSecretName: kubeflow-trainer-webhook-cert
+
+# Client connection: QPS and burst (presumably API client rate limits - TODO confirm)
+clientConnection:
+  qps: 50
+  burst: 100
diff --git a/applications/trainer/upstream/base/manager/kustomization.yaml b/applications/trainer/upstream/base/manager/kustomization.yaml
@@ -1,2 +1,16 @@
 resources:
   - manager.yaml
+
+# Stable ConfigMap name (no content hash); config edits alone will not change the Deployment spec
+generatorOptions:
+  disableNameSuffixHash: true
+
+# Builds ConfigMap kubeflow-trainer-config with controller_manager_config.yaml as its only file
+configMapGenerator:
+  - name: kubeflow-trainer-config
+    files:
+      - controller_manager_config.yaml
+
+# Patches the kubeflow-trainer-controller-manager Deployment to mount that ConfigMap and pass --config
+patches:
+  - path: manager_config_patch.yaml
diff --git a/applications/trainer/upstream/base/manager/manager_config_patch.yaml b/applications/trainer/upstream/base/manager/manager_config_patch.yaml
@@ -0,0 +1,21 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kubeflow-trainer-controller-manager  # patch target; matched by kind + name
+spec:
+  template:
+    spec:
+      containers:
+      - name: manager
+        args:  # NOTE(review): a strategic-merge patch replaces the whole args list - confirm manager.yaml has none to keep
+        - --config=/controller_manager_config.yaml  # must equal the mountPath below
+        - --zap-log-level=2
+        volumeMounts:
+        - name: kubeflow-trainer-config
+          mountPath: /controller_manager_config.yaml
+          subPath: controller_manager_config.yaml  # file key inside the ConfigMap; subPath mounts are not refreshed on ConfigMap updates
+          readOnly: true
+      volumes:
+      - name: kubeflow-trainer-config
+        configMap:
+          name: kubeflow-trainer-config  # produced by configMapGenerator in kustomization.yaml
diff --git a/applications/trainer/upstream/base/rbac/role.yaml b/applications/trainer/upstream/base/rbac/role.yaml
@@ -42,6 +42,15 @@ rules:
   - list
   - update
   - watch
+- apiGroups:
+  - coordination.k8s.io
+  resources:
+  - leases
+  verbs:
+  - create
+  - get
+  - list
+  - update
 - apiGroups:
   - jobset.x-k8s.io
   resources:
@@ -62,6 +71,7 @@ rules:
   - list
   - watch
 - apiGroups:
+  - scheduling.volcano.sh
   - scheduling.x-k8s.io
   resources:
   - podgroups

diff --git a/applications/trainer/upstream/base/runtimes/data-cache/kustomization.yaml b/applications/trainer/upstream/base/runtimes/data-cache/kustomization.yaml
@@ -0,0 +1,5 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:  # NOTE(review): ../kustomization.yaml does not appear to list this directory - presumably opt-in, confirm
+  - torch_distributed_with_cache.yaml
diff --git a/applications/trainer/upstream/base/runtimes/data-cache/torch_distributed_with_cache.yaml b/applications/trainer/upstream/base/runtimes/data-cache/torch_distributed_with_cache.yaml
@@ -0,0 +1,55 @@
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: ClusterTrainingRuntime
+metadata:
+  name: torch-distributed-with-cache
+  labels:
+    trainer.kubeflow.org/framework: torch
+spec:
+  mlPolicy:
+    numNodes: 1
+    torch:
+      numProcPerNode: auto
+  template:
+    spec:
+      replicatedJobs:
+        - name: dataset-initializer  # step 1: single-replica initializer job
+          replicas: 1
+          template:
+            metadata:
+              labels:
+                trainer.kubeflow.org/trainjob-ancestor-step: dataset-initializer
+            spec:
+              template:
+                spec:
+                  serviceAccountName: kubeflow-trainer-cache-initializer  # not defined in this file
+                  containers:
+                    - name: dataset-initializer
+                      image: ghcr.io/kubeflow/trainer/dataset-initializer
+                      env:
+                        - name: CACHE_IMAGE
+                          value: "ghcr.io/kubeflow/trainer/data-cache:v2.1.0"
+                        - name: TRAIN_JOB_NAME  # read from the pod's jobset-name label
+                          valueFrom:
+                            fieldRef:
+                              apiVersion: v1
+                              fieldPath: metadata.labels['jobset.sigs.k8s.io/jobset-name']
+        - name: node  # step 2: training job, gated on step 1 reaching Complete
+          dependsOn:
+            - name: dataset-initializer
+              status: Complete
+          template:
+            metadata:
+              labels:
+                trainer.kubeflow.org/trainjob-ancestor-step: trainer
+            spec:
+              template:
+                spec:
+                  containers:
+                    - name: node
+                      image: pytorch/pytorch:2.7.1-cuda12.8-cudnn9-runtime
+                      env:
+                        - name: TRAIN_JOB_NAME  # same label lookup as the initializer
+                          valueFrom:
+                            fieldRef:
+                              apiVersion: v1
+                              fieldPath: metadata.labels['jobset.sigs.k8s.io/jobset-name']
diff --git a/applications/trainer/upstream/base/runtimes/kustomization.yaml b/applications/trainer/upstream/base/runtimes/kustomization.yaml
@@ -3,6 +3,5 @@ kind: Kustomization
 resources:
   - deepspeed_distributed.yaml
   - mlx_distributed.yaml
-  - mpi_distributed.yaml
   - torch_distributed.yaml
   - torchtune
diff --git a/applications/trainer/upstream/base/runtimes/mpi_distributed.yaml b/applications/trainer/upstream/base/runtimes/mpi_distributed.yaml
diff --git a/applications/trainer/upstream/base/runtimes/torchtune/kustomization.yaml b/applications/trainer/upstream/base/runtimes/torchtune/kustomization.yaml
@@ -3,3 +3,4 @@ kind: Kustomization
 resources:
   - llama3_2/llama3_2_1B.yaml
   - llama3_2/llama3_2_3B.yaml
+  - qwen2_5/qwen2_5_1.5B.yaml
diff --git a/applications/trainer/upstream/base/runtimes/torchtune/llama3_2/llama3_2_1B.yaml b/applications/trainer/upstream/base/runtimes/torchtune/llama3_2/llama3_2_1B.yaml
@@ -34,9 +34,6 @@ spec:
                       persistentVolumeClaim:
                         claimName: torchtune-llama3.2-1b
         - name: model-initializer
-          dependsOn:
-            - name: dataset-initializer
-              status: Complete
           template:
             metadata:
               labels:
@@ -59,6 +56,8 @@ spec:
                         claimName: torchtune-llama3.2-1b
         - name: node
           dependsOn:
+            - name: dataset-initializer
+              status: Complete
             - name: model-initializer
               status: Complete
           template:

diff --git a/applications/trainer/upstream/base/runtimes/torchtune/llama3_2/llama3_2_3B.yaml b/applications/trainer/upstream/base/runtimes/torchtune/llama3_2/llama3_2_3B.yaml
@@ -34,9 +34,6 @@ spec:
                       persistentVolumeClaim:
                         claimName: torchtune-llama3.2-3b
         - name: model-initializer
-          dependsOn:
-            - name: dataset-initializer
-              status: Complete
           template:
             metadata:
               labels:
@@ -59,6 +56,8 @@ spec:
                         claimName: torchtune-llama3.2-3b
         - name: node
           dependsOn:
+            - name: dataset-initializer
+              status: Complete
             - name: model-initializer
               status: Complete
           template:

diff --git a/applications/trainer/upstream/base/runtimes/torchtune/qwen2_5/qwen2_5_1.5B.yaml b/applications/trainer/upstream/base/runtimes/torchtune/qwen2_5/qwen2_5_1.5B.yaml
@@ -0,0 +1,96 @@
+apiVersion: trainer.kubeflow.org/v1alpha1
+kind: ClusterTrainingRuntime  # three-step runtime: dataset-initializer, model-initializer, node
+metadata:
+  name: torchtune-qwen2.5-1.5b
+  labels:
+    trainer.kubeflow.org/framework: torchtune
+spec:
+  mlPolicy:
+    numNodes: 1
+    torch:
+      numProcPerNode: auto
+  template:
+    spec:
+      replicatedJobs:
+        - name: dataset-initializer  # no dependsOn: not gated on any other step
+          template:
+            metadata:
+              labels:
+                trainer.kubeflow.org/trainjob-ancestor-step: dataset-initializer
+            spec:
+              template:
+                spec:
+                  containers:
+                    - name: dataset-initializer
+                      image: ghcr.io/kubeflow/trainer/dataset-initializer  # NOTE(review): untagged; presumably pinned by a kustomize images override - confirm
+                      env:
+                        - name: STORAGE_URI
+                          value: hf://tatsu-lab/alpaca
+                      volumeMounts:
+                        - mountPath: /workspace
+                          name: initializer
+                  volumes:
+                    - name: initializer
+                      persistentVolumeClaim:
+                        claimName: torchtune-qwen2.5-1.5b  # same claim is mounted by all three steps; PVC is not created in this file
+        - name: model-initializer  # no dependsOn: not gated on dataset-initializer
+          template:
+            metadata:
+              labels:
+                trainer.kubeflow.org/trainjob-ancestor-step: model-initializer
+            spec:
+              template:
+                spec:
+                  containers:
+                    - name: model-initializer
+                      image: ghcr.io/kubeflow/trainer/model-initializer  # NOTE(review): untagged, see dataset-initializer image
+                      env:
+                        - name: STORAGE_URI
+                          value: hf://Qwen/Qwen2.5-1.5B-Instruct
+                      volumeMounts:
+                        - name: initializer
+                          mountPath: /workspace
+                  volumes:
+                    - name: initializer
+                      persistentVolumeClaim:
+                        claimName: torchtune-qwen2.5-1.5b
+        - name: node  # training step; waits for both initializers to reach Complete
+          dependsOn:
+            - name: dataset-initializer
+              status: Complete
+            - name: model-initializer
+              status: Complete
+          template:
+            metadata:
+              labels:
+                trainer.kubeflow.org/trainjob-ancestor-step: trainer
+            spec:
+              template:
+                spec:
+                  containers:
+                    - name: node
+                      image: ghcr.io/kubeflow/trainer/torchtune-trainer  # NOTE(review): untagged, see dataset-initializer image
+                      command:  # torchtune CLI call; every path override below lives under the /workspace mount
+                        - tune
+                        - run
+                        - --rdzv_endpoint=localhost:29500
+                        - full_finetune_distributed
+                        - --config
+                        - qwen2_5/1.5B_full
+                        - dataset=torchtune.datasets.instruct_dataset
+                        - dataset.source=parquet
+                        - dataset.data_dir=/workspace/dataset/data  # presumably written by dataset-initializer - TODO confirm
+                        - output_dir=/workspace/output
+                        - tokenizer.path=/workspace/model/vocab.json  # presumably written by model-initializer - TODO confirm
+                        - tokenizer.merges_file=/workspace/model/merges.txt
+                        - checkpointer.checkpoint_dir=/workspace/model
+                      resources:
+                        limits:
+                          nvidia.com/gpu: 2  # GPU limit for this container
+                      volumeMounts:
+                        - mountPath: /workspace
+                          name: initializer
+                  volumes:
+                    - name: initializer
+                      persistentVolumeClaim:
+                        claimName: torchtune-qwen2.5-1.5b