Skip to content

Commit 10a5f0f

Browse files
authored
Merge pull request #6812 from mszacillo/replica-estimator
Implement MaxAvailableComponentSets for general estimator
2 parents 64428fa + c2dd2a7 commit 10a5f0f

File tree

2 files changed

+344
-9
lines changed

2 files changed

+344
-9
lines changed

pkg/estimator/client/general.go

Lines changed: 138 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,15 +53,6 @@ func (ge *GeneralEstimator) MaxAvailableReplicas(_ context.Context, clusters []*
5353
return availableTargetClusters, nil
5454
}
5555

56-
// MaxAvailableComponentSets returns the maximum number of complete multi-component sets (in terms of replicas) that each cluster can host.
57-
func (ge *GeneralEstimator) MaxAvailableComponentSets(
58-
_ context.Context,
59-
_ *ComponentSetEstimationRequest) ([]ComponentSetEstimationResponse, error) {
60-
// Dummy implementation: return nothing for now
61-
// TODO: Implement as part of #6734
62-
return nil, nil
63-
}
64-
6556
func (ge *GeneralEstimator) maxAvailableReplicas(cluster *clusterv1alpha1.Cluster, replicaRequirements *workv1alpha2.ReplicaRequirements) int32 {
6657
//Note: resourceSummary must be deep-copied before using in the function to avoid modifying the original data structure.
6758
resourceSummary := cluster.Status.ResourceSummary.DeepCopy()
@@ -102,6 +93,144 @@ func (ge *GeneralEstimator) maxAvailableReplicas(cluster *clusterv1alpha1.Cluste
10293
return int32(maximumReplicas) // #nosec G115: integer overflow conversion int64 -> int32
10394
}
10495

96+
// MaxAvailableComponentSets (generic estimator) – resourceSummary only.
97+
func (ge *GeneralEstimator) MaxAvailableComponentSets(_ context.Context, req *ComponentSetEstimationRequest) ([]ComponentSetEstimationResponse, error) {
98+
responses := make([]ComponentSetEstimationResponse, len(req.Clusters))
99+
for i, cluster := range req.Clusters {
100+
maxComponentSets := ge.maxAvailableComponentSets(cluster, req.Components)
101+
responses[i] = ComponentSetEstimationResponse{Name: cluster.Name, Sets: maxComponentSets}
102+
}
103+
return responses, nil
104+
}
105+
106+
// maxAvailableComponentSets estimates how many complete sets of the given
// components this cluster can host, using only the cluster's ResourceSummary.
// The result is the minimum of the pod-count bound and every per-resource
// bound (available / per-set demand), optionally tightened by the resource
// models when that feature gate is enabled.
func (ge *GeneralEstimator) maxAvailableComponentSets(cluster *clusterv1alpha1.Cluster, components []*workv1alpha2.Component) int32 {
	// Deep-copy the summary so the arithmetic below can never mutate the
	// cluster's status object.
	resourceSummary := cluster.Status.ResourceSummary.DeepCopy()
	if resourceSummary == nil {
		return 0
	}

	// Aggregate per-set resource requirements (sum over components of
	// per-replica request × replica count).
	perSet := perSetRequirement(components)

	// Check pod constraint: if the cluster cannot schedule a single extra
	// pod, no set fits.
	available := availableResourceMap(resourceSummary)
	allowedPods := getAllowedPodNumber(resourceSummary)
	if allowedPods <= 0 {
		return 0
	}

	podsPerSet := podsInSet(components)
	if podsPerSet <= 0 {
		// No components (or no replicas) are defined, so there is no per-set
		// demand to divide by; return the max pod allowance as the estimate.
		return int32(allowedPods) // #nosec G115: integer overflow conversion int64 -> int32
	}

	// Upper bound imposed purely by schedulable pod slots.
	podBound := allowedPods / podsPerSet
	if len(perSet) == 0 || allZero(perSet) {
		// Components declare no resource requests; only the pod count limits us.
		return int32(podBound) // #nosec G115: integer overflow conversion int64 -> int32
	}

	// Find limiting resource requirement, which will bound maxSet calculation.
	// Note: a requested resource absent from the summary reads as 0 available
	// and conservatively yields 0 sets.
	maxSets := podBound
	for resName, req := range perSet {
		if req <= 0 {
			continue
		}

		resAvail := available[resName]
		if resAvail <= 0 {
			return 0 // no capacity for this resource
		}

		resBound := resAvail / req
		if resBound < maxSets {
			maxSets = resBound
		}
	}

	// Optionally refine the bound using customized resource models; a failure
	// there is logged and ignored so the summary-based estimate still stands.
	if features.FeatureGate.Enabled(features.CustomizedClusterResourceModeling) && len(cluster.Status.ResourceSummary.AllocatableModelings) > 0 {
		num, err := getMaximumSetsBasedOnResourceModels(cluster, components)
		if err != nil {
			klog.Warningf("Failed to get maximum sets based on resource models, skipping: %v", err)
		} else if num < maxSets {
			maxSets = num
		}
	}

	return int32(maxSets) // #nosec G115: integer overflow conversion int64 -> int32
}
162+
163+
// getMaximumSetsBasedOnResourceModels is a placeholder for future implementation.
// It should refine the maximum sets based on cluster resource models, similar
// to getMaximumReplicasBasedOnResourceModels but adapted to full component sets.
// Returning math.MaxInt64 makes it a no-op in the caller's min() comparison,
// so it never reduces the summary-based upper bound today.
func getMaximumSetsBasedOnResourceModels(_ *clusterv1alpha1.Cluster, _ []*workv1alpha2.Component) (int64, error) {
	// TODO: implement logic based on cluster.Spec.ResourceModels
	// For now, just return MaxInt64 so it never reduces the upper bound.
	return math.MaxInt64, nil
}
171+
172+
// podsInSet computes the total number of pods in the CRD
173+
func podsInSet(components []*workv1alpha2.Component) int64 {
174+
var sum int64
175+
for _, c := range components {
176+
sum += int64(c.Replicas)
177+
}
178+
return sum
179+
}
180+
181+
// perSetRequirement computes the aggregate resource(such as CPU, Memory, GPU, etc) demand of one set of components.
182+
func perSetRequirement(components []*workv1alpha2.Component) map[corev1.ResourceName]int64 {
183+
resourceRequirements := map[corev1.ResourceName]int64{}
184+
for _, c := range components {
185+
if c.ReplicaRequirements == nil || c.ReplicaRequirements.ResourceRequest == nil {
186+
continue
187+
}
188+
replicas := int64(c.Replicas)
189+
for resName, qty := range c.ReplicaRequirements.ResourceRequest {
190+
baseAmount := quantityAsInt64(qty)
191+
resourceRequirements[resName] += baseAmount * replicas
192+
}
193+
}
194+
return resourceRequirements
195+
}
196+
197+
// availableResourceMap parses the cluster resourceSummary and returns map of resourceName -> availableQuantity (int64)
198+
func availableResourceMap(resourceSummary *clusterv1alpha1.ResourceSummary) map[corev1.ResourceName]int64 {
199+
available := make(map[corev1.ResourceName]int64, len(resourceSummary.Allocatable))
200+
for key, allocatable := range resourceSummary.Allocatable {
201+
a := allocatable.DeepCopy()
202+
if allocated, ok := resourceSummary.Allocated[key]; ok {
203+
a.Sub(allocated)
204+
}
205+
if allocating, ok := resourceSummary.Allocating[key]; ok {
206+
a.Sub(allocating)
207+
}
208+
available[key] = quantityAsInt64(a)
209+
}
210+
return available
211+
}
212+
213+
// Converts quantity into an int representation depending on format
214+
func quantityAsInt64(q resource.Quantity) int64 {
215+
switch q.Format {
216+
case resource.DecimalSI, resource.DecimalExponent:
217+
return q.MilliValue()
218+
case resource.BinarySI:
219+
return q.Value()
220+
default:
221+
return q.Value()
222+
}
223+
}
224+
225+
func allZero(m map[corev1.ResourceName]int64) bool {
226+
for _, v := range m {
227+
if v != 0 {
228+
return false
229+
}
230+
}
231+
return true
232+
}
233+
105234
func getAllowedPodNumber(resourceSummary *clusterv1alpha1.ResourceSummary) int64 {
106235
var allocatable, allocated, allocating int64
107236
if resourceSummary.Allocatable != nil {

pkg/estimator/client/general_test.go

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -800,3 +800,209 @@ func TestMinimumModelIndex(t *testing.T) {
800800
})
801801
}
802802
}
803+
804+
// TestGetMaxAvailableComponentSetsGeneral exercises the summary-based
// per-cluster estimator (maxAvailableComponentSets) across pod-bound,
// resource-bound, extended-resource, and degenerate (nil/empty) inputs.
func TestGetMaxAvailableComponentSetsGeneral(t *testing.T) {
	tests := []struct {
		name       string
		cluster    *clusterv1alpha1.Cluster
		components []*workv1alpha2.Component
		expected   int32
	}{
		{
			// A cluster with no ResourceSummary can host nothing.
			name: "nil resource summary",
			cluster: &clusterv1alpha1.Cluster{
				Status: clusterv1alpha1.ClusterStatus{},
			},
			expected: 0,
		},
		{
			// All pod slots already allocated (10 of 10) -> zero sets.
			name: "no allowed pods",
			cluster: &clusterv1alpha1.Cluster{
				Status: clusterv1alpha1.ClusterStatus{
					ResourceSummary: &clusterv1alpha1.ResourceSummary{
						Allocatable: corev1.ResourceList{
							corev1.ResourcePods: resource.MustParse("10"),
						},
						Allocated: corev1.ResourceList{
							corev1.ResourcePods: resource.MustParse("10"),
						},
					},
				},
			},
			expected: 0,
		},
		{
			// With no components there is no per-set demand; the estimator
			// falls back to the pod allowance (10).
			name: "empty component list should return max pod allowance",
			cluster: &clusterv1alpha1.Cluster{
				Status: clusterv1alpha1.ClusterStatus{
					ResourceSummary: &clusterv1alpha1.ResourceSummary{
						Allocatable: corev1.ResourceList{
							corev1.ResourcePods: resource.MustParse("10"),
						},
					},
				},
			},
			expected: 10,
		},
		{
			// Per-set CPU demand: 1×1 + 2×1.5 = 4 CPU; 10 CPU available
			// -> 10/4 = 2 sets (CPU is the only requested resource).
			name: "basic resource estimation",
			cluster: &clusterv1alpha1.Cluster{
				Status: clusterv1alpha1.ClusterStatus{
					ResourceSummary: &clusterv1alpha1.ResourceSummary{
						Allocatable: corev1.ResourceList{
							corev1.ResourcePods:   resource.MustParse("100"),
							corev1.ResourceCPU:    resource.MustParse("10"),
							corev1.ResourceMemory: resource.MustParse("8Gi"),
						},
						Allocated: corev1.ResourceList{
							corev1.ResourcePods:   resource.MustParse("20"),
							corev1.ResourceCPU:    resource.MustParse("0"),
							corev1.ResourceMemory: resource.MustParse("2Gi"),
						},
					},
				},
			},
			components: []*workv1alpha2.Component{
				{
					Name:     "jobmanager",
					Replicas: 1,
					ReplicaRequirements: &workv1alpha2.ComponentReplicaRequirements{
						ResourceRequest: corev1.ResourceList{
							corev1.ResourceCPU: resource.MustParse("1"),
						},
					},
				},
				{
					Name:     "taskmanager",
					Replicas: 2,
					ReplicaRequirements: &workv1alpha2.ComponentReplicaRequirements{
						ResourceRequest: corev1.ResourceList{
							corev1.ResourceCPU: resource.MustParse("1.5"),
						},
					},
				},
			},
			expected: 2,
		},
		{
			name: "resource estimation with mixed components",
			cluster: &clusterv1alpha1.Cluster{
				Status: clusterv1alpha1.ClusterStatus{
					ResourceSummary: &clusterv1alpha1.ResourceSummary{
						Allocatable: corev1.ResourceList{
							corev1.ResourcePods:   resource.MustParse("100"),
							corev1.ResourceCPU:    resource.MustParse("10"),
							corev1.ResourceMemory: resource.MustParse("8Gi"),
						},
						Allocated: corev1.ResourceList{
							corev1.ResourcePods:   resource.MustParse("20"),
							corev1.ResourceCPU:    resource.MustParse("0"),
							corev1.ResourceMemory: resource.MustParse("2Gi"),
						},
					},
				},
			},
			components: []*workv1alpha2.Component{
				{
					Name:     "jobmanager",
					Replicas: 1,
					ReplicaRequirements: &workv1alpha2.ComponentReplicaRequirements{
						ResourceRequest: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("1"),
							corev1.ResourceMemory: resource.MustParse("2Gi"),
						},
					},
				},
				{
					Name:     "taskmanager",
					Replicas: 2,
					ReplicaRequirements: &workv1alpha2.ComponentReplicaRequirements{
						ResourceRequest: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("2000m"),
							corev1.ResourceMemory: resource.MustParse("2Gi"),
						},
					},
				},
			},
			// Per-set demand: 2 replicas × (2 CPU, 2Gi) + 1 replica × (1 CPU, 2Gi) = (5 CPU, 6Gi)
			// Cluster available: 10 CPU, 6Gi (8Gi allocatable − 2Gi allocated)
			// 10/5 = 2 sets (CPU), 6Gi/6Gi = 1 set (Mem)
			// min(2,1) = 1 set total
			expected: 1,
		},
		{
			// Resources are abundant; only 3 pod slots exist and each set
			// needs 3 pods -> 1 set.
			name: "estimation limited by pod count",
			cluster: &clusterv1alpha1.Cluster{
				Status: clusterv1alpha1.ClusterStatus{
					ResourceSummary: &clusterv1alpha1.ResourceSummary{
						Allocatable: corev1.ResourceList{
							corev1.ResourcePods:   resource.MustParse("3"),
							corev1.ResourceCPU:    resource.MustParse("100"),
							corev1.ResourceMemory: resource.MustParse("1Ti"),
						},
					},
				},
			},
			components: []*workv1alpha2.Component{
				{
					Name:     "small-component",
					Replicas: 3,
					ReplicaRequirements: &workv1alpha2.ComponentReplicaRequirements{
						ResourceRequest: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("10m"),
							corev1.ResourceMemory: resource.MustParse("1Mi"),
						},
					},
				},
			},
			expected: 1, // limited by pods
		},
		{
			// Extended (non-core) resources participate in the bound too.
			name: "custom resource estimation with GPUs",
			cluster: &clusterv1alpha1.Cluster{
				Status: clusterv1alpha1.ClusterStatus{
					ResourceSummary: &clusterv1alpha1.ResourceSummary{
						Allocatable: corev1.ResourceList{
							corev1.ResourcePods:   resource.MustParse("20"),
							corev1.ResourceCPU:    resource.MustParse("40"),
							corev1.ResourceMemory: resource.MustParse("64Gi"),
							"nvidia.com/gpu":      resource.MustParse("8"),
						},
						Allocated: corev1.ResourceList{
							corev1.ResourcePods:   resource.MustParse("0"),
							corev1.ResourceCPU:    resource.MustParse("0"),
							corev1.ResourceMemory: resource.MustParse("0Gi"),
							"nvidia.com/gpu":      resource.MustParse("0"),
						},
					},
				},
			},
			components: []*workv1alpha2.Component{
				{
					Name:     "gpu-worker",
					Replicas: 2,
					ReplicaRequirements: &workv1alpha2.ComponentReplicaRequirements{
						ResourceRequest: corev1.ResourceList{
							corev1.ResourceCPU:    resource.MustParse("4"),
							corev1.ResourceMemory: resource.MustParse("8Gi"),
							"nvidia.com/gpu":      resource.MustParse("1"),
						},
					},
				},
			},
			// Per-set demand: 2 replicas × (4 CPU, 8Gi, 1 GPU) = (8 CPU, 16Gi, 2 GPUs)
			// Cluster allocatable: 40 CPU, 64Gi, 8 GPUs
			// 40/8 = 5 sets (CPU), 64/16 = 4 sets (Mem), 8/2 = 4 sets (GPU)
			// min(5, 4, 4) = 4 sets total
			expected: 4,
		},
	}

	estimator := NewGeneralEstimator()
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// Exercise the unexported per-cluster estimator directly.
			result := estimator.maxAvailableComponentSets(tt.cluster, tt.components)
			assert.Equal(t, tt.expected, result)
		})
	}
}

0 commit comments

Comments
 (0)