Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,11 @@ public enum RecommendationNotification {
RecommendationConstants.RecommendationNotificationMsgConstant.NOT_ENOUGH_DATA,
RecommendationConstants.RecommendationNotificationTypes.INFO
),
// INFO-level notification stating that accelerator recommendations are available
// (code 128001, inside the accelerator INFO subsection 128000 - 128999).
INFO_ACCELERATOR_RECOMMENDATIONS_AVAILABLE(
        RecommendationConstants.NotificationCodes.INFO_ACCELERATOR_RECOMMENDATIONS_AVAILABLE,
        RecommendationConstants.RecommendationNotificationMsgConstant.ACCELERATOR_RECOMMENDATIONS_AVAILABLE,
        RecommendationConstants.RecommendationNotificationTypes.INFO
),
ERROR_AMOUNT_MISSING_IN_CPU_SECTION(
RecommendationConstants.NotificationCodes.ERROR_AMOUNT_MISSING_IN_CPU_SECTION,
RecommendationConstants.RecommendationNotificationMsgConstant.AMOUNT_MISSING_IN_CPU_SECTION,
Expand Down Expand Up @@ -243,6 +248,11 @@ public enum RecommendationNotification {
RecommendationConstants.RecommendationNotificationMsgConstant.MEMORY_LIMITS_OPTIMISED,
RecommendationConstants.RecommendationNotificationTypes.NOTICE
),
// NOTICE-level notification stating that the accelerator is not supported
// (code 328001, inside the accelerator NOTICE subsection 328000 - 328999).
NOTICE_ACCELERATOR_NOT_SUPPORTED(
        RecommendationConstants.NotificationCodes.NOTICE_ACCELERATOR_NOT_SUPPORTED,
        RecommendationConstants.RecommendationNotificationMsgConstant.ACCELERATOR_NOT_SUPPORTED,
        RecommendationConstants.RecommendationNotificationTypes.NOTICE
),
CRITICAL_CPU_REQUEST_NOT_SET(
RecommendationConstants.NotificationCodes.CRITICAL_CPU_REQUEST_NOT_SET,
RecommendationConstants.RecommendationNotificationMsgConstant.CPU_REQUEST_NOT_SET,
Expand Down Expand Up @@ -306,6 +316,7 @@ public static final class NotificationCodes {
// SubSystem - Network 125000 - 125999 (10% of availability)
// SubSystem - Disk 126000 - 126999 (10% of availability)
// SubSystem - Power 127000 - 127999 (10% of availability)
// SubSystem - Accelerator 128000 - 128999 (10% of availability)
public static final int SECTION_INFO_END = 199999;
public static final int SECTION_INFO_SUBSECTION_GENERAL_INFO_START = 110000;
public static final int SECTION_INFO_SUBSECTION_GENERAL_INFO_END = 119999;
Expand Down Expand Up @@ -346,6 +357,9 @@ public static final class NotificationCodes {
public static final int SECTION_INFO_SUBSECTION_DATA_SUBSYSTEM_DISK_END = 126999;
public static final int SECTION_INFO_SUBSECTION_DATA_SUBSYSTEM_POWER_START = 127000;
public static final int SECTION_INFO_SUBSECTION_DATA_SUBSYSTEM_POWER_END = 127999;
public static final int SECTION_INFO_SUBSECTION_DATA_SUBSYSTEM_ACCELERATOR_START = 128000;
public static final int INFO_ACCELERATOR_RECOMMENDATIONS_AVAILABLE = 128001;
public static final int SECTION_INFO_SUBSECTION_DATA_SUBSYSTEM_ACCELERATOR_END = 128999;
public static final int SECTION_ERROR_START = 200000;

// Section - Error: 200000 - 299999
Expand Down Expand Up @@ -435,6 +449,9 @@ public static final class NotificationCodes {
public static final int SECTION_NOTICE_SUBSECTION_DATA_SUBSYSTEM_DISK_END = 326999;
public static final int SECTION_NOTICE_SUBSECTION_DATA_SUBSYSTEM_POWER_START = 327000;
public static final int SECTION_NOTICE_SUBSECTION_DATA_SUBSYSTEM_POWER_END = 327999;
public static final int SECTION_NOTICE_SUBSECTION_DATA_SUBSYSTEM_ACCELERATOR_START = 328000;
public static final int NOTICE_ACCELERATOR_NOT_SUPPORTED = 328001;
public static final int SECTION_NOTICE_SUBSECTION_DATA_SUBSYSTEM_ACCELERATOR_END = 328999;
public static final int SECTION_WARNING_START = 400000;

// Section - Warning: 400000 - 499999
Expand Down Expand Up @@ -619,6 +636,16 @@ public static final class NotificationCodes {
CRITICAL_MEMORY_LIMIT_NOT_SET,
Arrays.asList(CODES_CONTRADICT_MEMORY_LIMIT_NOT_SET)
);

// Contradicting Codes for ACCELERATOR_NOT_SUPPORTED
Integer[] CODES_CONTRADICT_ACCELERATOR_NOT_SUPPORTED = {
INFO_ACCELERATOR_RECOMMENDATIONS_AVAILABLE
};

CONTRADICTING_MAP.put(
NOTICE_ACCELERATOR_NOT_SUPPORTED,
Arrays.asList(CODES_CONTRADICT_ACCELERATOR_NOT_SUPPORTED)
);
}

private NotificationCodes() {
Expand Down Expand Up @@ -679,6 +706,8 @@ public static final class RecommendationNotificationMsgConstant {
public static final String MEMORY_REQUESTS_OPTIMISED = "Workload is optimised wrt MEMORY REQUESTS, no changes needed";
public static final String MEMORY_LIMITS_OPTIMISED = "Workload is optimised wrt MEMORY LIMITS, no changes needed";
public static final String ADDING_RECOMMENDATIONS_TO_DB_FAILED = "Failed to add recommendations to the DB ";
public static final String ACCELERATOR_RECOMMENDATIONS_AVAILABLE = "Accelerator Recommendations are available";
public static final String ACCELERATOR_NOT_SUPPORTED = "Accelerator is not supported by kruize";

private RecommendationNotificationMsgConstant() {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,11 @@ public static JSONObject calculateNamespaceMemoryUsage(IntervalResults intervalR
@Override
public Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> getAcceleratorRequestRecommendation(Map<Timestamp, IntervalResults> filteredResultsMap, ArrayList<RecommendationNotification> notifications) {

boolean setNotification = true;
if (null == notifications) {
LOGGER.error(KruizeConstants.ErrorMsgs.RecommendationErrorMsgs.EMPTY_NOTIFICATIONS_OBJECT);
setNotification = false;
}

List<Double> acceleratorCoreMaxValues = new ArrayList<>();
List<Double> acceleratorMemoryMaxValues = new ArrayList<>();
Expand Down Expand Up @@ -614,6 +619,12 @@ public Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> getAc
}

if (!isGpuWorkload) {
if (!acceleratorCoreMaxValues.isEmpty() || !acceleratorMemoryMaxValues.isEmpty()) {
if (setNotification) {
notifications.add(new RecommendationNotification(
RecommendationConstants.RecommendationNotification.NOTICE_ACCELERATOR_NOT_SUPPORTED));
}
}
return null;
}

Expand Down Expand Up @@ -656,7 +667,18 @@ public Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> getAc
memoryFraction = 1;
}

return RecommendationUtils.getMapWithOptimalProfile(acceleratorModel, coreFraction, memoryFraction);
Map<AnalyzerConstants.RecommendationItem, RecommendationConfigItem> returnMap = RecommendationUtils.getMapWithOptimalProfile(acceleratorModel, coreFraction, memoryFraction);

if (null != returnMap && !returnMap.isEmpty()) {
// Report that accelerator recommendations are available, but only when notifications can be recorded and the model is a Kruize-supported MIG accelerator
if (setNotification) {
if (RecommendationUtils.checkIfModelIsKruizeSupportedMIG(acceleratorModel))
notifications.add(new RecommendationNotification(
RecommendationConstants.RecommendationNotification.INFO_ACCELERATOR_RECOMMENDATIONS_AVAILABLE));
}
}

return returnMap;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,12 @@ public static boolean checkIfModelIsKruizeSupportedMIG(String modelName) {
H100_CHECK = (modelName.contains("H100") && modelName.contains("80GB"));
}

return A100_CHECK || H100_CHECK;
boolean H200_CHECK = false;
if (!A100_CHECK && !H100_CHECK) {
H200_CHECK = (modelName.contains("H200"));
}

return A100_CHECK || H100_CHECK || H200_CHECK;
}

public static Timestamp getNearestTimestamp(HashMap<Timestamp, IntervalResults> containerDataResults, Timestamp targetTime, int minutesRange) {
Expand Down Expand Up @@ -477,8 +482,49 @@ public static HashMap<AnalyzerConstants.RecommendationItem, RecommendationConfig
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_7_CORES_40GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.PROFILE_7G_80GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_7_CORES_80GB, recommendationConfigItem);
} // Adding H200 Partitions to the ladder
else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.H200_PROFILE_1G_18GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_18GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.H200_PROFILE_1G_35GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_35GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.H200_PROFILE_2G_35GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_2_CORES_35GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.H200_PROFILE_3G_71GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_3_CORES_71GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.H200_PROFILE_4G_71GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_4_CORES_71GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.H200_PROFILE_7G_141GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_7_CORES_141GB, recommendationConfigItem);
} // Adding B200 Partitions to the ladder
else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.B200_PROFILE_1G_23GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_23GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.B200_PROFILE_1G_45GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_45GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.B200_PROFILE_2G_45GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_2_CORES_45GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.B200_PROFILE_3G_90GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_3_CORES_90GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.B200_PROFILE_4G_90GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_4_CORES_90GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.B200_PROFILE_7G_180GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_7_CORES_180GB, recommendationConfigItem);
} // Adding RTX PRO 5000 Partitions to the ladder
else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.RTX_PRO_5000_PROFILE_1G_12GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_12GB, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.RTX_PRO_5000_PROFILE_2G_24GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_2_CORES_24GB_ME, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.RTX_PRO_5000_PROFILE_4G_48GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_4_CORES_48GB_GFX, recommendationConfigItem);
} // Adding RTX PRO 6000 Partitions to the ladder
else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.RTX_PRO_6000_PROFILE_1G_24GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_1_CORE_24GB_GFX, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.RTX_PRO_6000_PROFILE_2G_48GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_2_CORES_48GB_GFX, recommendationConfigItem);
} else if (acceleratorProfile.getProfileName().equalsIgnoreCase(AnalyzerConstants.AcceleratorConstants.AcceleratorProfiles.RTX_PRO_6000_PROFILE_4G_96GB)) {
returnMap.put(AnalyzerConstants.RecommendationItem.NVIDIA_GPU_PARTITION_4_CORES_96GB_GFX, recommendationConfigItem);
}
return returnMap;

return returnMap;
}

public static String getSupportedModelBasedOnModelName(String modelName) {
Expand All @@ -495,6 +541,18 @@ public static String getSupportedModelBasedOnModelName(String modelName) {

if (modelName.contains("H100") && modelName.contains("80GB"))
return AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H100_80_GB;
// NOTE: Not tested in real time; this only checks for predictable strings in the device name
if (modelName.contains("H200"))
return AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.H200_141_GB;
// NOTE: Not tested in real time; this only checks for predictable strings in the device name
if (modelName.contains("B200"))
return AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.B200_180_GB;
// NOTE: Not tested in real time; this only checks for predictable strings in the device name
if (modelName.contains("RTX") && modelName.contains("PRO") && modelName.contains("5000"))
return AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.BW_RTX_PRO_5000_48_GB;
// NOTE: Not tested in real time; this only checks for predictable strings in the device name
if (modelName.contains("RTX") && modelName.contains("PRO") && modelName.contains("6000"))
return AnalyzerConstants.AcceleratorConstants.SupportedAccelerators.BW_RTX_PRO_6000_96_GB;

return null;
}
Expand Down Expand Up @@ -536,6 +594,9 @@ public static double getFrameBufferBasedOnModel(String modelName) {
if (modelName.contains(AnalyzerConstants.AcceleratorConstants.AcceleratorMemory.GB_80))
return 80 * 1024;

if (modelName.contains(AnalyzerConstants.AcceleratorConstants.AcceleratorMemory.GB_141))
return 141 * 1024;

return -1;
}
}
Expand Down
Loading