Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ jobs:
run: |
git config --global --add safe.directory /__w/gpu-app-collection/gpu-app-collection
git submodule update --init -- src/cuda/cuda-samples
/bin/bash test-build.sh
/bin/bash test-build.sh ci

- name: Print Successful Apps
if: always()
Expand Down
428 changes: 215 additions & 213 deletions src/Makefile

Large diffs are not rendered by default.

4 changes: 0 additions & 4 deletions src/cuda/GPU_Microbenchmark/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,14 @@ BIN_DIR := $(BASE_DIR)/bin
SUB_DIRS = $(wildcard ubench/*/*/)
SUB_DIRS_ALL = $(SUB_DIRS:%=all-%)
SUB_DIRS_CLEAN = $(SUB_DIRS:%=clean-%)
SUB_DIRS_TUNER = $(SUB_DIRS:%=tuner-%)


all: create_dir $(SUB_DIRS_ALL)
tuner: create_dir $(SUB_DIRS_TUNER)
clean: delete_dir $(SUB_DIRS_CLEAN)

$(SUB_DIRS_ALL):
$(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%)

$(SUB_DIRS_TUNER):
$(MAKE) -C $(@:tuner-%=%) tuner
$(SUB_DIRS_CLEAN):
$(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean

Expand Down
2 changes: 1 addition & 1 deletion src/cuda/GPU_Microbenchmark/hw_def/blackwell_B200_hw_def.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

#define L1_SIZE (256 * 1024) // Max L1 size in bytes

#define CLK_FREQUENCY 1665 // frequency in MHz
// #define CLK_FREQUENCY 1665 // frequency in MHz

#define ISSUE_MODEL issue_model::single // single issue core or dual issue
#define CORE_MODEL core_model::subcore // subcore model or shared model
Expand Down
12 changes: 6 additions & 6 deletions src/cuda/GPU_Microbenchmark/hw_def/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,12 +61,12 @@ unsigned get_adjusted_CCD(enum dram_model model) {
return dram_model_burst_length[model] / dram_model_freq_ratio[model];
}

unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) {
unsigned channel_width =
dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model];
assert(total_memory_width % channel_width == 0);
return total_memory_width / channel_width;
}
// unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) {
// unsigned channel_width =
// dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model];
// assert(total_memory_width % channel_width == 0);
// return total_memory_width / channel_width;
// }

// DDR timing struct
struct DDR_Timing {
Expand Down
104 changes: 97 additions & 7 deletions src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@

#include <string>
#include <cstdlib>
#include <cstdint>
#include <iostream>
#include <cuda_runtime.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <cstring>


// Holds all GPU configuration parameters
struct GpuConfig
Expand Down Expand Up @@ -32,6 +39,9 @@ struct GpuConfig
unsigned THREADS_PER_SM = 2048; // Threads per SM (launch config)
unsigned BLOCKS_NUM = 640; // Total blocks launched
unsigned TOTAL_THREADS = 163840; // Total threads launched

unsigned FBP_COUNT = 0; // Frame Buffer Partitions
unsigned L2_BANKS = 0; // L2 Cache Banks (LTCs)
};
GpuConfig config;
// Parses short flags like --sm 80 into a GpuConfig object
Expand Down Expand Up @@ -124,7 +134,9 @@ inline void printGpuConfig(const GpuConfig &c = config)
<< "BLOCKS_PER_SM: " << c.BLOCKS_PER_SM << "\n"
<< "THREADS_PER_SM: " << c.THREADS_PER_SM << "\n"
<< "BLOCKS_NUM: " << c.BLOCKS_NUM << "\n"
<< "TOTAL_THREADS: " << c.TOTAL_THREADS << "\n";
<< "TOTAL_THREADS: " << c.TOTAL_THREADS << "\n"
<< "FBP_COUNT: " << c.FBP_COUNT << "\n"
<< "L2_BANKS: " << c.L2_BANKS << "\n";
}

// GPU error check
Expand All @@ -146,14 +158,91 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,

cudaDeviceProp deviceProp;

unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
// NVIDIA RM API defines
#define NV_IOCTL_MAGIC 'F'
#define NV_ESC_RM_ALLOC 0x2b
#define NV_ESC_RM_CONTROL 0x2a
#define NV_ESC_RM_FREE 0x29
#define NV01_ROOT_CLIENT 0x00000041
#define NV01_DEVICE_0 0x00000080
#define NV20_SUBDEVICE_0 0x00002080
#define NV2080_CTRL_CMD_GR_GET_INFO 0x20801201

// https://github.com/NVIDIA/open-gpu-kernel-modules/blob/580.95.05/src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080gr.h#L142
#define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS 0x00000015
#define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS 0x00000025

typedef uint32_t NvHandle;
typedef uint32_t NvV32;
typedef uint64_t NvP64;

// Query single GR info index using NVIDIA RM API
inline unsigned queryGrInfo(uint32_t info_index)
{
#ifdef TUNER
struct NVOS21_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectNew; NvV32 hClass; NvP64 pAllocParms; uint32_t paramsSize, status; };
struct NVOS54_PARAMETERS { NvHandle hClient, hObject; NvV32 cmd, flags; NvP64 params; uint32_t paramsSize, status; };
struct NVOS00_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectOld; uint32_t status; };
struct NV0080_ALLOC_PARAMETERS { uint32_t deviceId; NvHandle hClientShare, hTargetClient, hTargetDevice; NvV32 flags; uint32_t _pad0; uint64_t vaSpaceSize, vaStartInternal, vaLimitInternal; NvV32 vaMode; uint32_t _pad1; };
struct NV2080_ALLOC_PARAMETERS { uint32_t subDeviceId; };
struct NVXXXX_CTRL_XXX_INFO { uint32_t index, data; };
struct NV0080_CTRL_GR_ROUTE_INFO { uint32_t flags, _pad; uint64_t route; };
struct NV2080_CTRL_GR_GET_INFO_PARAMS { uint32_t grInfoListSize, _pad; NvP64 grInfoList; NV0080_CTRL_GR_ROUTE_INFO grRouteInfo; };

int ctl_fd = open("/dev/nvidiactl", O_RDWR);
if (ctl_fd < 0) {
fprintf(stderr, "DEBUG GR: Failed to open /dev/nvidiactl (errno=%d)\n", errno);
return 0;
}

auto rm_alloc = [&](NvHandle hClient, NvHandle hParent, NvHandle hObject, uint32_t hClass, void *pParams, uint32_t size) {
NVOS21_PARAMETERS p = {hClient, hParent, hObject, hClass, (NvP64)(uintptr_t)pParams, size, 0};
bool success = ioctl(ctl_fd, _IOWR(NV_IOCTL_MAGIC, NV_ESC_RM_ALLOC, NVOS21_PARAMETERS), &p) >= 0 && p.status == 0;
if (!success) fprintf(stderr, "DEBUG GR: rm_alloc failed for class 0x%x, status=0x%x\n", hClass, p.status);
return success;
};
auto rm_control = [&](NvHandle hClient, NvHandle hObject, uint32_t cmd, void *pParams, uint32_t size) {
NVOS54_PARAMETERS p = {hClient, hObject, cmd, 0, (NvP64)(uintptr_t)pParams, size, 0};
bool success = ioctl(ctl_fd, _IOWR(NV_IOCTL_MAGIC, NV_ESC_RM_CONTROL, NVOS54_PARAMETERS), &p) >= 0 && p.status == 0;
if (!success) fprintf(stderr, "DEBUG GR: rm_control failed for cmd 0x%x, status=0x%x\n", cmd, p.status);
return success;
};
auto rm_free = [&](NvHandle hClient, NvHandle hParent, NvHandle hObject) {
NVOS00_PARAMETERS p = {hClient, hParent, hObject, 0};
ioctl(ctl_fd, _IOWR(NV_IOCTL_MAGIC, NV_ESC_RM_FREE, NVOS00_PARAMETERS), &p);
};

NvHandle hClient = 0xCAFE0001, hDevice = 0xCAFE0002, hSubDevice = 0xCAFE0003;
NV0080_ALLOC_PARAMETERS devParams = {0};
NV2080_ALLOC_PARAMETERS subdevParams = {0};
NVXXXX_CTRL_XXX_INFO infoList[1] = {{info_index, 0}};
NV2080_CTRL_GR_GET_INFO_PARAMS grParams = {1, 0, (NvP64)(uintptr_t)infoList, {0, 0, 0}};

#pragma message("TUNER")
unsigned result = 0;
if (rm_alloc(hClient, hClient, hClient, NV01_ROOT_CLIENT, NULL, 0) &&
rm_alloc(hClient, hClient, hDevice, NV01_DEVICE_0, &devParams, sizeof(devParams)) &&
rm_alloc(hClient, hDevice, hSubDevice, NV20_SUBDEVICE_0, &subdevParams, sizeof(subdevParams)) &&
rm_control(hClient, hSubDevice, NV2080_CTRL_CMD_GR_GET_INFO, &grParams, sizeof(grParams))) {
result = infoList[0].data;
fprintf(stderr, "DEBUG GR: Successfully queried index 0x%x = %u\n", info_index, result);
} else {
fprintf(stderr, "DEBUG GR: Query sequence failed for index 0x%x\n", info_index);
}

rm_free(hClient, hDevice, hSubDevice);
rm_free(hClient, hClient, hDevice);
rm_free(hClient, hClient, hClient);
close(ctl_fd);
return result;
}

unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
{
cudaSetDevice(deviceID);
cudaGetDeviceProperties(&deviceProp, deviceID);

int clockRateKHz;
cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);

// core stats

config.SM_NUMBER = deviceProp.multiProcessorCount;
Expand Down Expand Up @@ -184,11 +273,12 @@ unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
config.MEM_SIZE = deviceProp.totalGlobalMem;
config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
#else
parseGpuConfigArgs(argc, argv);
config.CLK_FREQUENCY = clockRateKHz * 1e-3f;

#endif
config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS);

parseGpuConfigArgs(argc, argv);
printGpuConfig();

return 1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,9 @@
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef TUNER
#pragma message("TUNER")
#include "../../../hw_def/hw_def.h"
#define REPEAT_TIMES 2048
#else
#include "../../../hw_def/common/gpuConfig.h"
// #define THREADS_PER_BLOCK 1
// #define THREADS_PER_SM 1
// #define BLOCKS_NUM 1
// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM)
// #define WARP_SIZE 32
#define REPEAT_TIMES 16
#endif

// #define THREADS_PER_BLOCK 1024
// #define THREADS_PER_SM 2048
Expand All @@ -28,7 +17,7 @@

template <class T>
__global__ void atomic_bw(uint64_t *startClk, uint64_t *stopClk, T *data1,
T *res)
T *res, uint32_t repeat_times)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
// register T s1 = data1[gid];
Expand All @@ -43,7 +32,7 @@ __global__ void atomic_bw(uint64_t *startClk, uint64_t *stopClk, T *data1,
// start timing
uint64_t start = clock64();

for (uint32_t i = 0; i < REPEAT_TIMES; i++)
for (uint32_t i = 0; i < repeat_times; i++)
{
sum = sum + atomicAdd(&data1[(i * warpSize) + gid], 10);
}
Expand All @@ -64,7 +53,17 @@ int main(int argc, char *argv[])

intilizeDeviceProp(0, argc, argv);

unsigned ARRAY_SIZE = config.TOTAL_THREADS + (REPEAT_TIMES * config.WARP_SIZE);
// Parse command line arguments for --fast flag
uint32_t repeat_times = 2048; // default
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "--fast") == 0) {
std::cout << "Fast mode enabled: reducing repeat_times to 16" << std::endl;
repeat_times = 16;
break;
}
}

unsigned ARRAY_SIZE = config.TOTAL_THREADS + (repeat_times * config.WARP_SIZE);
uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t));
uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t));

Expand All @@ -90,7 +89,7 @@ int main(int argc, char *argv[])
cudaMemcpyHostToDevice));

atomic_bw<int32_t><<<config.BLOCKS_NUM, config.THREADS_PER_BLOCK>>>(startClk_g, stopClk_g,
data1_g, res_g);
data1_g, res_g, repeat_times);
gpuErrchk(cudaPeekAtLastError());

gpuErrchk(cudaMemcpy(startClk, startClk_g, config.TOTAL_THREADS * sizeof(uint32_t),
Expand All @@ -106,7 +105,7 @@ int main(int argc, char *argv[])
*std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]);
// uint64_t total_time = stopClk[0]-startClk[0];

bw = (((float)REPEAT_TIMES * (float)config.TOTAL_THREADS * 4 * 8) /
bw = (((float)repeat_times * (float)config.TOTAL_THREADS * 4 * 8) /
(float)(total_time));
printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw);
printf("Total Clk number = %ld \n", total_time);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,39 +3,13 @@
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

#ifdef TUNER
#include <string.h>

#include "../../../hw_def/hw_def.h"
#define REPEAT_TIMES 2048
#else
#define REPEAT_TIMES 16
#include "../../../hw_def/common/gpuConfig.h"
// #define THREADS_PER_BLOCK 1024
// #define THREADS_PER_SM 2048
// #define BLOCKS_NUM 160
// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM)
// #define WARP_SIZE 32

// #define ARRAY_SIZE TOTAL_THREADS

// #define gpuErrchk(ans) \
// { gpuAssert((ans), __FILE__, __LINE__); }
// inline void gpuAssert(cudaError_t code, const char *file, int line,
// bool abort = true) {
// if (code != cudaSuccess) {
// fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
// line);
// if (abort)
// exit(code);
// }
// }

#endif

template <class T>
__global__ void atomic_bw(uint32_t *startClk, uint32_t *stopClk, T *data1,
T *res)
T *res, uint32_t repeat_times)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
uint32_t sum;
Expand All @@ -46,7 +20,7 @@ __global__ void atomic_bw(uint32_t *startClk, uint32_t *stopClk, T *data1,
uint32_t start = 0;
asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory");

for (int j = 0; j < REPEAT_TIMES; ++j)
for (int j = 0; j < repeat_times; ++j)
{
sum = sum + atomicAdd(&data1[0], 10);
}
Expand All @@ -67,6 +41,17 @@ int main(int argc, char *argv[])
{

intilizeDeviceProp(0, argc, argv);
config.BLOCKS_NUM = config.SM_NUMBER * 2;
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;

// Parse command line arguments for --fast flag
uint32_t repeat_times = 2048; // default
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "--fast") == 0) {
repeat_times = 16;
break;
}
}

uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t));
uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t));
Expand All @@ -92,7 +77,7 @@ int main(int argc, char *argv[])
cudaMemcpyHostToDevice));

atomic_bw<int32_t><<<config.BLOCKS_NUM, config.THREADS_PER_BLOCK>>>(startClk_g, stopClk_g,
data1_g, res_g);
data1_g, res_g, repeat_times);
gpuErrchk(cudaPeekAtLastError());

gpuErrchk(cudaMemcpy(startClk, startClk_g, config.TOTAL_THREADS * sizeof(uint32_t),
Expand All @@ -107,7 +92,7 @@ int main(int argc, char *argv[])
*std::max_element(&stopClk[0], &stopClk[config.TOTAL_THREADS]) -
*std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]);
// uint32_t total_time = stopClk[0] - startClk[0];
bw = ((float)(REPEAT_TIMES * config.TOTAL_THREADS * 4) / (float)(total_time));
bw = ((float)(repeat_times * config.TOTAL_THREADS * 4) / (float)(total_time));
printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw);
printf("Total Clk number = %u \n", total_time);

Expand Down
Loading
Loading