diff --git a/.gitignore b/.gitignore index a6335ab0a..b729af5d6 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,13 @@ src/cuda/rodinia/3.1/cuda/particlefilter/particlefilter_naive src/cuda/rodinia/3.1/cuda/pathfinder/pathfinder 4.2 .venv/ -__pycache__/ \ No newline at end of file +__pycache__/ +compile_commands.json +.cache/ +tmp/ + +# Ignoring files without extension (but keep Makefile and files with extensions) +src/cuda/GPU_Microbenchmark/ubench/**/* +!src/cuda/GPU_Microbenchmark/ubench/**/*/ +!src/cuda/GPU_Microbenchmark/ubench/**/*.* +!src/cuda/GPU_Microbenchmark/ubench/**/Makefile \ No newline at end of file diff --git a/src/Makefile b/src/Makefile index 64a679840..237c10ebe 100644 --- a/src/Makefile +++ b/src/Makefile @@ -520,6 +520,11 @@ huggingface: chmod u+x $(BINDIR)/$(BINSUBDIR)/huggingface/helloworld chmod u+x $(BINDIR)/$(BINSUBDIR)/huggingface/*.py +tma: + mkdir -p $(BINDIR)/$(BINSUBDIR)/tma + cp -r cuda/tma $(BINDIR)/$(BINSUBDIR) + cd $(BINDIR)/$(BINSUBDIR)/tma && make all + clean_heterosync: rm -rf cuda/heterosync @@ -696,4 +701,7 @@ clean_cuda_samples: make clean -C ./cuda/cuda-samples/build clean_huggingface: - rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface \ No newline at end of file + rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface + +clean_tma: + rm -rf $(BINDIR)/$(BINSUBDIR)/tma diff --git a/src/cuda/GPU_Microbenchmark/.gitignore b/src/cuda/GPU_Microbenchmark/.gitignore index 3eedb5f9c..fc27dfcdc 100644 --- a/src/cuda/GPU_Microbenchmark/.gitignore +++ b/src/cuda/GPU_Microbenchmark/.gitignore @@ -1,3 +1,4 @@ bin/ *.o *.out +*.a \ No newline at end of file diff --git a/src/cuda/GPU_Microbenchmark/common/common.mk b/src/cuda/GPU_Microbenchmark/common/common.mk index 70ac7864f..ec372cecd 100644 --- a/src/cuda/GPU_Microbenchmark/common/common.mk +++ b/src/cuda/GPU_Microbenchmark/common/common.mk @@ -9,16 +9,42 @@ CC := nvcc LIB := -release: - $(CC) $(NVCC_FLAGS) $(CUOPTS) $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart +# Generate object file list from SRC (for parallel compilation) +CUDA_SRC_FILES := $(filter %.cu, $(SRC)) +CPP_SRC_FILES := $(filter %.cpp, $(SRC)) +C_SRC_FILES := $(filter %.c, $(SRC)) + +# To preserve PTX in multi-step compilation, we have to compile the CUDA source files to .a files +CUDA_LIB_FILES := $(CUDA_SRC_FILES:.cu=.a) + +# Host side source files +CPP_OBJECT_FILES := $(CPP_SRC_FILES:.cpp=.o) +C_OBJECT_FILES := $(C_SRC_FILES:.c=.o) +OBJECT_FILES := $(CPP_OBJECT_FILES) $(C_OBJECT_FILES) + +# If multiple source files are provided, compile them separately and link +# To preserve PTX in final binary: First create static library, then link to executable +# This avoids nvlink stripping PTX during device linking +release: $(CUDA_LIB_FILES) $(OBJECT_FILES) + $(CC) $(NVCC_FLAGS) $^ -o $(EXE) -L$(LIB) -lcudart cp $(EXE) $(BIN_DIR) +# Pattern rules: compile .cu files to static libraries (.a) and host .cpp/.c files to .o objects +%.a: %.cu + $(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) --lib $< -o $@ + +%.o: %.cpp + $(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@ + +%.o: %.c + $(CC) $(NVCC_FLAGS) $(INCLUDE) $(CUOPTS) -dc $< -o $@ + tuner: - $(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) -I$(INCLUDE) -L$(LIB) -lcudart + $(CC) $(NVCC_FLAGS) $(CUOPTS) -DTUNER $(SRC) -o $(EXE) $(INCLUDE) -L$(LIB) -lcudart cp $(EXE) $(BIN_DIR) clean: - rm -f *.o; rm -f $(EXE) + rm -f *.o $(OBJECT_FILES); rm -f $(EXE) $(CUDA_LIB_FILES) run: ./$(EXE) @@ -36,7 +62,7 @@ nvsight: nv-nsight-cu-cli --metrics
gpc__cycles_elapsed.avg,sm__cycles_elapsed.sum,smsp__inst_executed.sum,sm__warps_active.avg.pct_of_peak_sustained_active,l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum,l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum,lts__t_sectors_srcunit_tex_op_read.sum,lts__t_sectors_srcunit_tex_op_write.sum,lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum,lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum,lts__t_sector_op_read_hit_rate.pct,lts__t_sector_op_write_hit_rate.pct,lts__t_sectors_srcunit_tex_op_read.sum.per_second,dram__sectors_read.sum,dram__sectors_write.sum,dram__bytes_read.sum --csv --page raw ./$(EXE) | tee nsight.csv ptx: - cuobjdump -ptx ./$(EXE) tee ptx.txt + cuobjdump -ptx ./$(EXE) | tee ptx.txt sass: - cuobjdump -sass ./$(EXE) tee sass.txt + cuobjdump -sass ./$(EXE) | tee sass.txt diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/common.h b/src/cuda/GPU_Microbenchmark/hw_def/common/common.h index b84828646..50e9ec2e5 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/common/common.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/common/common.h @@ -22,7 +22,7 @@ enum dram_model { GDDR5 = 1, GDDR5X = 2, GDDR6 = 3, HBM = 4 }; // source: // https://stackoverflow.com/questions/466204/rounding-up-to-next-power-of-2 -unsigned round_up_2n(unsigned v) { +inline unsigned round_up_2n(unsigned v) { v--; v |= v >> 1; v |= v >> 2; @@ -34,9 +34,9 @@ unsigned round_up_2n(unsigned v) { return v; } -unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); } +inline unsigned round_up_2n(float n) { return round_up_2n((unsigned)ceil(n)); } -bool isPowerOfTwo(int n) { +inline bool isPowerOfTwo(int n) { if (n == 0) return false; @@ -51,17 +51,17 @@ static const unsigned dram_model_burst_length[] = {0, 8, 8, 16, 2}; static const unsigned dram_model_freq_ratio[] = {0, 4, 4, 4, 2}; // atom size = // dram_model_channel_width*dram_model_mem_per_ctrlr*dram_model_burst_length -unsigned get_atom_size_inByte(enum dram_model model) { +inline unsigned get_atom_size_inByte(enum dram_model model) { return (dram_model_bus_width[model] / 8) * dram_model_mem_per_ctrlr[model] * dram_model_burst_length[model]; } // CCD = dram_model_burst_length/dram_model_freq_ratio -unsigned get_adjusted_CCD(enum dram_model model) { +inline unsigned get_adjusted_CCD(enum dram_model model) { assert(dram_model_burst_length[model] % dram_model_freq_ratio[model] == 0); return dram_model_burst_length[model] / dram_model_freq_ratio[model]; } -unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) { +inline unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) { unsigned channel_width = dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model]; assert(total_memory_width % channel_width == 0); diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h index 8780356af..2a043af2d 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h @@ -33,7 +33,7 @@ struct GpuConfig unsigned BLOCKS_NUM = 640; // Total blocks launched unsigned TOTAL_THREADS = 163840; // Total threads launched }; -GpuConfig config; +inline GpuConfig config; // Parses short flags like --sm 80 into a GpuConfig object inline void parseGpuConfigArgs(int argc, char *argv[]) { @@ -144,9 +144,9 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, } } 
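// Context for the `inline` keywords added in this header (and in common.h above):
// common.mk now compiles every .cu/.cpp source to its own archive/object and links
// them into one binary, so these headers can be included from several translation
// units. Non-inline function definitions and the plain global `config` /
// `deviceProp` objects would then produce "multiple definition" link errors; C++17
// inline functions and inline variables give all translation units one shared
// definition. A minimal sketch of the pattern, with hypothetical names that are not
// part of this patch:
#pragma once
struct SharedConfig { unsigned threads_per_block = 256; };
inline SharedConfig g_shared_config;   // C++17 inline variable: one object program-wide
inline unsigned warps_per_block() {    // inline function: definable in every TU that includes this
  return g_shared_config.threads_per_block / 32;
}
// a.cu and b.cu can both include this header and be linked together without
// violating the one-definition rule.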
-cudaDeviceProp deviceProp; +inline cudaDeviceProp deviceProp; -unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[]) +inline unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[]) { #ifdef TUNER diff --git a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu index a82a1b6d1..0346bd2de 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu @@ -111,5 +111,5 @@ int main(int argc, char *argv[]) printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw); printf("Total Clk number = %ld \n", total_time); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu index 22052dfb0..bb2ee4f56 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu @@ -111,5 +111,5 @@ int main(int argc, char *argv[]) printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw); printf("Total Clk number = %u \n", total_time); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu index 6506be838..a6e18b554 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu @@ -97,5 +97,5 @@ int main(int argc, char *argv[]) printf("Atomic int32 latency = %f (clk)\n", latency); printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu index 72177aa4e..0d10907a7 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) dpu_max_flops(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu index 8f2f767e4..54c8fef1e 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) fpu_max_flops(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/Makefile b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/Makefile new file mode 100644 index 000000000..6a5651a5e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/Makefile @@ -0,0 +1,20 @@ +# Source files split for parallel compilation +# Use wildcard to automatically include all size-specific breakdown files +SRC = MaxFlops_gmma.cu $(wildcard kernels/MaxFlops_gmma_*.cu) + +EXE = MaxFlops_gmma + +# Add include path for CUTLASS +INCLUDE += -I$(GPUAPPS_ROOT)/src/cuda/cutlass-bench/include -I./ + +# GMMA is only supported in sm_90a +ARCH?=sm_90a +# Unset the CUDA_CPPFLAGS which is set based on CUDA version 
+CUDA_CPPFLAGS= +# Generate code for both sm_XXX and compute_XXX (SASS and PTX) +HOPPER_CUDA_CPPFLAGS=$(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) + +# CUTLASS cute library requires C++17 +NVCC_FLAGS := $(HOPPER_CUDA_CPPFLAGS) -std=c++17 + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma.cu new file mode 100644 index 000000000..275d97065 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma.cu @@ -0,0 +1,13 @@ +#include +#include "MaxFlops_gmma.h" +#include "../../../hw_def/hw_def.h" + +int main(int argc, char *argv[]) +{ + intilizeDeviceProp(0, argc, argv); + + // Run comprehensive sweep over all valid MMA operations + run_all_wgmma_maxflops_tests(); + + return 0; +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma.h b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma.h new file mode 100644 index 000000000..6f2cefdc3 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma.h @@ -0,0 +1,344 @@ +#ifndef MAXFLOPS_GMMA_DEF_H +#define MAXFLOPS_GMMA_DEF_H + +#include +#include +#include +#include + + +// Function declarations for test suites +// These are defined in separate .cu files for parallel compilation + +// F32 accumulator tests - TF32 x TF32 -> F32 +void run_f32tf32tf32tf32_64x8x8_test(); +void run_f32tf32tf32tf32_64x16x8_test(); +void run_f32tf32tf32tf32_64x32x8_test(); +void run_f32tf32tf32tf32_64x64x8_test(); +void run_f32tf32tf32tf32_64x96x8_test(); +void run_f32tf32tf32tf32_64x128x8_test(); +void run_f32tf32tf32tf32_64x192x8_test(); +void run_f32tf32tf32tf32_64x256x8_test(); + +// F32 accumulator tests - E4M3 x E4M3 -> F32 +void run_f32e4m3e4m3e4m3_64x8x32_test(); +void run_f32e4m3e4m3e4m3_64x16x32_test(); +void run_f32e4m3e4m3e4m3_64x32x32_test(); +void run_f32e4m3e4m3e4m3_64x64x32_test(); +void run_f32e4m3e4m3e4m3_64x96x32_test(); +void run_f32e4m3e4m3e4m3_64x128x32_test(); +void run_f32e4m3e4m3e4m3_64x192x32_test(); +void run_f32e4m3e4m3e4m3_64x256x32_test(); + +// F32 accumulator tests - E4M3 x E5M2 -> F32 +void run_f32e4m3e5m2e4m3_64x8x32_test(); +void run_f32e4m3e5m2e4m3_64x16x32_test(); +void run_f32e4m3e5m2e4m3_64x32x32_test(); +void run_f32e4m3e5m2e4m3_64x64x32_test(); +void run_f32e4m3e5m2e4m3_64x96x32_test(); +void run_f32e4m3e5m2e4m3_64x128x32_test(); +void run_f32e4m3e5m2e4m3_64x192x32_test(); +void run_f32e4m3e5m2e4m3_64x256x32_test(); + +// F32 accumulator tests - E5M2 x E4M3 -> F32 +void run_f32e5m2e4m3e5m2_64x8x32_test(); +void run_f32e5m2e4m3e5m2_64x16x32_test(); +void run_f32e5m2e4m3e5m2_64x32x32_test(); +void run_f32e5m2e4m3e5m2_64x64x32_test(); +void run_f32e5m2e4m3e5m2_64x96x32_test(); +void run_f32e5m2e4m3e5m2_64x128x32_test(); +void run_f32e5m2e4m3e5m2_64x192x32_test(); +void run_f32e5m2e4m3e5m2_64x256x32_test(); + +// F32 accumulator tests - E5M2 x E5M2 -> F32 +void run_f32e5m2e5m2e5m2_64x8x32_test(); +void run_f32e5m2e5m2e5m2_64x16x32_test(); +void run_f32e5m2e5m2e5m2_64x32x32_test(); +void run_f32e5m2e5m2e5m2_64x64x32_test(); +void run_f32e5m2e5m2e5m2_64x96x32_test(); +void run_f32e5m2e5m2e5m2_64x128x32_test(); +void run_f32e5m2e5m2e5m2_64x192x32_test(); +void run_f32e5m2e5m2e5m2_64x256x32_test(); + +// INT32 accumulator tests - INT8 x INT8 -> INT32 +void run_int32s8s8s8_64x8x32_test(); 
+void run_int32s8s8s8_64x16x32_test(); +void run_int32s8s8s8_64x32x32_test(); +void run_int32s8s8s8_64x64x32_test(); +void run_int32s8s8s8_64x96x32_test(); +void run_int32s8s8s8_64x128x32_test(); +void run_int32s8s8s8_64x192x32_test(); +void run_int32s8s8s8_64x256x32_test(); + +// INT32 accumulator tests - INT8 x UINT8 -> INT32 +void run_int32s8u8s8_64x8x32_test(); +void run_int32s8u8s8_64x16x32_test(); +void run_int32s8u8s8_64x32x32_test(); +void run_int32s8u8s8_64x64x32_test(); +void run_int32s8u8s8_64x96x32_test(); +void run_int32s8u8s8_64x128x32_test(); +void run_int32s8u8s8_64x192x32_test(); +void run_int32s8u8s8_64x256x32_test(); + +// INT32 accumulator tests - UINT8 x INT8 -> INT32 +void run_int32u8s8u8_64x8x32_test(); +void run_int32u8s8u8_64x16x32_test(); +void run_int32u8s8u8_64x32x32_test(); +void run_int32u8s8u8_64x64x32_test(); +void run_int32u8s8u8_64x96x32_test(); +void run_int32u8s8u8_64x128x32_test(); +void run_int32u8s8u8_64x192x32_test(); +void run_int32u8s8u8_64x256x32_test(); + +// INT32 accumulator tests - UINT8 x UINT8 -> INT32 +void run_int32u8u8u8_64x8x32_test(); +void run_int32u8u8u8_64x16x32_test(); +void run_int32u8u8u8_64x32x32_test(); +void run_int32u8u8u8_64x64x32_test(); +void run_int32u8u8u8_64x96x32_test(); +void run_int32u8u8u8_64x128x32_test(); +void run_int32u8u8u8_64x192x32_test(); +void run_int32u8u8u8_64x256x32_test(); + +// F16 accumulator tests (defined in lat_gmma_f16.cu) +// F32 accumulator tests - FP16 x FP16 -> F32 +void run_f32f16f16_64x8x16_test(); +void run_f32f16f16_64x16x16_test(); +void run_f32f16f16_64x32x16_test(); +void run_f32f16f16_64x64x16_test(); +void run_f32f16f16_64x96x16_test(); +void run_f32f16f16_64x128x16_test(); +void run_f32f16f16_64x192x16_test(); +void run_f32f16f16_64x256x16_test(); + +// F32 accumulator tests - BF16 x BF16 -> F32 +void run_f32bf16bf16_64x8x16_test(); +void run_f32bf16bf16_64x16x16_test(); +void run_f32bf16bf16_64x32x16_test(); +void run_f32bf16bf16_64x64x16_test(); +void run_f32bf16bf16_64x96x16_test(); +void run_f32bf16bf16_64x128x16_test(); +void run_f32bf16bf16_64x192x16_test(); +void run_f32bf16bf16_64x256x16_test(); + +// F16 accumulator tests - FP16 x FP16 -> F16 +void run_f16f16f16_64x8x16_test(); +void run_f16f16f16_64x16x16_test(); +void run_f16f16f16_64x32x16_test(); +void run_f16f16f16_64x64x16_test(); +void run_f16f16f16_64x96x16_test(); +void run_f16f16f16_64x128x16_test(); +void run_f16f16f16_64x192x16_test(); +void run_f16f16f16_64x256x16_test(); + +// F16 accumulator tests - E4M3 x E4M3 -> F16 +void run_f16e4m3e4m3_64x8x32_test(); +void run_f16e4m3e4m3_64x16x32_test(); +void run_f16e4m3e4m3_64x32x32_test(); +void run_f16e4m3e4m3_64x64x32_test(); +void run_f16e4m3e4m3_64x96x32_test(); +void run_f16e4m3e4m3_64x128x32_test(); +void run_f16e4m3e4m3_64x192x32_test(); +void run_f16e4m3e4m3_64x256x32_test(); + +// F16 accumulator tests - E4M3 x E5M2 -> F16 +void run_f16e4m3e5m2_64x8x32_test(); +void run_f16e4m3e5m2_64x16x32_test(); +void run_f16e4m3e5m2_64x32x32_test(); +void run_f16e4m3e5m2_64x64x32_test(); +void run_f16e4m3e5m2_64x96x32_test(); +void run_f16e4m3e5m2_64x128x32_test(); +void run_f16e4m3e5m2_64x192x32_test(); +void run_f16e4m3e5m2_64x256x32_test(); + +// F16 accumulator tests - E5M2 x E4M3 -> F16 +void run_f16e5m2e4m3_64x8x32_test(); +void run_f16e5m2e4m3_64x16x32_test(); +void run_f16e5m2e4m3_64x32x32_test(); +void run_f16e5m2e4m3_64x64x32_test(); +void run_f16e5m2e4m3_64x96x32_test(); +void run_f16e5m2e4m3_64x128x32_test(); +void run_f16e5m2e4m3_64x192x32_test(); +void 
run_f16e5m2e4m3_64x256x32_test(); + +// F16 accumulator tests - E5M2 x E5M2 -> F16 +void run_f16e5m2e5m2_64x8x32_test(); +void run_f16e5m2e5m2_64x16x32_test(); +void run_f16e5m2e5m2_64x32x32_test(); +void run_f16e5m2e5m2_64x64x32_test(); +void run_f16e5m2e5m2_64x96x32_test(); +void run_f16e5m2e5m2_64x128x32_test(); +void run_f16e5m2e5m2_64x192x32_test(); +void run_f16e5m2e5m2_64x256x32_test(); + +void run_f16accumulator_tests() { + run_f16f16f16_64x8x16_test(); + run_f16f16f16_64x16x16_test(); + run_f16f16f16_64x32x16_test(); + run_f16f16f16_64x64x16_test(); + run_f16f16f16_64x96x16_test(); + run_f16f16f16_64x128x16_test(); + run_f16f16f16_64x192x16_test(); + run_f16f16f16_64x256x16_test(); + run_f16e4m3e4m3_64x8x32_test(); + run_f16e4m3e4m3_64x16x32_test(); + run_f16e4m3e4m3_64x32x32_test(); + run_f16e4m3e4m3_64x64x32_test(); + run_f16e4m3e4m3_64x96x32_test(); + run_f16e4m3e4m3_64x128x32_test(); + run_f16e4m3e4m3_64x192x32_test(); + run_f16e4m3e4m3_64x256x32_test(); + run_f16e4m3e5m2_64x8x32_test(); + run_f16e4m3e5m2_64x16x32_test(); + run_f16e4m3e5m2_64x32x32_test(); + run_f16e4m3e5m2_64x64x32_test(); + run_f16e4m3e5m2_64x96x32_test(); + run_f16e4m3e5m2_64x128x32_test(); + run_f16e4m3e5m2_64x192x32_test(); + run_f16e4m3e5m2_64x256x32_test(); + run_f16e5m2e4m3_64x8x32_test(); + run_f16e5m2e4m3_64x16x32_test(); + run_f16e5m2e4m3_64x32x32_test(); + run_f16e5m2e4m3_64x64x32_test(); + run_f16e5m2e4m3_64x96x32_test(); + run_f16e5m2e4m3_64x128x32_test(); + run_f16e5m2e4m3_64x192x32_test(); + run_f16e5m2e4m3_64x256x32_test(); + run_f16e5m2e5m2_64x8x32_test(); + run_f16e5m2e5m2_64x16x32_test(); + run_f16e5m2e5m2_64x32x32_test(); + run_f16e5m2e5m2_64x64x32_test(); + run_f16e5m2e5m2_64x96x32_test(); + run_f16e5m2e5m2_64x128x32_test(); + run_f16e5m2e5m2_64x192x32_test(); + run_f16e5m2e5m2_64x256x32_test(); +} + +void run_f32accumulator_tests() { + run_f32tf32tf32tf32_64x8x8_test(); + run_f32tf32tf32tf32_64x16x8_test(); + run_f32tf32tf32tf32_64x32x8_test(); + run_f32tf32tf32tf32_64x64x8_test(); + run_f32tf32tf32tf32_64x96x8_test(); + run_f32tf32tf32tf32_64x128x8_test(); + run_f32tf32tf32tf32_64x192x8_test(); + run_f32tf32tf32tf32_64x256x8_test(); + run_f32f16f16_64x8x16_test(); + run_f32f16f16_64x16x16_test(); + run_f32f16f16_64x32x16_test(); + run_f32f16f16_64x64x16_test(); + run_f32f16f16_64x96x16_test(); + run_f32f16f16_64x128x16_test(); + run_f32f16f16_64x192x16_test(); + run_f32f16f16_64x256x16_test(); + run_f32bf16bf16_64x8x16_test(); + run_f32bf16bf16_64x16x16_test(); + run_f32bf16bf16_64x32x16_test(); + run_f32bf16bf16_64x64x16_test(); + run_f32bf16bf16_64x96x16_test(); + run_f32bf16bf16_64x128x16_test(); + run_f32bf16bf16_64x192x16_test(); + run_f32bf16bf16_64x256x16_test(); + run_f32e4m3e4m3e4m3_64x8x32_test(); + run_f32e4m3e4m3e4m3_64x16x32_test(); + run_f32e4m3e4m3e4m3_64x32x32_test(); + run_f32e4m3e4m3e4m3_64x64x32_test(); + run_f32e4m3e4m3e4m3_64x96x32_test(); + run_f32e4m3e4m3e4m3_64x128x32_test(); + run_f32e4m3e4m3e4m3_64x192x32_test(); + run_f32e4m3e4m3e4m3_64x256x32_test(); + run_f32e4m3e5m2e4m3_64x8x32_test(); + run_f32e4m3e5m2e4m3_64x16x32_test(); + run_f32e4m3e5m2e4m3_64x32x32_test(); + run_f32e4m3e5m2e4m3_64x64x32_test(); + run_f32e4m3e5m2e4m3_64x96x32_test(); + run_f32e4m3e5m2e4m3_64x128x32_test(); + run_f32e4m3e5m2e4m3_64x192x32_test(); + run_f32e4m3e5m2e4m3_64x256x32_test(); + run_f32e5m2e4m3e5m2_64x8x32_test(); + run_f32e5m2e4m3e5m2_64x16x32_test(); + run_f32e5m2e4m3e5m2_64x32x32_test(); + run_f32e5m2e4m3e5m2_64x64x32_test(); + run_f32e5m2e4m3e5m2_64x96x32_test(); + 
run_f32e5m2e4m3e5m2_64x128x32_test(); + run_f32e5m2e4m3e5m2_64x192x32_test(); + run_f32e5m2e4m3e5m2_64x256x32_test(); + run_f32e5m2e5m2e5m2_64x8x32_test(); + run_f32e5m2e5m2e5m2_64x16x32_test(); + run_f32e5m2e5m2e5m2_64x32x32_test(); + run_f32e5m2e5m2e5m2_64x64x32_test(); + run_f32e5m2e5m2e5m2_64x96x32_test(); + run_f32e5m2e5m2e5m2_64x128x32_test(); + run_f32e5m2e5m2e5m2_64x192x32_test(); + run_f32e5m2e5m2e5m2_64x256x32_test(); +} + +void run_int32accumulator_tests() { + run_int32s8s8s8_64x8x32_test(); + run_int32s8s8s8_64x16x32_test(); + run_int32s8s8s8_64x32x32_test(); + run_int32s8s8s8_64x64x32_test(); + run_int32s8s8s8_64x96x32_test(); + run_int32s8s8s8_64x128x32_test(); + run_int32s8s8s8_64x192x32_test(); + run_int32s8s8s8_64x256x32_test(); + run_int32s8u8s8_64x8x32_test(); + run_int32s8u8s8_64x16x32_test(); + run_int32s8u8s8_64x32x32_test(); + run_int32s8u8s8_64x64x32_test(); + run_int32s8u8s8_64x96x32_test(); + run_int32s8u8s8_64x128x32_test(); + run_int32s8u8s8_64x192x32_test(); + run_int32s8u8s8_64x256x32_test(); + run_int32u8s8u8_64x8x32_test(); + run_int32u8s8u8_64x16x32_test(); + run_int32u8s8u8_64x32x32_test(); + run_int32u8s8u8_64x64x32_test(); + run_int32u8s8u8_64x96x32_test(); + run_int32u8s8u8_64x128x32_test(); + run_int32u8s8u8_64x192x32_test(); + run_int32u8s8u8_64x256x32_test(); + run_int32u8u8u8_64x8x32_test(); + run_int32u8u8u8_64x16x32_test(); + run_int32u8u8u8_64x32x32_test(); + run_int32u8u8u8_64x64x32_test(); + run_int32u8u8u8_64x96x32_test(); + run_int32u8u8u8_64x128x32_test(); + run_int32u8u8u8_64x192x32_test(); + run_int32u8u8u8_64x256x32_test(); +} + +// ============================================================================ +// Main Test Function - Run All Configurations +// ============================================================================ + +inline void run_all_wgmma_maxflops_tests() { + printf("\n"); + printf("================================================================================\n"); + printf(" SM90 GMMA Max Flops Comprehensive Sweep\n"); + printf("================================================================================\n"); + printf("\n"); + + // Run F32 accumulator tests + run_f32accumulator_tests(); + + // Run F16 accumulator tests + run_f16accumulator_tests(); + + // Run INT32 accumulator tests + run_int32accumulator_tests(); + + printf("================================================================================\n"); + printf(" Sweep Complete\n"); + printf("================================================================================\n"); + printf("\n"); +} + +// Legacy function signatures for compatibility +float gmma_maxflops_ss() { + printf("Running comprehensive WGMMA max flops tests...\n"); + run_all_wgmma_maxflops_tests(); + return 0.0f; +} + +#endif diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma_common.h b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma_common.h new file mode 100644 index 000000000..5801f4537 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/MaxFlops_gmma_common.h @@ -0,0 +1,204 @@ +/*************************************************************************************************** + * GMMA Max Flops Microbenchmark - Common Definitions + * + * This header contains shared kernel templates and helper macros used by all GMMA max flops tests. 
+ * + **************************************************************************************************/ + +#ifndef MAXFLOPS_GMMA_COMMON_H +#define MAXFLOPS_GMMA_COMMON_H + +#include +#include +#include +#include +#include +#include + +#include "../../../hw_def/hw_def.h" +#include "cute/arch/util.hpp" + +// CUTLASS cute library headers +#include +#include "cutlass/numeric_types.h" +#include +#include +#include +#include +#include +#include + +using namespace cute; + +#define REPEAT_TIMES 1024 + +// ============================================================================ +// Base Kernel Template +// ============================================================================ + +template< + class ElementA, + class ElementB, + class ElementC, + class TileShape_MNK +> +__global__ void wgmma_max_flops_kernel(uint32_t *startClk, uint32_t *stopClk, uint32_t *checksum) { + int thread_idx = threadIdx.x + blockDim.x * threadIdx.y + threadIdx.z * blockDim.x * blockDim.y ; + int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0); + + static constexpr GMMA::Major GmmaMajorA = cute::GMMA::Major::K; + static constexpr GMMA::Major GmmaMajorB = cute::GMMA::Major::K; + + // Create the GMMA operation + auto gmma_op = cute::GMMA::ss_op_selector< + ElementA, ElementB, ElementC, TileShape_MNK, GmmaMajorA, GmmaMajorB>(); + using MMA_Op = decltype(gmma_op); + + // Create the TiledMma based on element types and tile shape + using TiledMma = decltype(cute::make_tiled_mma(gmma_op)); + using MMA_Traits = typename TiledMma::Traits; + TiledMma tiled_mma; + MMA_Traits traits; + + // Create the fragment A, B, C + // Define the smem layouts using GMMA layout helpers for K-major + // Using Layout_K_INTER_Atom which has minimal swizzling (Swizzle<0,4,3> = identity) + // Layout_K_INTER_Atom_Bits has shape (8, 128 bits) = (8, 4) for 32-bit elements + constexpr int PIPE = 1; + using SmemLayoutA = decltype(tile_to_shape(GMMA::Layout_K_INTER_Atom<ElementA>{}, + make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<PIPE>{}))); + using SmemLayoutB = decltype(tile_to_shape(GMMA::Layout_K_INTER_Atom<ElementB>{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int<PIPE>{}))); + + // Allocate shared memory with proper size (using type aliases for constexpr evaluation) + __shared__ ElementA smem_A[cosize_v<SmemLayoutA>]; + __shared__ ElementB smem_B[cosize_v<SmemLayoutB>]; + + // Create the layout objects for tensor construction + SmemLayoutA sA_layout{}; + SmemLayoutB sB_layout{}; + + // Create the tensors with GMMA-compatible layouts + Tensor sA = make_tensor(make_smem_ptr(smem_A), sA_layout); // (BLK_M, BLK_K, PIPE) + Tensor sB = make_tensor(make_smem_ptr(smem_B), sB_layout); // (BLK_N, BLK_K, PIPE) + constexpr int MmaWarpGroups = size(TiledMma{}) / cutlass::NumThreadsPerWarpGroup; + Layout warp_group_thread_layout = make_layout(Int<MmaWarpGroups>{}, + Int<cutlass::NumThreadsPerWarpGroup>{}); + auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); + + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + // Get fragment registers for accumulator with MN size + auto accum = partition_fragment_C(tiled_mma, take<0,2>(TileShape_MNK{})); + + tiled_mma.accumulate_ = GMMA::ScaleOut::One; + // tiled_mma.accumulate_ = GMMA::ScaleOut::Zero; + + __syncthreads(); + + // Start timing (only thread 0) + uint32_t start = 0; + if (thread_idx == 0) { + asm volatile("mov.u32 %0, %%clock;" : "=r"(start) :: "memory"); + } + __syncthreads(); + + // Fence accumulator operands + warpgroup_fence_operand(accum); + + // Arrive and execute WGMMA + warpgroup_arrive(); + + #pragma unroll + for (int j = 0; j < REPEAT_TIMES; j++) { + // Call the fma method + cute::gemm(tiled_mma, tCrA(_,_,_,0), tCrB(_,_,_,0), accum); + } + // Wait for WGMMA to complete + warpgroup_commit_batch(); + warpgroup_wait<0>(); + warpgroup_fence_operand(accum); + + __syncthreads(); + + // Stop timing + uint32_t stop = 0; + if (thread_idx == 0) { + asm volatile("mov.u32 %0, %%clock;" : "=r"(stop) :: "memory"); + } + + // Write results + if (thread_idx == 0) { + startClk[blockIdx.x] = start; + stopClk[blockIdx.x] = stop; + + uint32_t total = reinterpret_cast<uint32_t*>(accum.data())[0]; + checksum[blockIdx.x] = total; + } +} + +// ============================================================================ +// Host Function Template +// ============================================================================ + +template <class ElementA, class ElementB, class ElementC, class TileShape_MNK> +float run_wgmma_maxflops_test_typed() { + // Allocate device memory + uint32_t *startClk_g, *stopClk_g, *checksum_g; + gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); + gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); + gpuErrchk(cudaMalloc(&checksum_g, sizeof(uint32_t))); + + // Launch kernel with 256 threads + config.BLOCKS_NUM = 1; + config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; + int TOTAL_WARPS = config.TOTAL_THREADS / 32; + dim3 grid(config.BLOCKS_NUM); + dim3 block(config.THREADS_PER_BLOCK); + wgmma_max_flops_kernel<ElementA, ElementB, ElementC, TileShape_MNK><<<grid, block>>>(startClk_g, stopClk_g, checksum_g); + + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + + // Copy results back + uint32_t startClk, stopClk, checksum; + gpuErrchk(cudaMemcpy(&startClk, startClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(&stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(&checksum, checksum_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + // Calculate max instruction throughput + float inst_throughput = ((float)(REPEAT_TIMES) * TOTAL_WARPS) / ((float)(stopClk - startClk)); + + // Cleanup + cudaFree(startClk_g); + cudaFree(stopClk_g); + cudaFree(checksum_g); + + return inst_throughput; +} + +// ============================================================================ +// Helper Macro for Testing +// ============================================================================ + +#define TEST_MMA_CONFIG(EA, EB, EC, M, N, K, DESC) \ + do { \ + try { \ + using TileShape = decltype(make_shape(Int<M>{}, Int<N>{}, Int<K>{})); \ + float num_flop_per_warpgroup = 2 * M * N * K; \ + float warp_inst_per_cycle = run_wgmma_maxflops_test_typed<EA, EB, EC, TileShape>(); \ + const float warps_to_warpgroup = 0.25; \ + float flop_per_cycle_per_warpgroup = num_flop_per_warpgroup * warp_inst_per_cycle * warps_to_warpgroup; \ + printf("%-50s: %6.4f warp instructions/cycle\n", DESC, warp_inst_per_cycle); \ + printf("%-50s: %6.4f flop/warpgroup inst/cycle\n", DESC, flop_per_cycle_per_warpgroup); \ + } catch (...)
{ \ + printf("%-50s: FAILED\n", DESC); \ + } \ + } while(0) + +#endif // MAXFLOPS_GMMA_COMMON_H diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x128x32.cu new file mode 100644 index 000000000..89cbc223a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e4m3_64x128x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 128, 32, "MMA_64x128x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x16x32.cu new file mode 100644 index 000000000..7f149d513 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e4m3_64x16x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 16, 32, "MMA_64x16x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x192x32.cu new file mode 100644 index 000000000..05e22cc82 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e4m3_64x192x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 192, 32, "MMA_64x192x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x256x32.cu new file mode 100644 index 000000000..e499b955e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e4m3_64x256x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 256, 32, "MMA_64x256x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x32x32.cu new file mode 100644 index 000000000..26681630a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e4m3_64x32x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 32, 32, "MMA_64x32x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x64x32.cu new file mode 100644 index 000000000..a951da8fc --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e4m3_64x64x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 64, 
32, "MMA_64x64x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x8x32.cu new file mode 100644 index 000000000..1381b35a8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e4m3_64x8x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 8, 32, "MMA_64x8x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x96x32.cu new file mode 100644 index 000000000..7e55f0212 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e4m3_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e4m3_64x96x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 96, 32, "MMA_64x96x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x128x32.cu new file mode 100644 index 000000000..f384001f5 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e5m2_64x128x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 128, 32, "MMA_64x128x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x16x32.cu new file mode 100644 index 000000000..72c6588ab --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e5m2_64x16x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 16, 32, "MMA_64x16x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x192x32.cu new file mode 100644 index 000000000..7a4818492 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e5m2_64x192x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 192, 32, "MMA_64x192x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x256x32.cu new file mode 100644 index 000000000..1b7cbad0a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e5m2_64x256x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 256, 32, "MMA_64x256x32_F16E4M3E5M2_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x32x32.cu new file mode 100644 index 000000000..be64c0dad --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e5m2_64x32x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 32, 32, "MMA_64x32x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x64x32.cu new file mode 100644 index 000000000..988e49f84 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e5m2_64x64x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 64, 32, "MMA_64x64x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x8x32.cu new file mode 100644 index 000000000..902dbd12e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e5m2_64x8x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 8, 32, "MMA_64x8x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x96x32.cu new file mode 100644 index 000000000..a1fa678cb --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e4m3e5m2_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e4m3e5m2_64x96x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 96, 32, "MMA_64x96x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x128x32.cu new file mode 100644 index 000000000..a6e192790 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e4m3_64x128x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 128, 32, "MMA_64x128x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x16x32.cu new file mode 100644 index 000000000..db8221394 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e4m3_64x16x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 16, 32, "MMA_64x16x32_F16E5M2E4M3_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x192x32.cu new file mode 100644 index 000000000..6241fde11 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e4m3_64x192x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 192, 32, "MMA_64x192x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x256x32.cu new file mode 100644 index 000000000..a13409252 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e4m3_64x256x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 256, 32, "MMA_64x256x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x32x32.cu new file mode 100644 index 000000000..e799b07b4 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e4m3_64x32x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 32, 32, "MMA_64x32x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x64x32.cu new file mode 100644 index 000000000..d25f70a31 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e4m3_64x64x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 64, 32, "MMA_64x64x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x8x32.cu new file mode 100644 index 000000000..f2c7b2dfd --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e4m3_64x8x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 8, 32, "MMA_64x8x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x96x32.cu new file mode 100644 index 000000000..dbd17d4dd --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e4m3_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e4m3_64x96x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 96, 32, "MMA_64x96x32_F16E5M2E4M3_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x128x32.cu new file mode 100644 index 000000000..4e67dd998 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e5m2_64x128x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 128, 32, "MMA_64x128x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x16x32.cu new file mode 100644 index 000000000..09fc24aab --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e5m2_64x16x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 16, 32, "MMA_64x16x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x192x32.cu new file mode 100644 index 000000000..44e08c407 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e5m2_64x192x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 192, 32, "MMA_64x192x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x256x32.cu new file mode 100644 index 000000000..f88b93379 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e5m2_64x256x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 256, 32, "MMA_64x256x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x32x32.cu new file mode 100644 index 000000000..c45af8174 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e5m2_64x32x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 32, 32, "MMA_64x32x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x64x32.cu new file mode 100644 index 000000000..bd9197d5b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e5m2_64x64x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 64, 32, "MMA_64x64x32_F16E5M2E5M2_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x8x32.cu new file mode 100644 index 000000000..fbbe49dfa --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e5m2_64x8x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 8, 32, "MMA_64x8x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x96x32.cu new file mode 100644 index 000000000..f937be978 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16e5m2e5m2_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16e5m2e5m2_64x96x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 96, 32, "MMA_64x96x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x128x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x128x16.cu new file mode 100644 index 000000000..e5460eeec --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x128x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16f16f16_64x128x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 128, 16, "MMA_64x128x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x16x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x16x16.cu new file mode 100644 index 000000000..711317c7d --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x16x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16f16f16_64x16x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 16, 16, "MMA_64x16x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x192x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x192x16.cu new file mode 100644 index 000000000..58df17df9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x192x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16f16f16_64x192x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 192, 16, "MMA_64x192x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x256x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x256x16.cu new file mode 100644 index 000000000..c1e93af3e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x256x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16f16f16_64x256x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 256, 16, "MMA_64x256x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x32x16.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x32x16.cu new file mode 100644 index 000000000..da04f8487 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x32x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16f16f16_64x32x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 32, 16, "MMA_64x32x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x64x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x64x16.cu new file mode 100644 index 000000000..04b4b15d0 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x64x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16f16f16_64x64x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 64, 16, "MMA_64x64x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x8x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x8x16.cu new file mode 100644 index 000000000..2af004d15 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x8x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16f16f16_64x8x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 8, 16, "MMA_64x8x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x96x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x96x16.cu new file mode 100644 index 000000000..36e47381a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f16f16f16_64x96x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f16f16f16_64x96x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 96, 16, "MMA_64x96x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x128x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x128x16.cu new file mode 100644 index 000000000..d344d779b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x128x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32bf16bf16_64x128x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 128, 16, "MMA_64x128x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x16x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x16x16.cu new file mode 100644 index 000000000..e71e224b9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x16x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32bf16bf16_64x16x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 16, 16, "MMA_64x16x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x192x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x192x16.cu new file mode 100644 index 000000000..af2b3f7da --- /dev/null +++ 
b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x192x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32bf16bf16_64x192x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 192, 16, "MMA_64x192x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x256x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x256x16.cu new file mode 100644 index 000000000..51479ee83 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x256x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32bf16bf16_64x256x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 256, 16, "MMA_64x256x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x32x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x32x16.cu new file mode 100644 index 000000000..71d0d8004 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x32x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32bf16bf16_64x32x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 32, 16, "MMA_64x32x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x64x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x64x16.cu new file mode 100644 index 000000000..b12725d5e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x64x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32bf16bf16_64x64x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 64, 16, "MMA_64x64x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x8x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x8x16.cu new file mode 100644 index 000000000..6ef3dd507 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x8x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32bf16bf16_64x8x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 8, 16, "MMA_64x8x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x96x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x96x16.cu new file mode 100644 index 000000000..c5b2d031f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32bf16bf16_64x96x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32bf16bf16_64x96x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 96, 16, "MMA_64x96x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x128x32.cu new file mode 100644 index 000000000..73bc6e69a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x128x32.cu 
@@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x128x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 128, 32, "MMA_64x128x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x16x32.cu new file mode 100644 index 000000000..a3a9993ab --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x16x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 16, 32, "MMA_64x16x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x192x32.cu new file mode 100644 index 000000000..69f8ca0cc --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x192x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 192, 32, "MMA_64x192x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x256x32.cu new file mode 100644 index 000000000..edb19b2fb --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x256x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 256, 32, "MMA_64x256x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x32x32.cu new file mode 100644 index 000000000..2fa66a6ec --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x32x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 32, 32, "MMA_64x32x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x64x32.cu new file mode 100644 index 000000000..1ccbc7d1d --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x64x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 64, 32, "MMA_64x64x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x8x32.cu new file mode 100644 index 000000000..a88024a5c --- /dev/null +++ 
b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x8x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 8, 32, "MMA_64x8x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x96x32.cu new file mode 100644 index 000000000..6769f25a3 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e4m3e4m3_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x96x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 96, 32, "MMA_64x96x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x128x32.cu new file mode 100644 index 000000000..f3afe5bae --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x128x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 128, 32, "MMA_64x128x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x16x32.cu new file mode 100644 index 000000000..6aa004aaa --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x16x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 16, 32, "MMA_64x16x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x192x32.cu new file mode 100644 index 000000000..cb554a03f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x192x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 192, 32, "MMA_64x192x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x256x32.cu new file mode 100644 index 000000000..ed4634b86 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x256x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 256, 32, "MMA_64x256x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x32x32.cu new file mode 100644 index 
000000000..e4525ee3d --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x32x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 32, 32, "MMA_64x32x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x64x32.cu new file mode 100644 index 000000000..53ad31f79 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x64x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 64, 32, "MMA_64x64x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x8x32.cu new file mode 100644 index 000000000..31ee085c7 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x8x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 8, 32, "MMA_64x8x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x96x32.cu new file mode 100644 index 000000000..e0ef50a7a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e4m3e5m2e4m3_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x96x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 96, 32, "MMA_64x96x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x128x32.cu new file mode 100644 index 000000000..7204e23d8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x128x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 128, 32, "MMA_64x128x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x16x32.cu new file mode 100644 index 000000000..8a5f5f1c9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x16x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 16, 32, "MMA_64x16x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x192x32.cu new file 
mode 100644 index 000000000..908010764 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x192x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 192, 32, "MMA_64x192x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x256x32.cu new file mode 100644 index 000000000..67f4c1017 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x256x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 256, 32, "MMA_64x256x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x32x32.cu new file mode 100644 index 000000000..e295fd0a0 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x32x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 32, 32, "MMA_64x32x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x64x32.cu new file mode 100644 index 000000000..a56fe8029 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x64x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 64, 32, "MMA_64x64x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x8x32.cu new file mode 100644 index 000000000..a727adb81 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x8x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 8, 32, "MMA_64x8x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x96x32.cu new file mode 100644 index 000000000..c4e574bbb --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e4m3e5m2_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x96x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 96, 32, "MMA_64x96x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x128x32.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x128x32.cu new file mode 100644 index 000000000..01c4bd415 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x128x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 128, 32, "MMA_64x128x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x16x32.cu new file mode 100644 index 000000000..b7f661ea9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x16x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 16, 32, "MMA_64x16x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x192x32.cu new file mode 100644 index 000000000..f8d226033 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x192x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 192, 32, "MMA_64x192x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x256x32.cu new file mode 100644 index 000000000..e7ae64270 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x256x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 256, 32, "MMA_64x256x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x32x32.cu new file mode 100644 index 000000000..21491ddaa --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x32x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 32, 32, "MMA_64x32x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x64x32.cu new file mode 100644 index 000000000..2b422e261 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x64x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 64, 32, "MMA_64x64x32_F32E5M2E5M2_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x8x32.cu new file mode 100644 index 000000000..c61a78e14 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x8x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 8, 32, "MMA_64x8x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x96x32.cu new file mode 100644 index 000000000..11d421c0b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32e5m2e5m2e5m2_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x96x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 96, 32, "MMA_64x96x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x128x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x128x16.cu new file mode 100644 index 000000000..3517a5084 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x128x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32f16f16_64x128x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 128, 16, "MMA_64x128x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x16x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x16x16.cu new file mode 100644 index 000000000..cb68a3c74 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x16x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32f16f16_64x16x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 16, 16, "MMA_64x16x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x192x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x192x16.cu new file mode 100644 index 000000000..cc6629687 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x192x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32f16f16_64x192x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 192, 16, "MMA_64x192x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x256x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x256x16.cu new file mode 100644 index 000000000..169ff92b9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x256x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32f16f16_64x256x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 256, 16, "MMA_64x256x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x32x16.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x32x16.cu new file mode 100644 index 000000000..02fdcb42f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x32x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32f16f16_64x32x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 32, 16, "MMA_64x32x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x64x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x64x16.cu new file mode 100644 index 000000000..0bff3ff7f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x64x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32f16f16_64x64x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 64, 16, "MMA_64x64x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x8x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x8x16.cu new file mode 100644 index 000000000..6360475b2 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x8x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32f16f16_64x8x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 8, 16, "MMA_64x8x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x96x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x96x16.cu new file mode 100644 index 000000000..db2b8fba8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32f16f16_64x96x16.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32f16f16_64x96x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 96, 16, "MMA_64x96x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x128x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x128x8.cu new file mode 100644 index 000000000..9fb3d7d81 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x128x8.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32tf32tf32tf32_64x128x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 128, 8, "MMA_64x128x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x16x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x16x8.cu new file mode 100644 index 000000000..9fde1c917 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x16x8.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32tf32tf32tf32_64x16x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 16, 8, "MMA_64x16x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x192x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x192x8.cu new file mode 100644 index 000000000..0a2561dff --- 
/dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x192x8.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32tf32tf32tf32_64x192x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 192, 8, "MMA_64x192x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x256x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x256x8.cu new file mode 100644 index 000000000..8b27cca9f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x256x8.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32tf32tf32tf32_64x256x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 256, 8, "MMA_64x256x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x32x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x32x8.cu new file mode 100644 index 000000000..cee95e28a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x32x8.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32tf32tf32tf32_64x32x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 32, 8, "MMA_64x32x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x64x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x64x8.cu new file mode 100644 index 000000000..10f88cbaf --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x64x8.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32tf32tf32tf32_64x64x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 64, 8, "MMA_64x64x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x8x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x8x8.cu new file mode 100644 index 000000000..c1d7add84 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x8x8.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32tf32tf32tf32_64x8x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 8, 8, "MMA_64x8x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x96x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x96x8.cu new file mode 100644 index 000000000..4075057c2 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_f32tf32tf32tf32_64x96x8.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_f32tf32tf32tf32_64x96x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 96, 8, "MMA_64x96x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x128x32.cu new file mode 100644 index 000000000..677ad96f3 --- /dev/null +++ 
b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8s8s8_64x128x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 128, 32, "MMA_64x128x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x16x32.cu new file mode 100644 index 000000000..d602bada1 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8s8s8_64x16x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 16, 32, "MMA_64x16x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x192x32.cu new file mode 100644 index 000000000..1e0feafcc --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8s8s8_64x192x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 192, 32, "MMA_64x192x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x256x32.cu new file mode 100644 index 000000000..b8f5804c9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8s8s8_64x256x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 256, 32, "MMA_64x256x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x32x32.cu new file mode 100644 index 000000000..30d2350cf --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8s8s8_64x32x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 32, 32, "MMA_64x32x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x64x32.cu new file mode 100644 index 000000000..1326baa08 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8s8s8_64x64x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 64, 32, "MMA_64x64x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x8x32.cu new file mode 100644 index 000000000..5bd8ac69e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x8x32.cu @@ -0,0 +1,5 @@ +#include 
"MaxFlops_gmma_common.h" + +void run_int32s8s8s8_64x8x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 8, 32, "MMA_64x8x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x96x32.cu new file mode 100644 index 000000000..f93f633dd --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8s8s8_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8s8s8_64x96x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 96, 32, "MMA_64x96x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x128x32.cu new file mode 100644 index 000000000..7216c2c21 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8u8s8_64x128x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 128, 32, "MMA_64x128x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x16x32.cu new file mode 100644 index 000000000..510fa4217 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8u8s8_64x16x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 16, 32, "MMA_64x16x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x192x32.cu new file mode 100644 index 000000000..c0b8f07fe --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8u8s8_64x192x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 192, 32, "MMA_64x192x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x256x32.cu new file mode 100644 index 000000000..d3367abfd --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8u8s8_64x256x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 256, 32, "MMA_64x256x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x32x32.cu new file mode 100644 index 000000000..ff0820a5d --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8u8s8_64x32x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 32, 32, 
"MMA_64x32x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x64x32.cu new file mode 100644 index 000000000..dd3002ae4 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8u8s8_64x64x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 64, 32, "MMA_64x64x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x8x32.cu new file mode 100644 index 000000000..8ee29a58a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8u8s8_64x8x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 8, 32, "MMA_64x8x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x96x32.cu new file mode 100644 index 000000000..6277abec7 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32s8u8s8_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32s8u8s8_64x96x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 96, 32, "MMA_64x96x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x128x32.cu new file mode 100644 index 000000000..c919cd6c1 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8s8u8_64x128x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 128, 32, "MMA_64x128x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x16x32.cu new file mode 100644 index 000000000..d5bfd7df6 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8s8u8_64x16x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 16, 32, "MMA_64x16x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x192x32.cu new file mode 100644 index 000000000..b090c6db7 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8s8u8_64x192x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 192, 32, "MMA_64x192x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x256x32.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x256x32.cu new file mode 100644 index 000000000..ec46d78a6 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8s8u8_64x256x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 256, 32, "MMA_64x256x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x32x32.cu new file mode 100644 index 000000000..feb47af08 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8s8u8_64x32x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 32, 32, "MMA_64x32x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x64x32.cu new file mode 100644 index 000000000..f1a35ae92 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8s8u8_64x64x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 64, 32, "MMA_64x64x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x8x32.cu new file mode 100644 index 000000000..d98d4586f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8s8u8_64x8x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 8, 32, "MMA_64x8x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x96x32.cu new file mode 100644 index 000000000..3d43542f5 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8s8u8_64x96x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8s8u8_64x96x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 96, 32, "MMA_64x96x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x128x32.cu new file mode 100644 index 000000000..b98ecea1e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x128x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8u8u8_64x128x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 128, 32, "MMA_64x128x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x16x32.cu new file mode 100644 index 000000000..e33011e69 
--- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x16x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8u8u8_64x16x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 16, 32, "MMA_64x16x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x192x32.cu new file mode 100644 index 000000000..236bda022 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x192x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8u8u8_64x192x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 192, 32, "MMA_64x192x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x256x32.cu new file mode 100644 index 000000000..240050ccc --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x256x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8u8u8_64x256x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 256, 32, "MMA_64x256x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x32x32.cu new file mode 100644 index 000000000..afc3cf5b9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x32x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8u8u8_64x32x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 32, 32, "MMA_64x32x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x64x32.cu new file mode 100644 index 000000000..2243ef73b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x64x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8u8u8_64x64x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 64, 32, "MMA_64x64x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x8x32.cu new file mode 100644 index 000000000..a1440dd8d --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x8x32.cu @@ -0,0 +1,5 @@ +#include "MaxFlops_gmma_common.h" + +void run_int32u8u8u8_64x8x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 8, 32, "MMA_64x8x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x96x32.cu new file mode 100644 index 000000000..37ea24f85 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_gmma/kernels/MaxFlops_gmma_int32u8u8u8_64x96x32.cu @@ -0,0 +1,5 @@ +#include 
"MaxFlops_gmma_common.h" + +void run_int32u8u8u8_64x96x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 96, 32, "MMA_64x96x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu index 9627ccb98..c4263d153 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) fpu16_max_flops(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxIops_int32/MaxFlops_int32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/MaxIops_int32/MaxFlops_int32.cu index 2f7781b43..08fb250af 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/MaxIops_int32/MaxFlops_int32.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxIops_int32/MaxFlops_int32.cu @@ -5,5 +5,5 @@ int main(int argc, char* argv[]) { max_int32_flops(argc,argv); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu b/src/cuda/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu index b275901a4..4f46a710b 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/config_dpu/config_dpu.cu @@ -34,5 +34,5 @@ int main(int argc, char *argv[]) << std::endl; } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu b/src/cuda/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu index 36ba52a7c..82dfb88fa 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/config_fpu/config_fpu.cu @@ -33,5 +33,5 @@ int main(int argc, char *argv[]) << std::endl; } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/config_int/config_int.cu b/src/cuda/GPU_Microbenchmark/ubench/core/config_int/config_int.cu index 414f0a19b..e40bcad46 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/config_int/config_int.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/config_int/config_int.cu @@ -40,5 +40,5 @@ int main(int argc, char *argv[]) } } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu b/src/cuda/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu index 7a14a9d86..891aa2246 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/config_sfu/config_sfu.cu @@ -27,5 +27,5 @@ int main(int argc, char *argv[]) << std::endl; } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu b/src/cuda/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu index d572303d2..cdd2749d6 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/config_tensor/config_tensor.cu @@ -46,5 +46,5 @@ int main(int argc, char *argv[]) } } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu b/src/cuda/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu index 12bc844de..ffcce5794 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/config_udp/config_udp.cu @@ -29,5 +29,5 @@ int main(int argc, char *argv[]) } } - return 1; + return 0; } diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/core_config/core_config.cu b/src/cuda/GPU_Microbenchmark/ubench/core/core_config/core_config.cu index 95da83f96..71a643672 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/core_config/core_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/core_config/core_config.cu @@ -86,5 +86,5 @@ int main(int argc, char *argv[]) << std::endl; } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu index e10247061..fa822abc7 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_double/lat_double.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) dpu_latency(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu index e0e2e0fa0..89ffb19c9 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_float/lat_float.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) fpu_latency(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/Makefile b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/Makefile new file mode 100644 index 000000000..18baeea9c --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/Makefile @@ -0,0 +1,20 @@ +# Source files split for parallel compilation +# Use wildcard to automatically include all size-specific breakdown files +SRC = lat_gmma.cu $(wildcard kernels/lat_gmma_*.cu) + +EXE = lat_gmma + +# Add include path for CUTLASS +INCLUDE += -I$(GPUAPPS_ROOT)/src/cuda/cutlass-bench/include -I./ + +# GMMA is only supported in sm_90a +ARCH?=sm_90a +# Unset the CUDA_CPPFLAGS which is set based on CUDA version +CUDA_CPPFLAGS= +# Generate code for both sm_90a and compute_90a (SASS and PTX) +HOPPER_CUDA_CPPFLAGS=$(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) + +# CUTLASS cute library requires C++17 +NVCC_FLAGS := $(HOPPER_CUDA_CPPFLAGS) -std=c++17 + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x128x32.cu new file mode 100644 index 000000000..ec65d5eff --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e4m3_64x128x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 128, 32, "MMA_64x128x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x16x32.cu new file mode 100644 index 000000000..0da1c52bd --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e4m3_64x16x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 16, 32, "MMA_64x16x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x192x32.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x192x32.cu new file mode 100644 index 000000000..b227b645d --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e4m3_64x192x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 192, 32, "MMA_64x192x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x256x32.cu new file mode 100644 index 000000000..a885466d9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e4m3_64x256x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 256, 32, "MMA_64x256x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x32x32.cu new file mode 100644 index 000000000..cbb28cea9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e4m3_64x32x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 32, 32, "MMA_64x32x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x64x32.cu new file mode 100644 index 000000000..c9687a66c --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e4m3_64x64x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 64, 32, "MMA_64x64x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x8x32.cu new file mode 100644 index 000000000..9b445f92a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e4m3_64x8x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 8, 32, "MMA_64x8x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x96x32.cu new file mode 100644 index 000000000..fc4180513 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e4m3_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e4m3_64x96x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, half_t, 64, 96, 32, "MMA_64x96x32_F16E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x128x32.cu new file mode 100644 index 000000000..a02254201 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x128x32.cu @@ -0,0 +1,5 @@ +#include 
"lat_gmma_common.h" + +void run_f16e4m3e5m2_64x128x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 128, 32, "MMA_64x128x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x16x32.cu new file mode 100644 index 000000000..68b9546e7 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e5m2_64x16x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 16, 32, "MMA_64x16x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x192x32.cu new file mode 100644 index 000000000..c0f3993b2 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e5m2_64x192x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 192, 32, "MMA_64x192x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x256x32.cu new file mode 100644 index 000000000..297053fe7 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e5m2_64x256x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 256, 32, "MMA_64x256x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x32x32.cu new file mode 100644 index 000000000..0126fcc9a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e5m2_64x32x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 32, 32, "MMA_64x32x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x64x32.cu new file mode 100644 index 000000000..744a42dfb --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e5m2_64x64x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 64, 32, "MMA_64x64x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x8x32.cu new file mode 100644 index 000000000..bfca52f2a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e5m2_64x8x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 8, 32, "MMA_64x8x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x96x32.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x96x32.cu new file mode 100644 index 000000000..d4a580d6f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e4m3e5m2_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e4m3e5m2_64x96x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, half_t, 64, 96, 32, "MMA_64x96x32_F16E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x128x32.cu new file mode 100644 index 000000000..9319c9ee6 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e4m3_64x128x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 128, 32, "MMA_64x128x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x16x32.cu new file mode 100644 index 000000000..a1602c766 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e4m3_64x16x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 16, 32, "MMA_64x16x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x192x32.cu new file mode 100644 index 000000000..2db60e978 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e4m3_64x192x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 192, 32, "MMA_64x192x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x256x32.cu new file mode 100644 index 000000000..22e2df853 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e4m3_64x256x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 256, 32, "MMA_64x256x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x32x32.cu new file mode 100644 index 000000000..ec0fdbdef --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e4m3_64x32x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 32, 32, "MMA_64x32x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x64x32.cu new file mode 100644 index 000000000..1101feb0f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x64x32.cu @@ -0,0 +1,5 @@ 
+#include "lat_gmma_common.h" + +void run_f16e5m2e4m3_64x64x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 64, 32, "MMA_64x64x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x8x32.cu new file mode 100644 index 000000000..755c81a33 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e4m3_64x8x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 8, 32, "MMA_64x8x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x96x32.cu new file mode 100644 index 000000000..350d8f5c4 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e4m3_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e4m3_64x96x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, half_t, 64, 96, 32, "MMA_64x96x32_F16E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x128x32.cu new file mode 100644 index 000000000..d466a9795 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e5m2_64x128x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 128, 32, "MMA_64x128x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x16x32.cu new file mode 100644 index 000000000..44d9835ad --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e5m2_64x16x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 16, 32, "MMA_64x16x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x192x32.cu new file mode 100644 index 000000000..31db9558a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e5m2_64x192x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 192, 32, "MMA_64x192x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x256x32.cu new file mode 100644 index 000000000..3ddacc999 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e5m2_64x256x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 256, 32, "MMA_64x256x32_F16E5M2E5M2_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x32x32.cu new file mode 100644 index 000000000..25d80681d --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e5m2_64x32x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 32, 32, "MMA_64x32x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x64x32.cu new file mode 100644 index 000000000..696c59694 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e5m2_64x64x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 64, 32, "MMA_64x64x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x8x32.cu new file mode 100644 index 000000000..1c2c62651 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e5m2_64x8x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 8, 32, "MMA_64x8x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x96x32.cu new file mode 100644 index 000000000..c4b93b17f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16e5m2e5m2_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16e5m2e5m2_64x96x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, half_t, 64, 96, 32, "MMA_64x96x32_F16E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x128x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x128x16.cu new file mode 100644 index 000000000..38d284502 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x128x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16f16f16_64x128x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 128, 16, "MMA_64x128x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x16x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x16x16.cu new file mode 100644 index 000000000..40e841ad0 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x16x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16f16f16_64x16x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 16, 16, "MMA_64x16x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x192x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x192x16.cu new file mode 100644 index 000000000..2200355a9 --- /dev/null +++ 
b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x192x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16f16f16_64x192x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 192, 16, "MMA_64x192x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x256x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x256x16.cu new file mode 100644 index 000000000..057ee4222 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x256x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16f16f16_64x256x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 256, 16, "MMA_64x256x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x32x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x32x16.cu new file mode 100644 index 000000000..252e4cde7 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x32x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16f16f16_64x32x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 32, 16, "MMA_64x32x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x64x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x64x16.cu new file mode 100644 index 000000000..d3d8715bd --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x64x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16f16f16_64x64x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 64, 16, "MMA_64x64x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x8x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x8x16.cu new file mode 100644 index 000000000..df7d161ba --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x8x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16f16f16_64x8x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 8, 16, "MMA_64x8x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x96x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x96x16.cu new file mode 100644 index 000000000..92dc99dc5 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f16f16f16_64x96x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f16f16f16_64x96x16_test() { + TEST_MMA_CONFIG(half_t, half_t, half_t, 64, 96, 16, "MMA_64x96x16_F16F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x128x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x128x16.cu new file mode 100644 index 000000000..58c068112 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x128x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32bf16bf16_64x128x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 128, 16, "MMA_64x128x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x16x16.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x16x16.cu new file mode 100644 index 000000000..63b5adc61 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x16x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32bf16bf16_64x16x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 16, 16, "MMA_64x16x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x192x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x192x16.cu new file mode 100644 index 000000000..1275b3a16 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x192x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32bf16bf16_64x192x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 192, 16, "MMA_64x192x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x256x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x256x16.cu new file mode 100644 index 000000000..e3aabaa75 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x256x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32bf16bf16_64x256x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 256, 16, "MMA_64x256x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x32x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x32x16.cu new file mode 100644 index 000000000..0d9e07f78 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x32x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32bf16bf16_64x32x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 32, 16, "MMA_64x32x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x64x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x64x16.cu new file mode 100644 index 000000000..14f6fcd8c --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x64x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32bf16bf16_64x64x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 64, 16, "MMA_64x64x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x8x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x8x16.cu new file mode 100644 index 000000000..1b0d0e6d2 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x8x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32bf16bf16_64x8x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 8, 16, "MMA_64x8x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x96x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x96x16.cu new file mode 100644 index 000000000..62a4331de --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32bf16bf16_64x96x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void 
run_f32bf16bf16_64x96x16_test() { + TEST_MMA_CONFIG(bfloat16_t, bfloat16_t, float, 64, 96, 16, "MMA_64x96x16_F32BF16BF16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x128x32.cu new file mode 100644 index 000000000..71ea82971 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x128x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 128, 32, "MMA_64x128x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x16x32.cu new file mode 100644 index 000000000..6ef3e404c --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x16x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 16, 32, "MMA_64x16x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x192x32.cu new file mode 100644 index 000000000..ad1cd2296 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x192x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 192, 32, "MMA_64x192x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x256x32.cu new file mode 100644 index 000000000..000a40d72 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x256x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 256, 32, "MMA_64x256x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x32x32.cu new file mode 100644 index 000000000..522366660 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x32x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 32, 32, "MMA_64x32x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x64x32.cu new file mode 100644 index 000000000..1394b2a40 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x64x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 64, 32, "MMA_64x64x32_F32E4M3E4M3_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x8x32.cu new file mode 100644 index 000000000..f39d1456c --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x8x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 8, 32, "MMA_64x8x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x96x32.cu new file mode 100644 index 000000000..fe7242b99 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e4m3e4m3_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e4m3e4m3_64x96x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e4m3_t, float, 64, 96, 32, "MMA_64x96x32_F32E4M3E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x128x32.cu new file mode 100644 index 000000000..dc46a76b6 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x128x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 128, 32, "MMA_64x128x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x16x32.cu new file mode 100644 index 000000000..14c139710 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x16x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 16, 32, "MMA_64x16x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x192x32.cu new file mode 100644 index 000000000..3678c5414 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x192x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 192, 32, "MMA_64x192x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x256x32.cu new file mode 100644 index 000000000..39f479742 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x256x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 256, 32, "MMA_64x256x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x32x32.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x32x32.cu new file mode 100644 index 000000000..7607963c4 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x32x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 32, 32, "MMA_64x32x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x64x32.cu new file mode 100644 index 000000000..e20b25a04 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x64x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 64, 32, "MMA_64x64x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x8x32.cu new file mode 100644 index 000000000..8b2f9c8d6 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x8x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 8, 32, "MMA_64x8x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x96x32.cu new file mode 100644 index 000000000..a8fb10753 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e4m3e5m2e4m3_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e4m3e5m2e4m3_64x96x32_test() { + TEST_MMA_CONFIG(float_e4m3_t, float_e5m2_t, float, 64, 96, 32, "MMA_64x96x32_F32E4M3E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x128x32.cu new file mode 100644 index 000000000..279aa4d37 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x128x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 128, 32, "MMA_64x128x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x16x32.cu new file mode 100644 index 000000000..0a5aaf41a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x16x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 16, 32, "MMA_64x16x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x192x32.cu new file mode 100644 index 000000000..5eac44263 --- /dev/null +++ 
b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x192x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 192, 32, "MMA_64x192x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x256x32.cu new file mode 100644 index 000000000..47bc6a3ef --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x256x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 256, 32, "MMA_64x256x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x32x32.cu new file mode 100644 index 000000000..4f0d5856a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x32x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 32, 32, "MMA_64x32x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x64x32.cu new file mode 100644 index 000000000..d8326f3ac --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x64x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 64, 32, "MMA_64x64x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x8x32.cu new file mode 100644 index 000000000..fbec6e6f1 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x8x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 8, 32, "MMA_64x8x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x96x32.cu new file mode 100644 index 000000000..1ab9ef53d --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e4m3e5m2_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e4m3e5m2_64x96x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e4m3_t, float, 64, 96, 32, "MMA_64x96x32_F32E5M2E4M3_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x128x32.cu new file mode 100644 index 000000000..b41d328e1 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x128x32_test() 
{ + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 128, 32, "MMA_64x128x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x16x32.cu new file mode 100644 index 000000000..414ea343e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x16x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 16, 32, "MMA_64x16x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x192x32.cu new file mode 100644 index 000000000..53d23cdc9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x192x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 192, 32, "MMA_64x192x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x256x32.cu new file mode 100644 index 000000000..7055d9edc --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x256x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 256, 32, "MMA_64x256x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x32x32.cu new file mode 100644 index 000000000..b34f08491 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x32x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 32, 32, "MMA_64x32x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x64x32.cu new file mode 100644 index 000000000..0f1c26687 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x64x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 64, 32, "MMA_64x64x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x8x32.cu new file mode 100644 index 000000000..1450e5f7e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x8x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 8, 32, "MMA_64x8x32_F32E5M2E5M2_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x96x32.cu new file mode 100644 index 000000000..6077058b7 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32e5m2e5m2e5m2_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32e5m2e5m2e5m2_64x96x32_test() { + TEST_MMA_CONFIG(float_e5m2_t, float_e5m2_t, float, 64, 96, 32, "MMA_64x96x32_F32E5M2E5M2_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x128x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x128x16.cu new file mode 100644 index 000000000..9eaa49ab0 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x128x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32f16f16_64x128x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 128, 16, "MMA_64x128x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x16x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x16x16.cu new file mode 100644 index 000000000..183878140 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x16x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32f16f16_64x16x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 16, 16, "MMA_64x16x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x192x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x192x16.cu new file mode 100644 index 000000000..ea30da004 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x192x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32f16f16_64x192x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 192, 16, "MMA_64x192x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x256x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x256x16.cu new file mode 100644 index 000000000..5e82afe2b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x256x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32f16f16_64x256x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 256, 16, "MMA_64x256x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x32x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x32x16.cu new file mode 100644 index 000000000..24f208beb --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x32x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32f16f16_64x32x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 32, 16, "MMA_64x32x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x64x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x64x16.cu new file mode 100644 index 000000000..68a4b96cb --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x64x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" 
+ +void run_f32f16f16_64x64x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 64, 16, "MMA_64x64x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x8x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x8x16.cu new file mode 100644 index 000000000..2f3dab3a4 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x8x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32f16f16_64x8x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 8, 16, "MMA_64x8x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x96x16.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x96x16.cu new file mode 100644 index 000000000..2b2e8722e --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32f16f16_64x96x16.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32f16f16_64x96x16_test() { + TEST_MMA_CONFIG(half_t, half_t, float, 64, 96, 16, "MMA_64x96x16_F32F16F16_SS"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x128x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x128x8.cu new file mode 100644 index 000000000..158088161 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x128x8.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32tf32tf32tf32_64x128x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 128, 8, "MMA_64x128x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x16x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x16x8.cu new file mode 100644 index 000000000..fe1492be2 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x16x8.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32tf32tf32tf32_64x16x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 16, 8, "MMA_64x16x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x192x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x192x8.cu new file mode 100644 index 000000000..463e9905b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x192x8.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32tf32tf32tf32_64x192x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 192, 8, "MMA_64x192x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x256x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x256x8.cu new file mode 100644 index 000000000..53d323884 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x256x8.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32tf32tf32tf32_64x256x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 256, 8, "MMA_64x256x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x32x8.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x32x8.cu new file mode 100644 index 000000000..a59c41861 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x32x8.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32tf32tf32tf32_64x32x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 32, 8, "MMA_64x32x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x64x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x64x8.cu new file mode 100644 index 000000000..ab5d032ca --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x64x8.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32tf32tf32tf32_64x64x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 64, 8, "MMA_64x64x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x8x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x8x8.cu new file mode 100644 index 000000000..97eb9b4b6 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x8x8.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32tf32tf32tf32_64x8x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 8, 8, "MMA_64x8x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x96x8.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x96x8.cu new file mode 100644 index 000000000..fca3734c2 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_f32tf32tf32tf32_64x96x8.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_f32tf32tf32tf32_64x96x8_test() { + TEST_MMA_CONFIG(tfloat32_t, tfloat32_t, float, 64, 96, 8, "MMA_64x96x8_F32TF32TF32_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x128x32.cu new file mode 100644 index 000000000..c41a660a4 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8s8s8_64x128x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 128, 32, "MMA_64x128x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x16x32.cu new file mode 100644 index 000000000..9c3895b45 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8s8s8_64x16x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 16, 32, "MMA_64x16x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x192x32.cu new file mode 100644 index 000000000..f4b60f169 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x192x32.cu @@ -0,0 +1,5 @@ +#include 
"lat_gmma_common.h" + +void run_int32s8s8s8_64x192x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 192, 32, "MMA_64x192x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x256x32.cu new file mode 100644 index 000000000..c6522cb86 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8s8s8_64x256x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 256, 32, "MMA_64x256x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x32x32.cu new file mode 100644 index 000000000..468abc59b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8s8s8_64x32x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 32, 32, "MMA_64x32x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x64x32.cu new file mode 100644 index 000000000..9786b4c45 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8s8s8_64x64x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 64, 32, "MMA_64x64x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x8x32.cu new file mode 100644 index 000000000..823375478 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8s8s8_64x8x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 8, 32, "MMA_64x8x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x96x32.cu new file mode 100644 index 000000000..0104423e3 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8s8s8_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8s8s8_64x96x32_test() { + TEST_MMA_CONFIG(int8_t, int8_t, int32_t, 64, 96, 32, "MMA_64x96x32_S32S8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x128x32.cu new file mode 100644 index 000000000..6d8210349 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8u8s8_64x128x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 128, 32, "MMA_64x128x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x16x32.cu new file mode 
100644 index 000000000..8622cacfb --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8u8s8_64x16x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 16, 32, "MMA_64x16x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x192x32.cu new file mode 100644 index 000000000..a4feb009a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8u8s8_64x192x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 192, 32, "MMA_64x192x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x256x32.cu new file mode 100644 index 000000000..e9d817de8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8u8s8_64x256x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 256, 32, "MMA_64x256x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x32x32.cu new file mode 100644 index 000000000..65ff0d122 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8u8s8_64x32x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 32, 32, "MMA_64x32x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x64x32.cu new file mode 100644 index 000000000..48938cac9 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8u8s8_64x64x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 64, 32, "MMA_64x64x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x8x32.cu new file mode 100644 index 000000000..55d601042 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8u8s8_64x8x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 8, 32, "MMA_64x8x32_S32S8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x96x32.cu new file mode 100644 index 000000000..38925b524 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32s8u8s8_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32s8u8s8_64x96x32_test() { + TEST_MMA_CONFIG(int8_t, uint8_t, int32_t, 64, 96, 32, "MMA_64x96x32_S32S8U8_SS_TN"); +} diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x128x32.cu new file mode 100644 index 000000000..70d09f174 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8s8u8_64x128x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 128, 32, "MMA_64x128x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x16x32.cu new file mode 100644 index 000000000..f260d2d33 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8s8u8_64x16x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 16, 32, "MMA_64x16x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x192x32.cu new file mode 100644 index 000000000..93f12eede --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8s8u8_64x192x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 192, 32, "MMA_64x192x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x256x32.cu new file mode 100644 index 000000000..e25b2a8b8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8s8u8_64x256x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 256, 32, "MMA_64x256x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x32x32.cu new file mode 100644 index 000000000..81329112b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8s8u8_64x32x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 32, 32, "MMA_64x32x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x64x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x64x32.cu new file mode 100644 index 000000000..7ad6c9175 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8s8u8_64x64x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 64, 32, "MMA_64x64x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x8x32.cu new file mode 100644 index 000000000..e6e4fc35a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x8x32.cu @@ -0,0 
+1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8s8u8_64x8x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 8, 32, "MMA_64x8x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x96x32.cu new file mode 100644 index 000000000..c35ec92ce --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8s8u8_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8s8u8_64x96x32_test() { + TEST_MMA_CONFIG(uint8_t, int8_t, int32_t, 64, 96, 32, "MMA_64x96x32_S32U8S8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x128x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x128x32.cu new file mode 100644 index 000000000..326a3f4dd --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x128x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8u8u8_64x128x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 128, 32, "MMA_64x128x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x16x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x16x32.cu new file mode 100644 index 000000000..46192dee8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x16x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8u8u8_64x16x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 16, 32, "MMA_64x16x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x192x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x192x32.cu new file mode 100644 index 000000000..d26787621 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x192x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8u8u8_64x192x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 192, 32, "MMA_64x192x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x256x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x256x32.cu new file mode 100644 index 000000000..9a5962ddd --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x256x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8u8u8_64x256x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 256, 32, "MMA_64x256x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x32x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x32x32.cu new file mode 100644 index 000000000..bbca73ed8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x32x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8u8u8_64x32x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 32, 32, "MMA_64x32x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x64x32.cu 
b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x64x32.cu new file mode 100644 index 000000000..9914d67a8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x64x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8u8u8_64x64x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 64, 32, "MMA_64x64x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x8x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x8x32.cu new file mode 100644 index 000000000..f2044b708 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x8x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8u8u8_64x8x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 8, 32, "MMA_64x8x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x96x32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x96x32.cu new file mode 100644 index 000000000..d0d84c06b --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/kernels/lat_gmma_int32u8u8u8_64x96x32.cu @@ -0,0 +1,5 @@ +#include "lat_gmma_common.h" + +void run_int32u8u8u8_64x96x32_test() { + TEST_MMA_CONFIG(uint8_t, uint8_t, int32_t, 64, 96, 32, "MMA_64x96x32_S32U8U8_SS_TN"); +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma.cu new file mode 100644 index 000000000..109de0bef --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma.cu @@ -0,0 +1,13 @@ +#include +#include "lat_gmma.h" +#include "../../../hw_def/hw_def.h" + +int main(int argc, char *argv[]) +{ + intilizeDeviceProp(0, argc, argv); + + // Run comprehensive sweep over all valid MMA operations + run_all_wgmma_latency_tests(); + + return 0; +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma.h b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma.h new file mode 100644 index 000000000..ccb2ca7d7 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma.h @@ -0,0 +1,385 @@ +/*************************************************************************************************** + * GMMA Latency Microbenchmark for SM90 (Hopper Architecture) + * + * This file provides a comprehensive sweep over all valid WGMMA (Warpgroup Matrix Multiply- + * Accumulate) operations supported by the NVIDIA Hopper architecture (SM90). + * + * The sweep covers: + * + * 1. F32 Accumulator (ElementC = float): + * - FP16 x FP16 -> FP32 + * - BF16 x BF16 -> FP32 + * - TF32 x TF32 -> FP32 + * - E4M3 x E4M3 -> FP32 + * - E4M3 x E5M2 -> FP32 + * - E5M2 x E4M3 -> FP32 + * - E5M2 x E5M2 -> FP32 + * + * 2. F16 Accumulator (ElementC = half_t): + * - FP16 x FP16 -> FP16 + * - E4M3 x E4M3 -> FP16 + * - E4M3 x E5M2 -> FP16 + * - E5M2 x E4M3 -> FP16 + * - E5M2 x E5M2 -> FP16 + * + * 3. INT32 Accumulator (ElementC = int32_t): + * - INT8 x INT8 -> INT32 + * - INT8 x UINT8 -> INT32 + * - UINT8 x INT8 -> INT32 + * - UINT8 x UINT8 -> INT32 + * + * For each data type combination, the sweep tests multiple tile shapes: + * - 64x8, 64x16, 64x32, 64x64, 64x96, 64x128, 64x192, 64x256 + * + * Usage: + * Call run_all_wgmma_latency_tests() to run the comprehensive sweep. + * + * NOTE: This file has been refactored for parallel compilation. 
Test implementations + * are split across lat_gmma_f32.cu, lat_gmma_f16.cu, and lat_gmma_int32.cu. + * + **************************************************************************************************/ + +#ifndef LAT_GMMA_DEF_H +#define LAT_GMMA_DEF_H + +#include +#include +#include +#include + + +// Function declarations for test suites +// These are defined in separate .cu files for parallel compilation + +// F32 accumulator tests - TF32 x TF32 -> F32 +void run_f32tf32tf32tf32_64x8x8_test(); +void run_f32tf32tf32tf32_64x16x8_test(); +void run_f32tf32tf32tf32_64x32x8_test(); +void run_f32tf32tf32tf32_64x64x8_test(); +void run_f32tf32tf32tf32_64x96x8_test(); +void run_f32tf32tf32tf32_64x128x8_test(); +void run_f32tf32tf32tf32_64x192x8_test(); +void run_f32tf32tf32tf32_64x256x8_test(); + +// F32 accumulator tests - E4M3 x E4M3 -> F32 +void run_f32e4m3e4m3e4m3_64x8x32_test(); +void run_f32e4m3e4m3e4m3_64x16x32_test(); +void run_f32e4m3e4m3e4m3_64x32x32_test(); +void run_f32e4m3e4m3e4m3_64x64x32_test(); +void run_f32e4m3e4m3e4m3_64x96x32_test(); +void run_f32e4m3e4m3e4m3_64x128x32_test(); +void run_f32e4m3e4m3e4m3_64x192x32_test(); +void run_f32e4m3e4m3e4m3_64x256x32_test(); + +// F32 accumulator tests - E4M3 x E5M2 -> F32 +void run_f32e4m3e5m2e4m3_64x8x32_test(); +void run_f32e4m3e5m2e4m3_64x16x32_test(); +void run_f32e4m3e5m2e4m3_64x32x32_test(); +void run_f32e4m3e5m2e4m3_64x64x32_test(); +void run_f32e4m3e5m2e4m3_64x96x32_test(); +void run_f32e4m3e5m2e4m3_64x128x32_test(); +void run_f32e4m3e5m2e4m3_64x192x32_test(); +void run_f32e4m3e5m2e4m3_64x256x32_test(); + +// F32 accumulator tests - E5M2 x E4M3 -> F32 +void run_f32e5m2e4m3e5m2_64x8x32_test(); +void run_f32e5m2e4m3e5m2_64x16x32_test(); +void run_f32e5m2e4m3e5m2_64x32x32_test(); +void run_f32e5m2e4m3e5m2_64x64x32_test(); +void run_f32e5m2e4m3e5m2_64x96x32_test(); +void run_f32e5m2e4m3e5m2_64x128x32_test(); +void run_f32e5m2e4m3e5m2_64x192x32_test(); +void run_f32e5m2e4m3e5m2_64x256x32_test(); + +// F32 accumulator tests - E5M2 x E5M2 -> F32 +void run_f32e5m2e5m2e5m2_64x8x32_test(); +void run_f32e5m2e5m2e5m2_64x16x32_test(); +void run_f32e5m2e5m2e5m2_64x32x32_test(); +void run_f32e5m2e5m2e5m2_64x64x32_test(); +void run_f32e5m2e5m2e5m2_64x96x32_test(); +void run_f32e5m2e5m2e5m2_64x128x32_test(); +void run_f32e5m2e5m2e5m2_64x192x32_test(); +void run_f32e5m2e5m2e5m2_64x256x32_test(); + +// INT32 accumulator tests - INT8 x INT8 -> INT32 +void run_int32s8s8s8_64x8x32_test(); +void run_int32s8s8s8_64x16x32_test(); +void run_int32s8s8s8_64x32x32_test(); +void run_int32s8s8s8_64x64x32_test(); +void run_int32s8s8s8_64x96x32_test(); +void run_int32s8s8s8_64x128x32_test(); +void run_int32s8s8s8_64x192x32_test(); +void run_int32s8s8s8_64x256x32_test(); + +// INT32 accumulator tests - INT8 x UINT8 -> INT32 +void run_int32s8u8s8_64x8x32_test(); +void run_int32s8u8s8_64x16x32_test(); +void run_int32s8u8s8_64x32x32_test(); +void run_int32s8u8s8_64x64x32_test(); +void run_int32s8u8s8_64x96x32_test(); +void run_int32s8u8s8_64x128x32_test(); +void run_int32s8u8s8_64x192x32_test(); +void run_int32s8u8s8_64x256x32_test(); + +// INT32 accumulator tests - UINT8 x INT8 -> INT32 +void run_int32u8s8u8_64x8x32_test(); +void run_int32u8s8u8_64x16x32_test(); +void run_int32u8s8u8_64x32x32_test(); +void run_int32u8s8u8_64x64x32_test(); +void run_int32u8s8u8_64x96x32_test(); +void run_int32u8s8u8_64x128x32_test(); +void run_int32u8s8u8_64x192x32_test(); +void run_int32u8s8u8_64x256x32_test(); + +// INT32 accumulator tests - UINT8 x UINT8 -> INT32 +void 
run_int32u8u8u8_64x8x32_test(); +void run_int32u8u8u8_64x16x32_test(); +void run_int32u8u8u8_64x32x32_test(); +void run_int32u8u8u8_64x64x32_test(); +void run_int32u8u8u8_64x96x32_test(); +void run_int32u8u8u8_64x128x32_test(); +void run_int32u8u8u8_64x192x32_test(); +void run_int32u8u8u8_64x256x32_test(); + +// F16 accumulator tests (defined in lat_gmma_f16.cu) +// F32 accumulator tests - FP16 x FP16 -> F32 +void run_f32f16f16_64x8x16_test(); +void run_f32f16f16_64x16x16_test(); +void run_f32f16f16_64x32x16_test(); +void run_f32f16f16_64x64x16_test(); +void run_f32f16f16_64x96x16_test(); +void run_f32f16f16_64x128x16_test(); +void run_f32f16f16_64x192x16_test(); +void run_f32f16f16_64x256x16_test(); + +// F32 accumulator tests - BF16 x BF16 -> F32 +void run_f32bf16bf16_64x8x16_test(); +void run_f32bf16bf16_64x16x16_test(); +void run_f32bf16bf16_64x32x16_test(); +void run_f32bf16bf16_64x64x16_test(); +void run_f32bf16bf16_64x96x16_test(); +void run_f32bf16bf16_64x128x16_test(); +void run_f32bf16bf16_64x192x16_test(); +void run_f32bf16bf16_64x256x16_test(); + +// F16 accumulator tests - FP16 x FP16 -> F16 +void run_f16f16f16_64x8x16_test(); +void run_f16f16f16_64x16x16_test(); +void run_f16f16f16_64x32x16_test(); +void run_f16f16f16_64x64x16_test(); +void run_f16f16f16_64x96x16_test(); +void run_f16f16f16_64x128x16_test(); +void run_f16f16f16_64x192x16_test(); +void run_f16f16f16_64x256x16_test(); + +// F16 accumulator tests - E4M3 x E4M3 -> F16 +void run_f16e4m3e4m3_64x8x32_test(); +void run_f16e4m3e4m3_64x16x32_test(); +void run_f16e4m3e4m3_64x32x32_test(); +void run_f16e4m3e4m3_64x64x32_test(); +void run_f16e4m3e4m3_64x96x32_test(); +void run_f16e4m3e4m3_64x128x32_test(); +void run_f16e4m3e4m3_64x192x32_test(); +void run_f16e4m3e4m3_64x256x32_test(); + +// F16 accumulator tests - E4M3 x E5M2 -> F16 +void run_f16e4m3e5m2_64x8x32_test(); +void run_f16e4m3e5m2_64x16x32_test(); +void run_f16e4m3e5m2_64x32x32_test(); +void run_f16e4m3e5m2_64x64x32_test(); +void run_f16e4m3e5m2_64x96x32_test(); +void run_f16e4m3e5m2_64x128x32_test(); +void run_f16e4m3e5m2_64x192x32_test(); +void run_f16e4m3e5m2_64x256x32_test(); + +// F16 accumulator tests - E5M2 x E4M3 -> F16 +void run_f16e5m2e4m3_64x8x32_test(); +void run_f16e5m2e4m3_64x16x32_test(); +void run_f16e5m2e4m3_64x32x32_test(); +void run_f16e5m2e4m3_64x64x32_test(); +void run_f16e5m2e4m3_64x96x32_test(); +void run_f16e5m2e4m3_64x128x32_test(); +void run_f16e5m2e4m3_64x192x32_test(); +void run_f16e5m2e4m3_64x256x32_test(); + +// F16 accumulator tests - E5M2 x E5M2 -> F16 +void run_f16e5m2e5m2_64x8x32_test(); +void run_f16e5m2e5m2_64x16x32_test(); +void run_f16e5m2e5m2_64x32x32_test(); +void run_f16e5m2e5m2_64x64x32_test(); +void run_f16e5m2e5m2_64x96x32_test(); +void run_f16e5m2e5m2_64x128x32_test(); +void run_f16e5m2e5m2_64x192x32_test(); +void run_f16e5m2e5m2_64x256x32_test(); + +void run_f16accumulator_tests() { + run_f16f16f16_64x8x16_test(); + run_f16f16f16_64x16x16_test(); + run_f16f16f16_64x32x16_test(); + run_f16f16f16_64x64x16_test(); + run_f16f16f16_64x96x16_test(); + run_f16f16f16_64x128x16_test(); + run_f16f16f16_64x192x16_test(); + run_f16f16f16_64x256x16_test(); + run_f16e4m3e4m3_64x8x32_test(); + run_f16e4m3e4m3_64x16x32_test(); + run_f16e4m3e4m3_64x32x32_test(); + run_f16e4m3e4m3_64x64x32_test(); + run_f16e4m3e4m3_64x96x32_test(); + run_f16e4m3e4m3_64x128x32_test(); + run_f16e4m3e4m3_64x192x32_test(); + run_f16e4m3e4m3_64x256x32_test(); + run_f16e4m3e5m2_64x8x32_test(); + run_f16e4m3e5m2_64x16x32_test(); + 
run_f16e4m3e5m2_64x32x32_test(); + run_f16e4m3e5m2_64x64x32_test(); + run_f16e4m3e5m2_64x96x32_test(); + run_f16e4m3e5m2_64x128x32_test(); + run_f16e4m3e5m2_64x192x32_test(); + run_f16e4m3e5m2_64x256x32_test(); + run_f16e5m2e4m3_64x8x32_test(); + run_f16e5m2e4m3_64x16x32_test(); + run_f16e5m2e4m3_64x32x32_test(); + run_f16e5m2e4m3_64x64x32_test(); + run_f16e5m2e4m3_64x96x32_test(); + run_f16e5m2e4m3_64x128x32_test(); + run_f16e5m2e4m3_64x192x32_test(); + run_f16e5m2e4m3_64x256x32_test(); + run_f16e5m2e5m2_64x8x32_test(); + run_f16e5m2e5m2_64x16x32_test(); + run_f16e5m2e5m2_64x32x32_test(); + run_f16e5m2e5m2_64x64x32_test(); + run_f16e5m2e5m2_64x96x32_test(); + run_f16e5m2e5m2_64x128x32_test(); + run_f16e5m2e5m2_64x192x32_test(); + run_f16e5m2e5m2_64x256x32_test(); +} + +void run_f32accumulator_tests() { + run_f32tf32tf32tf32_64x8x8_test(); + run_f32tf32tf32tf32_64x16x8_test(); + run_f32tf32tf32tf32_64x32x8_test(); + run_f32tf32tf32tf32_64x64x8_test(); + run_f32tf32tf32tf32_64x96x8_test(); + run_f32tf32tf32tf32_64x128x8_test(); + run_f32tf32tf32tf32_64x192x8_test(); + run_f32tf32tf32tf32_64x256x8_test(); + run_f32f16f16_64x8x16_test(); + run_f32f16f16_64x16x16_test(); + run_f32f16f16_64x32x16_test(); + run_f32f16f16_64x64x16_test(); + run_f32f16f16_64x96x16_test(); + run_f32f16f16_64x128x16_test(); + run_f32f16f16_64x192x16_test(); + run_f32f16f16_64x256x16_test(); + run_f32bf16bf16_64x8x16_test(); + run_f32bf16bf16_64x16x16_test(); + run_f32bf16bf16_64x32x16_test(); + run_f32bf16bf16_64x64x16_test(); + run_f32bf16bf16_64x96x16_test(); + run_f32bf16bf16_64x128x16_test(); + run_f32bf16bf16_64x192x16_test(); + run_f32bf16bf16_64x256x16_test(); + run_f32e4m3e4m3e4m3_64x8x32_test(); + run_f32e4m3e4m3e4m3_64x16x32_test(); + run_f32e4m3e4m3e4m3_64x32x32_test(); + run_f32e4m3e4m3e4m3_64x64x32_test(); + run_f32e4m3e4m3e4m3_64x96x32_test(); + run_f32e4m3e4m3e4m3_64x128x32_test(); + run_f32e4m3e4m3e4m3_64x192x32_test(); + run_f32e4m3e4m3e4m3_64x256x32_test(); + run_f32e4m3e5m2e4m3_64x8x32_test(); + run_f32e4m3e5m2e4m3_64x16x32_test(); + run_f32e4m3e5m2e4m3_64x32x32_test(); + run_f32e4m3e5m2e4m3_64x64x32_test(); + run_f32e4m3e5m2e4m3_64x96x32_test(); + run_f32e4m3e5m2e4m3_64x128x32_test(); + run_f32e4m3e5m2e4m3_64x192x32_test(); + run_f32e4m3e5m2e4m3_64x256x32_test(); + run_f32e5m2e4m3e5m2_64x8x32_test(); + run_f32e5m2e4m3e5m2_64x16x32_test(); + run_f32e5m2e4m3e5m2_64x32x32_test(); + run_f32e5m2e4m3e5m2_64x64x32_test(); + run_f32e5m2e4m3e5m2_64x96x32_test(); + run_f32e5m2e4m3e5m2_64x128x32_test(); + run_f32e5m2e4m3e5m2_64x192x32_test(); + run_f32e5m2e4m3e5m2_64x256x32_test(); + run_f32e5m2e5m2e5m2_64x8x32_test(); + run_f32e5m2e5m2e5m2_64x16x32_test(); + run_f32e5m2e5m2e5m2_64x32x32_test(); + run_f32e5m2e5m2e5m2_64x64x32_test(); + run_f32e5m2e5m2e5m2_64x96x32_test(); + run_f32e5m2e5m2e5m2_64x128x32_test(); + run_f32e5m2e5m2e5m2_64x192x32_test(); + run_f32e5m2e5m2e5m2_64x256x32_test(); +} + +void run_int32accumulator_tests() { + run_int32s8s8s8_64x8x32_test(); + run_int32s8s8s8_64x16x32_test(); + run_int32s8s8s8_64x32x32_test(); + run_int32s8s8s8_64x64x32_test(); + run_int32s8s8s8_64x96x32_test(); + run_int32s8s8s8_64x128x32_test(); + run_int32s8s8s8_64x192x32_test(); + run_int32s8s8s8_64x256x32_test(); + run_int32s8u8s8_64x8x32_test(); + run_int32s8u8s8_64x16x32_test(); + run_int32s8u8s8_64x32x32_test(); + run_int32s8u8s8_64x64x32_test(); + run_int32s8u8s8_64x96x32_test(); + run_int32s8u8s8_64x128x32_test(); + run_int32s8u8s8_64x192x32_test(); + run_int32s8u8s8_64x256x32_test(); + 
run_int32u8s8u8_64x8x32_test(); + run_int32u8s8u8_64x16x32_test(); + run_int32u8s8u8_64x32x32_test(); + run_int32u8s8u8_64x64x32_test(); + run_int32u8s8u8_64x96x32_test(); + run_int32u8s8u8_64x128x32_test(); + run_int32u8s8u8_64x192x32_test(); + run_int32u8s8u8_64x256x32_test(); + run_int32u8u8u8_64x8x32_test(); + run_int32u8u8u8_64x16x32_test(); + run_int32u8u8u8_64x32x32_test(); + run_int32u8u8u8_64x64x32_test(); + run_int32u8u8u8_64x96x32_test(); + run_int32u8u8u8_64x128x32_test(); + run_int32u8u8u8_64x192x32_test(); + run_int32u8u8u8_64x256x32_test(); +} + +// ============================================================================ +// Main Test Function - Run All Configurations +// ============================================================================ + +inline void run_all_wgmma_latency_tests() { + printf("\n"); + printf("================================================================================\n"); + printf(" SM90 GMMA Latency Comprehensive Sweep\n"); + printf("================================================================================\n"); + printf("\n"); + + // Run F32 accumulator tests + run_f32accumulator_tests(); + + // Run F16 accumulator tests + run_f16accumulator_tests(); + + // Run INT32 accumulator tests + run_int32accumulator_tests(); + + printf("================================================================================\n"); + printf(" Sweep Complete\n"); + printf("================================================================================\n"); + printf("\n"); +} + +// Legacy function signatures for compatibility +float gmma_latency_ss() { + printf("Running comprehensive WGMMA latency tests...\n"); + run_all_wgmma_latency_tests(); + return 0.0f; +} + +#endif diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma_common.h b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma_common.h new file mode 100644 index 000000000..e3248e7a1 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_gmma/lat_gmma_common.h @@ -0,0 +1,196 @@ +/*************************************************************************************************** + * GMMA Latency Microbenchmark - Common Definitions + * + * This header contains shared kernel templates and helper macros used by all GMMA latency tests. 
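+ *
+ * For reference, each per-shape wrapper under kernels/ is a thin call to the
+ * TEST_MMA_CONFIG macro defined below. A wrapper such as
+ * lat_gmma_int32u8s8u8_64x64x32.cu expands, roughly, to:
+ *
+ *   using TileShape = decltype(make_shape(Int<64>{}, Int<64>{}, Int<32>{}));
+ *   float lat = run_wgmma_latency_test_typed<uint8_t, int8_t, int32_t, TileShape>();
+ *   printf("%-50s: %6.2f cycles\n", "MMA_64x64x32_S32U8S8_SS_TN", lat);
+ *
+ * (Sketch only: the macro also wraps the call in try/catch and prints FAILED for a
+ * configuration that throws, so one unsupported shape does not abort the sweep.)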
+ * + **************************************************************************************************/ + +#ifndef LAT_GMMA_COMMON_H +#define LAT_GMMA_COMMON_H + +#include +#include +#include +#include +#include +#include + +#include "cute/arch/util.hpp" +#include "../../../hw_def/hw_def.h" + +// CUTLASS cute library headers +#include +#include "cutlass/numeric_types.h" +#include +#include +#include +#include +#include +#include + +using namespace cute; + +#define REPEAT_TIMES 1024 + +// ============================================================================ +// Base Kernel Template +// ============================================================================ + +template< + class ElementA, + class ElementB, + class ElementC, + class TileShape_MNK +> +__global__ void wgmma_latency_kernel(uint32_t *startClk, uint32_t *stopClk, uint32_t *checksum) { + int thread_idx = threadIdx.x + blockDim.x * threadIdx.y + threadIdx.z * blockDim.x * blockDim.y ; + int warp_group_idx = __shfl_sync(0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0); + + static constexpr GMMA::Major GmmaMajorA = cute::GMMA::Major::K; + static constexpr GMMA::Major GmmaMajorB = cute::GMMA::Major::K; + + // Create the GMMA operation + auto gmma_op = cute::GMMA::ss_op_selector< + ElementA, ElementB, ElementC, TileShape_MNK, GmmaMajorA, GmmaMajorB>(); + using MMA_Op = decltype(gmma_op); + + // Create the TiledMma based on element types and tile shape + using TiledMma = decltype(cute::make_tiled_mma(gmma_op)); + using MMA_Traits = typename TiledMma::Traits; + TiledMma tiled_mma; + MMA_Traits traits; + + // Create the fragment A, B, C + // Define the smem layouts using GMMA layout helpers for K-major + // Using Layout_K_INTER_Atom which has minimal swizzling (Swizzle<0,4,3> = identity) + // Layout_K_INTER_Atom_Bits has shape (8, 128 bits) = (8, 4) for 32-bit elements + constexpr int PIPE = 1; + using SmemLayoutA = decltype(tile_to_shape(GMMA::Layout_K_INTER_Atom{}, + make_shape(shape<0>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + using SmemLayoutB = decltype(tile_to_shape(GMMA::Layout_K_INTER_Atom{}, + make_shape(shape<1>(TileShape_MNK{}), shape<2>(TileShape_MNK{}), Int{}))); + + // Allocate shared memory with proper size (using type aliases for constexpr evaluation) + __shared__ ElementA smem_A[cosize_v]; + __shared__ ElementB smem_B[cosize_v]; + + // Create the layout objects for tensor construction + SmemLayoutA sA_layout{}; + SmemLayoutB sB_layout{}; + + + // Create the tensors with GMMA-compatible layouts + Tensor sA = make_tensor(make_smem_ptr(smem_A), sA_layout); // (BLK_M, BLK_K, PIPE) + Tensor sB = make_tensor(make_smem_ptr(smem_B), sB_layout); // (BLK_N, BLK_K, PIPE) + constexpr int MmaWarpGroups = size(TiledMma{}) / cutlass::NumThreadsPerWarpGroup; + Layout warp_group_thread_layout = make_layout(Int{}, + Int{}); + auto thread_mma = tiled_mma.get_slice(warp_group_thread_layout(warp_group_idx)); + + Tensor tCsA = thread_mma.partition_A(sA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCsB = thread_mma.partition_B(sB); // (MMA,MMA_N,MMA_K,PIPE) + + // Allocate "fragments/descriptors" + Tensor tCrA = thread_mma.make_fragment_A(tCsA); // (MMA,MMA_M,MMA_K,PIPE) + Tensor tCrB = thread_mma.make_fragment_B(tCsB); // (MMA,MMA_N,MMA_K,PIPE) + + // Get fragment registers for accumulator with MN size + auto accum = partition_fragment_C(tiled_mma, take<0,2>(TileShape_MNK{})); + + __syncthreads(); + + // Start timing (only thread 0) + uint32_t start = 0; + if (thread_idx == 0) { + asm volatile("mov.u32 %0, 
%%clock;" : "=r"(start) :: "memory"); + } + __syncthreads(); + + // Fence accumulator operands + warpgroup_fence_operand(accum); + + // Arrive and execute WGMMA + warpgroup_arrive(); + #pragma unroll + for (int j = 0; j < REPEAT_TIMES; j++) { + // Call the fma method + cute::gemm(tiled_mma, tCrA(_,_,_,0), tCrB(_,_,_,0), accum); + } + // Wait for WGMMA to complete + warpgroup_commit_batch(); + warpgroup_wait<0>(); + warpgroup_fence_operand(accum); + + __syncthreads(); + + // Stop timing + uint32_t stop = 0; + if (thread_idx == 0) { + asm volatile("mov.u32 %0, %%clock;" : "=r"(stop) :: "memory"); + } + + // Write results + if (thread_idx == 0) { + startClk[blockIdx.x] = start; + stopClk[blockIdx.x] = stop; + + // Compute checksum to prevent optimization + uint32_t sum = reinterpret_cast(accum.data())[0]; + // Simple checksum over accumulator + checksum[blockIdx.x] = sum; + } +} + +// ============================================================================ +// Host Function Template +// ============================================================================ + +template +float run_wgmma_latency_test_typed() { + // Allocate device memory + uint32_t *startClk_g, *stopClk_g, *checksum_g; + gpuErrchk(cudaMalloc(&startClk_g, sizeof(uint32_t))); + gpuErrchk(cudaMalloc(&stopClk_g, sizeof(uint32_t))); + gpuErrchk(cudaMalloc(&checksum_g, sizeof(uint32_t))); + + // Launch kernel with 128 threads (warpgroup size) + dim3 grid(1); + dim3 block(128); + wgmma_latency_kernel<<>>(startClk_g, stopClk_g, checksum_g); + + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + + // Copy results back + uint32_t startClk, stopClk, checksum; + gpuErrchk(cudaMemcpy(&startClk, startClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(&stopClk, stopClk_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(&checksum, checksum_g, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + + // Calculate latency + float latency = ((float)(stopClk - startClk)) / ((float)REPEAT_TIMES); + + // Cleanup + cudaFree(startClk_g); + cudaFree(stopClk_g); + cudaFree(checksum_g); + + return latency; +} + +// ============================================================================ +// Helper Macro for Testing +// ============================================================================ + +#define TEST_MMA_CONFIG(EA, EB, EC, M, N, K, DESC) \ + do { \ + try { \ + using TileShape = decltype(make_shape(Int{}, Int{}, Int{})); \ + float lat = run_wgmma_latency_test_typed(); \ + printf("%-50s: %6.2f cycles\n", DESC, lat); \ + } catch (...) 
{ \ + printf("%-50s: FAILED\n", DESC); \ + } \ + } while(0) + +#endif // LAT_GMMA_COMMON_H diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu index b74ff8484..96c67f260 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_half/lat_half.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) fpu16_latency(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu b/src/cuda/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu index bc3012192..9754f9b78 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/lat_int32/lat_int32.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) int32_latency(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu index 7e9889784..02d82f85f 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/regfile_bw/regfile_bw.cu @@ -61,5 +61,5 @@ int main(int argc, char *argv[]) std::cout << "-gpgpu_reg_file_port_throughput " << reg_ports << std::endl; } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu b/src/cuda/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu index c06028a2f..409062b03 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) sfu_max_flops(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu b/src/cuda/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu index 8deaa1c74..f2fdaf7db 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/sfu_lat_fsqrt/sfu_lat_fsqrt.cu @@ -7,5 +7,5 @@ int main(int argc, char *argv[]) sfu_latency(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu b/src/cuda/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu index 31beeb382..b72369548 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/tensor_bw_half/tensor_bw_half.cu @@ -16,5 +16,5 @@ int main(int argc, char *argv[]) // tensor_max_flops(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu b/src/cuda/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu index 923627f9b..9477f035e 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/core/tensor_lat_half/tensor_lat_half.cu @@ -16,5 +16,5 @@ int main(int argc, char *argv[]) // tensor_lat(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu index 87c57effe..b1068afc9 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_associativity/l1_associativity.cu @@ -198,5 +198,5 @@ int 
main(int argc, char *argv[]) std::cout << "Saving L1 cache assoc data at L1asso.csv" << std::endl; myfile1.close(); myfile2.close(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu index 167078b88..5034a3cd4 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu @@ -150,5 +150,5 @@ int main(int argc, char *argv[]) << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu index 170bb4b19..c03995f84 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu @@ -156,5 +156,5 @@ int main(int argc, char *argv[]) << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu index f15465060..708e90ec1 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu @@ -142,5 +142,5 @@ int main(int argc, char *argv[]) printf("L1 bandwidth = %f (byte/clk/SM)\n", bw); printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu index dd3421985..a118c86fe 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu @@ -148,5 +148,5 @@ int main(int argc, char *argv[]) << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu index f4916cb7c..45f920971 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu @@ -137,5 +137,5 @@ int main(int argc, char *argv[]) << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu index dffe033c6..57bb24a55 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_config/l1_config.cu @@ -116,5 +116,5 @@ int main(int argc, char *argv[]) << std::endl; } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu index 5566fcc9b..a96be86dd 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.cu @@ -12,5 +12,5 @@ int main(int argc, char* argv[]) { std::cout << "-gpgpu_l1_latency " << (unsigned)lat << std::endl; - return 1; + return 0; } diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu index 4958b2d5d..7efa14253 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu @@ -153,5 +153,5 @@ int main(int argc, char *argv[]) //l1_structure (stride, array_size, shared_mem_size_byte, iteration); } */ - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu index 6238f0935..81a4d97b5 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_sector/l1_sector.cu @@ -132,5 +132,5 @@ int main(int argc, char* argv[]) { myfile.close(); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu index 8175693f7..01df4a0d9 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu @@ -139,5 +139,5 @@ int main(int argc, char *argv[]) printf("Shared Memory Bandwidth = %f (byte/clk/SM)\n", bw); printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu index 8267e8a04..558003d6a 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_write_policy/l1_write_policy.cu @@ -115,5 +115,5 @@ int main(int argc, char *argv[]) "l1tex__t_sectors_pipe_lsu_mem_global_op_ld_lookup_hit.sum & " "l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_hit.sum \n\n"; - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu index e439f3648..8315c19e6 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu @@ -160,5 +160,5 @@ int main(int argc, char *argv[]) << BW << "(GB/s)\n"; std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; #endif - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu index e423022fa..0bee6a579 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu @@ -157,5 +157,5 @@ int main(int argc, char *argv[]) << BW << "(GB/s)\n"; std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; #endif - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu index ce4a532ab..828be220b 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu @@ -160,5 +160,5 @@ int main(int argc, char* argv[]) { << BW << "(GB/s)\n"; std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; #endif - return 1; + return 0; } diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu index 397d06704..ec24b5a9e 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu @@ -92,5 +92,5 @@ int main(int argc, char *argv[]) << "A:192:4,32:0,32" << std::endl; } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu index eb49d33e9..6b028bd67 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_copy_engine/l2_copy_engine.cu @@ -139,5 +139,5 @@ int main(int argc, char *argv[]) std::cout << "-gpgpu_perf_sim_memcpy " << cached << std::endl; } - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu index 932f2b537..d1d92ca72 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu @@ -14,5 +14,5 @@ int main(int argc, char *argv[]) std::cout << "-gpgpu_l2_rop_latency " << (unsigned)(lat2 - lat1) << std::endl; - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu index fd43d5a76..ba1fe10c1 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_write_policy/l2_write_policy.cu @@ -115,5 +115,5 @@ int main(int argc, char *argv[]) "lts__t_sectors_srcunit_tex_op_read_lookup_hit.sum & " "lts__t_sectors_srcunit_tex_op_write_lookup_hit.sum \n\n"; - return 1; + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu index 18f2a7657..75fed9cfd 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu @@ -81,5 +81,5 @@ int main(int argc, char *argv[]) // dramid@8;00000000.00000000.00000000.00000000.0000RRRR.RRRRRRRR.RBBBCCCC.BCCSSSSS"< // PFN_cuTensorMapEncodeTiled, CUtensorMap +#include // CUtensormap +#include +#include +#include +#include +#include +#include +using barrier = cuda::barrier; +namespace ptx = cuda::ptx; + +/* + * Test cuda program to map mbarrier related PTX instructions to SASS. 
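+ *
+ * The test kernel initializes a 64-bit mbarrier in shared memory and then issues,
+ * in order: mbarrier.expect_tx, mbarrier.complete_tx, mbarrier.arrive,
+ * mbarrier.arrive.expect_tx and mbarrier.arrive_drop (sm_90+), the .noComplete
+ * arrive/arrive_drop variants (sm_80), mbarrier.arrive_drop.expect_tx,
+ * mbarrier.pending_count, cp.async.mbarrier.arrive (with and without .noinc),
+ * and, on sm_90+, the test_wait.parity / try_wait.parity forms, with
+ * __syncthreads() between steps.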
+ * + */ + +#define CUDA_SAFECALL(call) \ + { \ + call; \ + cudaError err = cudaGetLastError(); \ + if (cudaSuccess != err) \ + { \ + fprintf( \ + stderr, \ + "Cuda error in function '%s' file '%s' in line %i : %s.\n", \ + #call, __FILE__, __LINE__, cudaGetErrorString(err)); \ + fflush(stderr); \ + exit(EXIT_FAILURE); \ + } \ + } + +__global__ __noinline__ void test_mbarrier_kernel() { + // mbarrier object is 64bit in shared memory + __shared__ uint64_t mbarrier; + uint64_t state; + int32_t count; + // Initialize the mbarrier with PTX asm + int block_size = blockDim.x * blockDim.y; + if (threadIdx.x == 0 && threadIdx.y == 0) { + asm("mbarrier.init.shared::cta.b64 [%0], %1;" : : "l"(&mbarrier), "r"(block_size) : "memory"); + } + __syncthreads(); + + // Expect on the mbarrier +#if __CUDA_ARCH__ >= 900 + int bytes_per_thread = 4; + asm("mbarrier.expect_tx.shared::cta.b64 [%0], %1;" : : "l"(&mbarrier), "r"(bytes_per_thread) : "memory"); + __syncthreads(); + + // Complete on the mbarrier + asm("mbarrier.complete_tx.shared::cta.b64 [%0], %1;" : : "l"(&mbarrier), "r"(bytes_per_thread) : "memory"); + __syncthreads(); + + // All threads in the block arrive on the mbarrier + asm("mbarrier.arrive.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(1) : "memory"); + __syncthreads(); + + // Arrive and expect on the mbarrier + asm("mbarrier.arrive.expect_tx.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(2) : "memory"); + __syncthreads(); + + // Arrive and drop + asm("mbarrier.arrive_drop.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(3) : "memory"); + __syncthreads(); +#else + // For sm_80 + // All threads in the block arrive on the mbarrier + asm("mbarrier.arrive.noComplete.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(1) : "memory"); + __syncthreads(); + + // Arrive and drop + asm("mbarrier.arrive_drop.noComplete.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(3) : "memory"); + __syncthreads(); +#endif + +#if __CUDA_ARCH__ >= 900 + // Arrive and drop + asm("mbarrier.arrive_drop.expect_tx.b64 %0, [%1], %2;" : "=l"(state) : "l"(&mbarrier), "n"(4) : "memory"); + __syncthreads(); +#endif + + // Get pending count + asm("mbarrier.pending_count.b64 %0, %1;" : "=r"(count) : "l"(state) : "memory"); + // Prevent optimizing away + if (threadIdx.x == 0 && threadIdx.y == 0) { + printf("Pending count: %d\n", count); + } + __syncthreads(); + + // cp async barrier arrive + asm("cp.async.mbarrier.arrive.shared::cta.b64 [%0];" : : "l"(&mbarrier) : "memory"); + __syncthreads(); + asm("cp.async.mbarrier.arrive.noinc.shared::cta.b64 [%0];" : : "l"(&mbarrier) : "memory"); + __syncthreads(); + + // Wait on the mbarrier +#if __CUDA_ARCH__ >= 900 + asm ("\n\t" + ".reg .pred complete;\n\t" + "mbarrier.test_wait.parity.b64 complete, [%0], %1;" + : : "l"(&mbarrier), "n"(0) : "memory" + ); + __syncthreads(); + asm ("\n\t" + "mbarrier.try_wait.parity.b64 complete, [%0], %1;" + : : "l"(&mbarrier), "n"(0) : "memory" + ); + __syncthreads(); +#endif +} + +int main(int argc, char *argv[]) { + CUDA_SAFECALL((test_mbarrier_kernel<<<1, 1>>>())); + CUDA_SAFECALL(cudaDeviceSynchronize()); + + printf("Mbarrier test completed\n"); + return 0; +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile new file mode 100644 index 000000000..d5b109de8 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/Makefile @@ -0,0 +1,13 @@ +SRC = tma_bulk.cu + +EXE = tma_bulk + +# TMA is supported on SM_90a and above +ARCH?=sm_90a 
sm_100a sm_101 sm_120 +# Unset the CUDA_CPPFLAGS which is set based on CUDA version +# but TMA is only supported on SM_90a and above +CUDA_CPPFLAGS= +# Generate code for both sm_XXX and compute_XXX (SASS and PTX) +NVCC_FLAGS := $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) -std=c++14 + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu new file mode 100644 index 000000000..b5058d6d3 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_bulk/tma_bulk.cu @@ -0,0 +1,273 @@ +#include +#include +#include +#include +#include + +/* + * Test application for TMA bulk operations. + * + * Usage: ./tma_bulk -n -o + * + */ + +#define CUDA_SAFECALL(call) \ + { \ + call; \ + cudaError err = cudaGetLastError(); \ + if (cudaSuccess != err) \ + { \ + fprintf( \ + stderr, \ + "Cuda error in function '%s' file '%s' in line %i : %s.\n", \ + #call, __FILE__, __LINE__, cudaGetErrorString(err)); \ + fflush(stderr); \ + exit(EXIT_FAILURE); \ + } \ + } + +// Adapt from https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=tma#using-tma-to-transfer-one-dimensional-arrays +using barrier = cuda::barrier; +namespace ptx = cuda::ptx; + +#if defined(__CUDA_MINIMUM_ARCH__) && __CUDA_MINIMUM_ARCH__ < 900 +static_assert(false, "Device code is being compiled with older architectures that are incompatible with TMA."); +#endif // __CUDA_MINIMUM_ARCH__ + +static constexpr size_t buf_len = 1024; +#define DEFAULT_RUN_ITERS 128 + +__global__ void test_UBLKPF(int32_t *data, int run_iters) +{ + size_t offset = blockIdx.x * blockDim.x; + + // Trigger a bulk prefetch + uint64_t prefetch_addr = uint64_t(data + offset); + uint32_t prefetch_count = buf_len * sizeof(int32_t); + if (threadIdx.x == 0) + { + asm volatile( + "cp.async.bulk.prefetch.L2.global" + " [%0], %1;" + : + : "l"(prefetch_addr), + "r"(prefetch_count) + : "memory"); + ptx::cp_async_bulk_commit_group(); + ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>()); + } +} + +__global__ void test_UBLKCP_S_G(int32_t *data, int run_iters) +{ + // Shared memory buffer. The destination shared memory buffer of + // a bulk operations should be 16 byte aligned. + __shared__ alignas(16) int32_t smem_data[buf_len]; + + size_t offset = blockIdx.x * blockDim.x; + #pragma nv_diag_suppress static_var_with_dynamic_init + __shared__ barrier bar; + if (threadIdx.x == 0) { + init(&bar, blockDim.x); + ptx::fence_proxy_async(ptx::space_shared); + } + __syncthreads(); + + for (int i = 0; i < run_iters; i++) { + // Initiate TMA transfer to copy global to shared memory. + if (threadIdx.x == 0) + { + cuda::memcpy_async( + smem_data, + data + offset, + cuda::aligned_size_t<16>(sizeof(smem_data)), + bar); + } + barrier::arrival_token token = bar.arrive(); + bar.wait(std::move(token)); + } +} + +__global__ void test_UBLKCP_G_S(int32_t *data, int run_iters) +{ + // Shared memory buffer. The destination shared memory buffer of + // a bulk operations should be 16 byte aligned. 
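+    // Note: here the shared buffer is the source of the bulk copy (global memory
+    // is the destination); bulk-copy source addresses have the same 16-byte
+    // alignment requirement.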
+ __shared__ alignas(16) int32_t smem_data[buf_len]; + + size_t offset = blockIdx.x * blockDim.x; + + // Compute a unique value for each thread across all blocks + for (int i = threadIdx.x; i < buf_len; i += blockDim.x) + { + smem_data[i] = threadIdx.x + blockIdx.x * blockDim.x; + } + + ptx::fence_proxy_async(ptx::space_shared); // b) + __syncthreads(); + + for (int i = 0; i < run_iters; i++) { + if (threadIdx.x == 0) { + ptx::cp_async_bulk( + ptx::space_global, + ptx::space_shared, + data + offset, smem_data, sizeof(smem_data)); + ptx::cp_async_bulk_commit_group(); + ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>()); + } + } +} + +__global__ void test_UBLKRED_G_S(int32_t *data, int run_iters) +{ + // Shared memory buffer. The destination shared memory buffer of + // a bulk operations should be 16 byte aligned. + __shared__ alignas(16) int32_t smem_data[buf_len]; + + size_t offset = blockIdx.x * blockDim.x; + + // Compute a unique value for each thread across all blocks + for (int i = threadIdx.x; i < buf_len; i += blockDim.x) + { + smem_data[i] = threadIdx.x + blockIdx.x * blockDim.x; + } + + ptx::fence_proxy_async(ptx::space_shared); // b) + __syncthreads(); + + for (int i = 0; i < run_iters; i++) { + if (threadIdx.x == 0) + { + // Use max so the result wont change compared with + // before the reduction, as TMA can only get source values + ptx::cp_reduce_async_bulk( + ptx::space_global, + ptx::space_shared, + ptx::op_max, + data + offset, smem_data, sizeof(smem_data)); + ptx::cp_async_bulk_commit_group(); + ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>()); + } + } +} + +int main(int argc, char *argv[]) +{ + // Parse command line arguments + int n = 1024 * 16; + const char* opcode = "UBLKPF"; + int opt; + int run_iters = DEFAULT_RUN_ITERS; + while ((opt = getopt(argc, argv, "n:o:i:")) != -1) { + switch (opt) { + case 'n': + n = atoi(optarg); + break; + case 'o': + opcode = strdup(optarg); + break; + case 'i': + run_iters = atoi(optarg); + break; + default: + fprintf(stderr, "Usage: %s -n -o \n", argv[0]); + fprintf(stderr, " -n : number of elements\n"); + fprintf(stderr, " -o : opcode\n"); + fprintf(stderr, " -o UBLKPF: prefetch\n"); + fprintf(stderr, " -o UBLKCP_S_G: bulk copy shared to global\n"); + fprintf(stderr, " -o UBLKCP_G_S: bulk copy global to shared\n"); + fprintf(stderr, " -o UBLKRED_G_S: bulk reduce global to shared\n"); + fprintf(stderr, " -i : number of iterations\n"); + return 1; + } + } + + // Check if opcode is valid + if (strcmp(opcode, "UBLKPF") != 0 && + strcmp(opcode, "UBLKCP_S_G") != 0 && + strcmp(opcode, "UBLKCP_G_S") != 0 && + strcmp(opcode, "UBLKRED_G_S") != 0) { + fprintf(stderr, "Invalid opcode\n"); + return 1; + } + + // Host input vectors + int32_t *h_a; + + // Host destination vectors + int32_t *h_b; + + // Device input vectors + int32_t *d_a; + + // Size, in bytes, of each vector + size_t bytes = n * sizeof(int32_t); + + // Allocate memory for each vector on host + h_a = (int32_t *)malloc(bytes); + h_b = (int32_t *)malloc(bytes); + // Allocate memory for each vector on GPU + cudaMalloc(&d_a, bytes); + + uint32_t i; + // Initialize vectors on host with unique values + for (i = 0; i < n; i++) + { + h_a[i] = i; + } + + // Copy host vectors to device + cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice); + + uint32_t blockSize, gridSize; + + // Number of threads in each thread block + blockSize = 1024; + + // Number of thread blocks in grid + gridSize = (uint32_t)ceil((float)n / blockSize); + + // Execute the kernel based on the opcode + if 
(strcmp(opcode, "UBLKPF") == 0) { + CUDA_SAFECALL((test_UBLKPF<<>>(d_a, run_iters))); + } else if (strcmp(opcode, "UBLKCP_S_G") == 0) { + CUDA_SAFECALL((test_UBLKCP_S_G<<>>(d_a, run_iters))); + } else if (strcmp(opcode, "UBLKCP_G_S") == 0) { + CUDA_SAFECALL((test_UBLKCP_G_S<<>>(d_a, run_iters))); + } else if (strcmp(opcode, "UBLKRED_G_S") == 0) { + CUDA_SAFECALL((test_UBLKRED_G_S<<>>(d_a, run_iters))); + } + + // Copy array back to host + cudaMemcpy(h_b, d_a, bytes, cudaMemcpyDeviceToHost); + + // Dump the values to a file in hex format + // For load operations, use host source array + int32_t *ptr = h_a; + + // For store operations, use destination array + if (strcmp(opcode, "UBLKCP_G_S") == 0 || + strcmp(opcode, "UBLKRED_G_S") == 0) { + ptr = h_b; + } + + char filename[100]; + sprintf(filename, "tma_bulk_test_%s_%d.txt", opcode, n); + FILE *f = fopen(filename, "w"); + for (i = 0; i < n; i++) + { + fprintf(f, "0x%x ", ptr[i]); + // Add line break after every 512 values + if ((i + 1) % 512 == 0) + fprintf(f, "\n"); + } + fclose(f); + printf("Values dumped to %s\n", filename); + + // Release host memory + free(h_a); + + // Release device memory + cudaFree(d_a); + + return 0; +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile new file mode 100644 index 000000000..0feb9fb6a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/Makefile @@ -0,0 +1,14 @@ +SRC = tma_tensor.cu + +EXE = tma_tensor + +# TMA is supported on SM_90a and above +ARCH?=sm_90a sm_100a sm_101 sm_120 +# Unset the CUDA_CPPFLAGS which is set based on CUDA version +# but TMA is only supported on SM_90a and above +CUDA_CPPFLAGS= + +# Generate code for both sm_XXX and compute_XXX (SASS and PTX) +NVCC_FLAGS := $(foreach arch,$(ARCH),-gencode=arch=compute_$(subst sm_,,$(arch)),code=$(arch) -gencode=arch=compute_$(subst sm_,,$(arch)),code=compute_$(subst sm_,,$(arch))) -std=c++14 + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu new file mode 100644 index 000000000..44a00f85f --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/tma/tma_tensor/tma_tensor.cu @@ -0,0 +1,401 @@ +// Adapt from https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html?highlight=tma#using-tma-to-transfer-multi-dimensional-arrays +#include // PFN_cuTensorMapEncodeTiled, CUtensorMap +#include // CUtensormap +#include +#include +#include +#include +#include +#include +using barrier = cuda::barrier; +namespace ptx = cuda::ptx; + +/* + * Test application for TMA tensor operations. 
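+ *
+ * Example invocations (argument values are illustrative):
+ *   ./tma_tensor -w 1024 -h 1024 -o UTMALDG -i 128
+ *   ./tma_tensor -o UTMASTG   (defaults: 1024 x 1024 matrix, 128 iterations)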
+ * + * Usage: ./tma_tensor -w -h -o + * + */ + +#define CUDA_SAFECALL(call) \ + { \ + call; \ + cudaError err = cudaGetLastError(); \ + if (cudaSuccess != err) \ + { \ + fprintf( \ + stderr, \ + "Cuda error in function '%s' file '%s' in line %i : %s.\n", \ + #call, __FILE__, __LINE__, cudaGetErrorString(err)); \ + fflush(stderr); \ + exit(EXIT_FAILURE); \ + } \ + } + +#define GMEM_WIDTH 1024 +#define GMEM_HEIGHT 1024 +#define SMEM_WIDTH 32 +#define SMEM_HEIGHT 32 +#define DEFAULT_RUN_ITERS 128 + +enum class TestType { + UTMAPF, + UTMALDG, + UTMALDG_L2Hint, + UTMASTG, + UTMAREDG, + REGULAR_LOAD +}; + +static const std::unordered_map opcode_map = { + {"UTMAPF", TestType::UTMAPF}, + {"UTMALDG", TestType::UTMALDG}, + {"UTMASTG", TestType::UTMASTG}, + {"UTMAREDG", TestType::UTMAREDG}, + {"REGULAR_LOAD", TestType::REGULAR_LOAD} +}; + +__global__ void test_kernel(const __grid_constant__ CUtensorMap tensor_map, TestType test_type, int width_stride, int run_iters); +__device__ void test_UTMAPF_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters); +__device__ void test_UTMALDG_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters); +__device__ void test_UTMASTG_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters); +__device__ void test_UTMAREDG_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters); +__device__ void test_REGULAR_LOAD_kernel(int *mat, int x, int y, int width_stride, int run_iters); + +__global__ void test_kernel(const __grid_constant__ CUtensorMap tensor_map, int *mat, TestType test_type, int width_stride, int run_iters) { + int x = blockDim.x * blockIdx.x; + int y = blockDim.y * blockIdx.y; + if (blockIdx.x == 0 && blockIdx.y == 0 && + threadIdx.x == 0 && threadIdx.y == 0) { + printf("TensorMap address: %p\n", &tensor_map); + } + switch (test_type) { + case TestType::UTMAPF: + test_UTMAPF_kernel(tensor_map, x, y, run_iters); + break; + case TestType::UTMALDG: + test_UTMALDG_kernel(tensor_map, x, y, run_iters); + break; + case TestType::UTMASTG: + test_UTMASTG_kernel(tensor_map, x, y, run_iters); + break; + case TestType::UTMAREDG: + test_UTMAREDG_kernel(tensor_map, x, y, run_iters); + break; + case TestType::REGULAR_LOAD: + test_REGULAR_LOAD_kernel(mat, x, y, width_stride, run_iters); + break; + default: + test_UTMAPF_kernel(tensor_map, x, y, run_iters); + break; + } +} + +__device__ void test_UTMAPF_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters) { + // TensorMap prefetch at tensor_map with tensor coord {x, y} + if (threadIdx.x == 0 && threadIdx.y == 0) { + asm volatile ( + "cp.async.bulk.prefetch.tensor.2d.L2.global.tile" + " [%0, {%1, %2}];" + : + : "l"(&tensor_map), + "r"(x), + "r"(y) + : "memory"); + } +} + +__device__ void test_UTMALDG_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters) { + // The destination shared memory buffer of a bulk tensor operation should be + // 128 byte aligned. + __shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH]; + +// Initialize shared memory barrier with the number of threads participating in the barrier. +#pragma nv_diag_suppress static_var_with_dynamic_init + __shared__ barrier bar; + + if (threadIdx.x == 0 && threadIdx.y == 0) + { + // Initialize barrier. All threads in block participate. + init(&bar, blockDim.x * blockDim.y); + // Make initialized barrier visible in async proxy. + ptx::fence_proxy_async(ptx::space_shared); + } + // Syncthreads so initialized barrier is visible to all threads. 
+ __syncthreads(); + + for (int i = 0; i < run_iters; i++) { + barrier::arrival_token token; + if (threadIdx.x == 0 && threadIdx.y == 0) { + // Initiate bulk tensor copy. + ptx::cp_async_bulk_tensor( + ptx::space_cluster, + ptx::space_global, + &smem_buffer, + &tensor_map, + {x, y}, + cuda::device::barrier_native_handle(bar) + ); + // Arrive on the barrier and tell how many bytes are expected to come in. + token = cuda::device::barrier_arrive_tx(bar, 1, sizeof(smem_buffer)); + } + else + { + // Other threads just arrive. + token = bar.arrive(); + } + // Wait for the data to have arrived. + bar.wait(std::move(token)); + } +} + +__device__ void test_UTMASTG_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters) { + __shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH]; + + // Compute a unique value for the thread + int thread_x = threadIdx.x + x; + int thread_y = threadIdx.y + y; + smem_buffer[threadIdx.y][threadIdx.x] = 1 + thread_x + thread_y * blockDim.x * gridDim.x; + + // Wait for shared memory writes to be visible to TMA engine. + ptx::fence_proxy_async(ptx::space_shared); + __syncthreads(); + + // Initiate TMA transfer to copy shared memory to global memory + for (int i = 0; i < run_iters; i++) { + if (threadIdx.x == 0 && threadIdx.y == 0) { + ptx::cp_async_bulk_tensor( + ptx::space_global, + ptx::space_shared, + &tensor_map, + {x, y}, + &smem_buffer + ); + ptx::cp_async_bulk_commit_group(); + ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>()); + } + } +} + +__device__ void test_UTMAREDG_kernel(CUtensorMap const& tensor_map, int x, int y, int run_iters) { + __shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH]; + + // Compute a unique value for the thread + int thread_x = threadIdx.x + x; + int thread_y = threadIdx.y + y; + // Add 1 so the max op will not be effective + smem_buffer[threadIdx.y][threadIdx.x] = 1 + thread_x + thread_y * blockDim.x * gridDim.x; + + // Wait for shared memory writes to be visible to TMA engine. 
+ ptx::fence_proxy_async(ptx::space_shared); + __syncthreads(); + + // Initiate TMA transfer to copy shared memory to global memory + for (int i = 0; i < run_iters; i++) { + if (threadIdx.x == 0 && threadIdx.y == 0) { + ptx::cp_reduce_async_bulk_tensor( + ptx::space_global, + ptx::space_shared, + ptx::op_max, + &tensor_map, + {x, y}, + &smem_buffer + ); + ptx::cp_async_bulk_commit_group(); + ptx::cp_async_bulk_wait_group_read(ptx::n32_t<0>()); + } + } +} + +__device__ void test_REGULAR_LOAD_kernel(int *mat, int x, int y, int width_stride, int run_iters) { + __shared__ alignas(128) int smem_buffer[SMEM_HEIGHT][SMEM_WIDTH]; + + // Compute a unique value for the thread + int thread_x = threadIdx.x + x; + int thread_y = threadIdx.y + y; + // Mimic a TMA load pattern here + for (int i = 0; i < run_iters; i++) { + if (threadIdx.x == 0 && threadIdx.y == 0) { + for (int row = 0; row < SMEM_HEIGHT; row++) { + for (int col = 0; col < SMEM_WIDTH; col++) { + smem_buffer[row][col] = mat[(y + row) * width_stride + (x + col)] + 1; + } + } + } + __syncthreads(); + // Mimic a TMA store pattern here to make compiler happy + if (threadIdx.x == 0 && threadIdx.y == 0) { + for (int row = 0; row < SMEM_HEIGHT; row++) { + for (int col = 0; col < SMEM_WIDTH; col++) { + mat[(y + row) * width_stride + (x + col)] = smem_buffer[row][col]; + } + } + } + __syncthreads(); + } +} + +PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() +{ + // Get pointer to cuTensorMapEncodeTiled + cudaDriverEntryPointQueryResult driver_status; + void *cuTensorMapEncodeTiled_ptr = nullptr; + CUDA_SAFECALL(cudaGetDriverEntryPointByVersion("cuTensorMapEncodeTiled", &cuTensorMapEncodeTiled_ptr, 12000, cudaEnableDefault, &driver_status)); + assert(driver_status == cudaDriverEntryPointSuccess); + + return reinterpret_cast(cuTensorMapEncodeTiled_ptr); +} + + +int main(int argc, char *argv[]) { + uint64_t width = GMEM_WIDTH; + uint64_t height = GMEM_HEIGHT; + std::string opcode = "UTMAPF"; + TestType test_type = TestType::UTMAPF; + int run_iters = DEFAULT_RUN_ITERS; + int opt; + while ((opt = getopt(argc, argv, "w:h:o:i:")) != -1) { + switch (opt) { + case 'w': + width = uint64_t(atoi(optarg)); + break; + case 'h': + height = uint64_t(atoi(optarg)); + break; + case 'o': + opcode = std::string(optarg); + break; + case 'i': + run_iters = atoi(optarg); + break; + default: + fprintf(stderr, "Usage: %s -w -h -o \n", argv[0]); + fprintf(stderr, " Block size: %d x %d\n", SMEM_WIDTH, SMEM_HEIGHT); + fprintf(stderr, " -w : width of the matrix\n"); + fprintf(stderr, " -h : height of the matrix\n"); + fprintf(stderr, " -o : opcode\n"); + fprintf(stderr, " -o UTMAPF: tensor prefetch\n"); + fprintf(stderr, " -o UTMALDG: tensor load async\n"); + fprintf(stderr, " -o UTMASTG: tensor store async\n"); + fprintf(stderr, " -o UTMAREDG: tensor reduce async\n"); + fprintf(stderr, " -i : number of iterations\n"); + return 1; + } + } + + // Check if opcode is valid + if (opcode_map.find(opcode) == opcode_map.end()) { + fprintf(stderr, "Invalid opcode\n"); + return 1; + } + + test_type = opcode_map.at(opcode); + + // Initialize data matrix + int *mat, *out_mat, *d_mat; + // height|width_stride must be a multiple of 16 and must be greater than height|width + // Here we make it multiple of SMEM_HEIGHT and SMEM_WIDTH to fit our shmem size + uint64_t height_stride = ((height + (SMEM_HEIGHT - 1)) / SMEM_HEIGHT) * SMEM_HEIGHT; + uint64_t width_stride = ((width + (SMEM_WIDTH - 1)) / SMEM_WIDTH) * SMEM_WIDTH; + printf("height: %lu, width: %lu\n", height, width); + 
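+    // Example: width = height = 1000 with 32x32 shared-memory tiles rounds up to
+    // width_stride = height_stride = 1024.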
printf("height_stride: %lu, width_stride: %lu\n", height_stride, width_stride); + size_t byte_count = height_stride * width_stride * sizeof(int); + mat = (int *)malloc(byte_count); + out_mat = (int *)malloc(byte_count); + cudaMalloc(&d_mat, byte_count); + + // Initialize the matrix, set the values to 0 for out of bounds + uint64_t i = 1; + for (uint64_t r = 0; r < height_stride; r++) { + for (uint64_t c = 0; c < width_stride; c++) { + // If within the tensor, set some unique values + if (r < height && c < width) { + mat[r * width_stride + c] = i; + i++; + } + // Otherwise, set the value to 0 + else { + mat[r * width_stride + c] = 0; + } + } + } + + // TMA tensor map object + CUtensorMap tensor_map{}; + // rank is the number of dimensions of the array. + constexpr uint32_t rank = 2; + // The tensor size + uint64_t size[rank] = {width, height}; + // The stride is the number of bytes to traverse from the first element of one row to the next. + // It must be a multiple of 16. + uint64_t stride[rank - 1] = {width_stride * sizeof(int)}; + // The box_size is the size of the shared memory buffer that is used as the + // destination of a TMA transfer. + uint32_t box_size[rank] = {SMEM_WIDTH, SMEM_HEIGHT}; + // The distance between elements in units of sizeof(element). A stride of 2 + // can be used to load only the real component of a complex-valued tensor, for instance. + uint32_t elem_stride[rank] = {1, 1}; + + // Get a function pointer to the cuTensorMapEncodeTiled driver API. + auto cuTensorMapEncodeTiled = get_cuTensorMapEncodeTiled(); + + // Create the tensor descriptor. + CUresult res = cuTensorMapEncodeTiled( + &tensor_map, // CUtensorMap *tensorMap, + CUtensorMapDataType::CU_TENSOR_MAP_DATA_TYPE_INT32, + rank, // cuuint32_t tensorRank, + d_mat, // void *globalAddress, + size, // const cuuint64_t *globalDim, + stride, // const cuuint64_t *globalStrides, + box_size, // const cuuint32_t *boxDim, + elem_stride, // const cuuint32_t *elementStrides, + // Interleave patterns can be used to accelerate loading of values that + // are less than 4 bytes long. + CUtensorMapInterleave::CU_TENSOR_MAP_INTERLEAVE_NONE, + // Swizzling can be used to avoid shared memory bank conflicts. + CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE, + // L2 Promotion can be used to widen the effect of a cache-policy to a wider + // set of L2 cache lines. + CUtensorMapL2promotion::CU_TENSOR_MAP_L2_PROMOTION_NONE, + // Any element that is outside of bounds will be set to zero by the TMA transfer. 
+        CUtensorMapFloatOOBfill::CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
+    // Make sure the tensor map was created successfully.
+    assert(res == CUDA_SUCCESS);
+
+    // Kernel launch
+    dim3 grid_dim(width_stride / SMEM_WIDTH, height_stride / SMEM_HEIGHT);
+    dim3 block_dim(SMEM_WIDTH, SMEM_HEIGHT);
+    printf("grid_dim: x: %d, y: %d\n", grid_dim.x, grid_dim.y);
+    printf("block_dim: x: %d, y: %d\n", block_dim.x, block_dim.y);
+    cudaMemcpy(d_mat, mat, byte_count, cudaMemcpyHostToDevice);
+    CUDA_SAFECALL((test_kernel<<<grid_dim, block_dim>>>(tensor_map, d_mat, test_type, width_stride, run_iters)));
+    CUDA_SAFECALL(cudaMemcpy(out_mat, d_mat, byte_count, cudaMemcpyDeviceToHost));
+
+    // Dump the matrix values to an output file in hex format
+    // For load operations, use the host source array
+    int *ptr = mat;
+
+    // For store operations, use the destination array
+    if (test_type == TestType::UTMASTG || test_type == TestType::UTMAREDG) {
+        ptr = out_mat;
+    }
+
+    char filename[100];
+    sprintf(filename, "tma_tensor_test_%s_%lu_%lu.txt", opcode.c_str(), height, width);
+    FILE *f = fopen(filename, "w");
+    for (uint64_t i = 0; i < height_stride * width_stride; i++)
+    {
+        fprintf(f, "0x%x ", ptr[i]);
+        // Add a line break after every 512 values
+        if ((i + 1) % 512 == 0)
+            fprintf(f, "\n");
+    }
+    fclose(f);
+    printf("Values dumped to %s\n", filename);
+
+    // Release device memory
+    cudaFree(d_mat);
+
+    // Release host memory
+    free(mat);
+    free(out_mat);
+    return 0;
+}
diff --git a/src/setup_environment b/src/setup_environment
index a7122c679..0b21b2431 100755
--- a/src/setup_environment
+++ b/src/setup_environment
@@ -1,6 +1,14 @@
 export GPUAPPS_SETUP_ENVIRONMENT_WAS_RUN=
-export GPUAPPS_ROOT="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"/../
+# Determine script location (bash and zsh compatible)
+if [ -n "$BASH_SOURCE" ]; then
+    SCRIPT_PATH="$BASH_SOURCE"
+elif [ -n "$ZSH_VERSION" ]; then
+    SCRIPT_PATH="${(%):-%x}"
+else
+    SCRIPT_PATH="$0"
+fi
+export GPUAPPS_ROOT="$( cd "$( dirname "$SCRIPT_PATH" )" && pwd )"/../
 export CUDA_PATH=$CUDA_INSTALL_PATH
 export CUDA_VERSION=`nvcc --version | grep release | sed -re 's/.*release ([0-9]+\.[0-9]+).*/\1/'`;
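
Note: a hedged usage sketch for the TMA benchmark driver added above. The binary name tma_tensor_test is only inferred from the dump-file prefix in main(); the actual executable name comes from the benchmark's build rules and may differ.

    # Run the async bulk tensor load test (UTMALDG) on a 256 x 256 int matrix for 10 iterations.
    # "tma_tensor_test" is an assumed binary name; substitute the real build output.
    ./tma_tensor_test -w 256 -h 256 -o UTMALDG -i 10
    # The matrix used for the dump is written to tma_tensor_test_UTMALDG_256_256.txt
    # as hex values, 512 per line.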