diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 57f9c00ea..a1f193662 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -30,7 +30,7 @@ jobs: run: | git config --global --add safe.directory /__w/gpu-app-collection/gpu-app-collection git submodule update --init -- src/cuda/cuda-samples - /bin/bash test-build.sh + /bin/bash test-build.sh ci - name: Print Successful Apps if: always() diff --git a/src/Makefile b/src/Makefile index 64a679840..86404973a 100644 --- a/src/Makefile +++ b/src/Makefile @@ -12,7 +12,7 @@ else all: pannotia rodinia_2.0-ft proxy-apps microbench rodinia-3.1 ispass-2009 polybench parboil shoc custom_apps endif endif -ci: rodinia_2.0-ft rodinia-3.1 GPU_Microbenchmark +ci: rodinia_2.0-ft rodinia-3.1 GPU_Microbenchmark cutlass cuda_samples accelwattch: accelwattch_validation accelwattch_hw_power accelwattch_ubench accelwattch_validation: rodinia-3.1_accelwattch_validation parboil_accelwattch_validation cutlass cuda_samples-11.0_accelwattch_validation accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw_power @@ -34,7 +34,7 @@ data: ###################################################################################################3 UVMSmart_test: mkdir -p $(BINDIR)/$(BINSUBDIR)/ - cd cuda/UVMSmart_test/; make + cd cuda/UVMSmart_test/; $(MAKE) mv cuda/UVMSmart_test/Managed/2DCONV/2dconv $(BINDIR)/$(BINSUBDIR)/uvm_2dconv mv cuda/UVMSmart_test/Managed/ATAX/atax $(BINDIR)/$(BINSUBDIR)/uvm_atax mv cuda/UVMSmart_test/Managed/AddVectors/add_vectors $(BINDIR)/$(BINSUBDIR)/uvm_add_vectors @@ -54,23 +54,23 @@ UVMSmart_test: # Rodinia 2.0 Functional Test Stuff ###################################################################################################3 rodinia_2.0-ft: - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/backprop - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/bfs - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/heartwall - 
$(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/hotspot -# $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/kmeans - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/lud - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/nn - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/nw - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/pathfinder - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/srad - $(SETENV) make $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/streamcluster + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/backprop + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/bfs + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/heartwall + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/hotspot +# $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/kmeans + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/lud + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/nn + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/nw + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/pathfinder + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/srad + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/rodinia/2.0-ft/streamcluster ###################################################################################################3 # Purdue microbenchmarks for added functionality ###################################################################################################3 microbench: - $(SETENV) make $(MAKE_ARGS) -C cuda/microbench cuda-$(CUDA_VERSION_MAJOR) + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/microbench cuda-$(CUDA_VERSION_MAJOR) ###################################################################################################3 # For Dragon, we need to change the archs manually! 
(TO DO) @@ -107,12 +107,14 @@ dragon-cdp: dragon-naive GPU_Microbenchmark: mkdir -p $(BINDIR)/$(BINSUBDIR)/ - $(SETENV) make $(MAKE_ARGS) -C cuda/GPU_Microbenchmark -j12 + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/GPU_Microbenchmark cp -r cuda/GPU_Microbenchmark/bin/* $(BINDIR)/$(BINSUBDIR)/ +clean_GPU_Microbenchmark: + find cuda/GPU_Microbenchmark/ubench -type f -executable -delete Deepbench_nvidia: - $(SETENV) make $(MAKE_ARGS) -C cuda/DeepBench/code/nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/DeepBench/code/nvidia cp -r cuda/DeepBench/code/nvidia/bin/conv_bench* $(BINDIR)/$(BINSUBDIR)/ cp -r cuda/DeepBench/code/nvidia/bin/gemm_bench* $(BINDIR)/$(BINSUBDIR)/ cp -r cuda/DeepBench/code/nvidia/bin/rnn_bench* $(BINDIR)/$(BINSUBDIR)/ @@ -122,16 +124,16 @@ Deepbench_nvidia: ###################################################################################################3 pannotia: - $(SETENV) make $(MAKE_ARGS) -C cuda/pannotia/bc - $(SETENV) export VARIANT="MAX"; make $(MAKE_ARGS) -C cuda/pannotia/color - $(SETENV) export VARIANT="MAXMIN"; make $(MAKE_ARGS) -C cuda/pannotia/color - $(SETENV) export VARIANT="DEFAULT"; make $(MAKE_ARGS) -C cuda/pannotia/fw - $(SETENV) export VARIANT="BLOCK"; make $(MAKE_ARGS) -C cuda/pannotia/fw - $(SETENV) make $(MAKE_ARGS) -C cuda/pannotia/mis - $(SETENV) export VARIANT="DEFAULT"; make $(MAKE_ARGS) -C cuda/pannotia/pagerank - $(SETENV) export VARIANT="SPMV"; make $(MAKE_ARGS) -C cuda/pannotia/pagerank - $(SETENV) export VARIANT="CSR"; make $(MAKE_ARGS) -C cuda/pannotia/sssp - $(SETENV) export VARIANT="ELL"; make $(MAKE_ARGS) -C cuda/pannotia/sssp + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/bc + $(SETENV) export VARIANT="MAX"; $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/color + $(SETENV) export VARIANT="MAXMIN"; $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/color + $(SETENV) export VARIANT="DEFAULT"; $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/fw + $(SETENV) export VARIANT="BLOCK"; $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/fw + $(SETENV) $(MAKE) 
$(MAKE_ARGS) -C cuda/pannotia/mis + $(SETENV) export VARIANT="DEFAULT"; $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/pagerank + $(SETENV) export VARIANT="SPMV"; $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/pagerank + $(SETENV) export VARIANT="CSR"; $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/sssp + $(SETENV) export VARIANT="ELL"; $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/sssp ###################################################################################################3 #TO DO @@ -146,42 +148,42 @@ proxy-apps: ($(SETENV) cd cuda/proxy-apps-doe/cns/ ; ./compile.bash) #chmod +x cuda/proxy-apps-doe/comd/cmd_compile.sh #( cd cuda/proxy-apps-doe/comd ; ./cmd_compile.sh) - $(SETENV) make $(MAKE_ARGS) -C cuda/proxy-apps-doe/lulesh + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/proxy-apps-doe/lulesh if [ ${CUDA_VERSION_MAJOR} -lt 7 ] ; then \ - $(SETENV) make $(MAKE_ARGS) -C cuda/proxy-apps-doe/minife_matvec_ell;\ + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/proxy-apps-doe/minife_matvec_ell;\ fi - $(SETENV) make $(MAKE_ARGS) -C cuda/proxy-apps-doe/xsbench + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/proxy-apps-doe/xsbench rodinia-3.1: mkdir -p $(BINDIR)/$(BINSUBDIR) if [ ${CUDA_VERSION_MAJOR} -gt 5 ]; then \ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/huffman/; \ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/huffman/; \ fi - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop -f Makefile_nvidia - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/bfs -f Makefile_nvidia - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/cfd -f Makefile_nvidia - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/bfs -f Makefile_nvidia + $(SETENV) 
$(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/cfd -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot -f Makefile_nvidia # All the texture binding stuff in this old workload seems to cause issues with the CUDA API -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans -f Makefile_nvidia - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/nw -f Makefile_nvidia - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/streamcluster -f Makefile_nvidia -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/mummergpu - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/dwt2d/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/heartwall/ -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hybridsort/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/myocyte/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/nn/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/particlefilter/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/particlefilter/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/lavaMD/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/lud/cuda/ -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/leukocyte/CUDA/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot3D/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/gaussian - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v1 - 
$(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v2 -f Makefile_nvidia +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/nw -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/streamcluster -f Makefile_nvidia +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/mummergpu + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/dwt2d/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/heartwall/ +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hybridsort/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/myocyte/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/nn/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/particlefilter/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/particlefilter/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/lavaMD/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/lud/cuda/ +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/leukocyte/CUDA/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot3D/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/gaussian + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v2 -f Makefile_nvidia if [ 
${CUDA_VERSION_MAJOR} -gt 5 ]; then \ mv cuda/rodinia/3.1/cuda/huffman/pavle $(BINDIR)/$(BINSUBDIR)/huffman-rodinia-3.1; \ fi @@ -212,12 +214,12 @@ rodinia-3.1: rodinia-3.1_accelwattch_validation: mkdir -p $(BINDIR)/$(BINSUBDIR) - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop -f Makefile_nvidia - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot -f Makefile_nvidia -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans -f Makefile_nvidia - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot -f Makefile_nvidia +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v1 mv cuda/rodinia/3.1/cuda/b+tree/b+tree.out $(BINDIR)/$(BINSUBDIR)/b+tree-rodinia-3.1 mv cuda/rodinia/3.1/cuda/pathfinder/pathfinder $(BINDIR)/$(BINSUBDIR)/pathfinder-rodinia-3.1 mv cuda/rodinia/3.1/cuda/srad/srad_v1/srad $(BINDIR)/$(BINSUBDIR)/srad_v1-rodinia-3.1 @@ -227,18 +229,18 @@ rodinia-3.1_accelwattch_validation: rodinia-3.1_hw_power: mkdir -p $(BINDIR)/$(BINSUBDIR) - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop_k1 -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop_k1 -f Makefile_nvidia mv 
$(BINDIR)/$(BINSUBDIR)/backprop $(BINDIR)/$(BINSUBDIR)/backprop_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop_k2 -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop_k2 -f Makefile_nvidia mv $(BINDIR)/$(BINSUBDIR)/backprop $(BINDIR)/$(BINSUBDIR)/backprop_k2 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot_k1 -f Makefile_nvidia + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot_k1 -f Makefile_nvidia mv $(BINDIR)/$(BINSUBDIR)/hotspot $(BINDIR)/$(BINSUBDIR)/hotspot_k1 -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans_k1 -f Makefile_nvidia +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans_k1 -f Makefile_nvidia # mv $(BINDIR)/$(BINSUBDIR)/kmeans $(BINDIR)/$(BINSUBDIR)/kmeans_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree_k1/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree_k2/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder_k1/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v1_k1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree_k1/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree_k2/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder_k1/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v1_k1 mv cuda/rodinia/3.1/cuda/b+tree_k1/b+tree.out $(BINDIR)/$(BINSUBDIR)/btree_k1 mv cuda/rodinia/3.1/cuda/b+tree_k2/b+tree.out $(BINDIR)/$(BINSUBDIR)/btree_k2 mv cuda/rodinia/3.1/cuda/pathfinder_k1/pathfinder $(BINDIR)/$(BINSUBDIR)/pathfinder_k1 @@ -247,14 +249,14 @@ rodinia-3.1_hw_power: cuda_samples-11.0_accelwattch_validation: mkdir -p 
$(BINDIR)/$(BINSUBDIR) - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/histogram - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/SobolQRNG - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/cudaTensorCoreGemm + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/histogram + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/SobolQRNG + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/cudaTensorCoreGemm mv cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions/binomialOptions $(BINDIR)/$(BINSUBDIR)/binomialOptions mv cuda/NVIDIA_CUDA-11.0_Samples/dct8x8/dct8x8 $(BINDIR)/$(BINSUBDIR)/dct8x8 mv cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform/fastWalshTransform $(BINDIR)/$(BINSUBDIR)/fastWalshTransform @@ -266,18 +268,18 @@ cuda_samples-11.0_accelwattch_validation: 
cuda_samples-11.0_hw_power: mkdir -p $(BINDIR)/$(BINSUBDIR) - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k2 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform_k2 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/histogram_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort_k2 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator_k2 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/SobolQRNG_k1 - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/cudaTensorCoreGemm_k1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions_k1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k2 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform_k1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform_k2 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/histogram_k1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort_k1 + 
$(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort_k2 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator_k1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator_k2 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/SobolQRNG_k1 + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/cudaTensorCoreGemm_k1 mv cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions_k1/binomialOptions $(BINDIR)/$(BINSUBDIR)/binomialOptions_k1 mv cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k1/dct8x8 $(BINDIR)/$(BINSUBDIR)/dct8x8_k1 mv cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k2/dct8x8 $(BINDIR)/$(BINSUBDIR)/dct8x8_k2 @@ -294,23 +296,23 @@ cuda_samples-11.0_hw_power: ispass-2009: mkdir -p $(BINDIR)/$(BINSUBDIR) -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/AES - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/BFS -# cd CP; export PARBOIL_ROOT=`pwd`; cd common/src; make $(MAKE_ARGS); cd -; ./parboil compile cp cuda_short; cp benchmarks/cp/build/cuda_short/cp $(BINDIR)/$(BINSUBDIR)/CP -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C DG/3rdParty/ParMetis-3.1 -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C DG - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/LIB - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/LPS -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/MUM - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/NN - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/NQU - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/RAY - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/STO - PID=$$$$ && cp -r cuda/ispass-2009/WP cuda/ispass-2009/WP-$$PID && $(SETENV) make $(MAKE_ARGS) 
noinline=$(noinline) -C cuda/ispass-2009/WP-$$PID && rm -rf cuda/ispass-2009/WP-$$PID +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/AES + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/BFS +# cd CP; export PARBOIL_ROOT=`pwd`; cd common/src; $(MAKE) $(MAKE_ARGS); cd -; ./parboil compile cp cuda_short; cp benchmarks/cp/build/cuda_short/cp $(BINDIR)/$(BINSUBDIR)/CP +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C DG/3rdParty/ParMetis-3.1 +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C DG + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/LIB + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/LPS +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/MUM + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/NN + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/NQU + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/RAY + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/STO + PID=$$$$ && cp -r cuda/ispass-2009/WP cuda/ispass-2009/WP-$$PID && $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/ispass-2009/WP-$$PID && rm -rf cuda/ispass-2009/WP-$$PID lonestargpu-2.0: mkdir -p $(BINDIR)/$(BINSUBDIR) - $(setenv) make $(make_args) noinline=$(noinline) -C cuda/lonestargpu-2.0 all + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/lonestargpu-2.0 all mv cuda/lonestargpu-2.0/apps/bfs/bfs $(BINDIR)/$(BINSUBDIR)/lonestar-bfs mv cuda/lonestargpu-2.0/apps/bfs/bfs-atomic $(BINDIR)/$(BINSUBDIR)/lonestar-bfs-atomic # mv cuda/lonestargpu-2.0/apps/bfs/bfs-wlc $(BINDIR)/$(BINSUBDIR)/lonestar-bfs-wlc @@ -326,7 +328,7 @@ lonestargpu-2.0: mv cuda/lonestargpu-2.0/apps/sssp/sssp-wln $(BINDIR)/$(BINSUBDIR)/lonestar-sssp-wln parboil: - make data + $(MAKE) data mkdir -p $(BINDIR)/$(BINSUBDIR) $(SETENV) cd cuda/parboil; ./parboil compile cutcp cuda 
$(SETENV) cd cuda/parboil; ./parboil compile bfs cuda @@ -352,7 +354,7 @@ parboil: mv ./cuda/parboil/benchmarks/tpacf/build/cuda_default/tpacf $(BINDIR)/$(BINSUBDIR)/parboil-tpacf parboil_accelwattch_validation: - make data + $(MAKE) data mkdir -p $(BINDIR)/$(BINSUBDIR) $(SETENV) cd cuda/parboil; ./parboil compile mri-q cuda $(SETENV) cd cuda/parboil; ./parboil compile sad cuda @@ -362,7 +364,7 @@ parboil_accelwattch_validation: mv ./cuda/parboil/benchmarks/sgemm/build/cuda_default/sgemm $(BINDIR)/$(BINSUBDIR)/parboil-sgemm parboil_hw_power: - make data + $(MAKE) data mkdir -p $(BINDIR)/$(BINSUBDIR) $(SETENV) cd cuda/parboil; ./parboil compile mri-q cuda_k1 $(SETENV) cd cuda/parboil; ./parboil compile sad cuda_k1 @@ -392,7 +394,7 @@ polybench: shoc: mkdir -p $(BINDIR)/$(BINSUBDIR)/ - cd cuda/shoc-master/; ./configure; $(SETENV) make $(MAKE_ARGS); $(SETENV) make $(MAKE_ARGS) -C src/cuda + cd cuda/shoc-master/; ./configure; $(SETENV) $(MAKE) $(MAKE_ARGS); $(SETENV) $(MAKE) $(MAKE_ARGS) -C src/cuda mv cuda/shoc-master/src/cuda/level0/BusSpeedDownload $(BINDIR)/$(BINSUBDIR)/shoc-BusSpeedDownload mv cuda/shoc-master/src/cuda/level0/BusSpeedReadback $(BINDIR)/$(BINSUBDIR)/shoc-BusSpeedReadback mv cuda/shoc-master/src/cuda/level0/DeviceMemory $(BINDIR)/$(BINSUBDIR)/shoc-DeviceMemory @@ -414,25 +416,25 @@ shoc: mv cuda/shoc-master/src/stability/Stability $(BINDIR)/$(BINSUBDIR)/shoc-Stability custom_apps: -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/custom-apps/shoc-modified-spmv/ -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/custom-apps/rodinia-kmn-no-tex/ - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/custom-apps/sdk-matrixMul-modified/ +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/custom-apps/shoc-modified-spmv/ +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/custom-apps/rodinia-kmn-no-tex/ + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/custom-apps/sdk-matrixMul-modified/ 
accelwattch_ubench: - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/accelwattch-ubench/ power + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/accelwattch-ubench/ power clean_accelwattch_ubench: - $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/accelwattch-ubench/ cleanpower + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/accelwattch-ubench/ cleanpower # mnist_cudnn: -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/cudnn/mnist +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/cudnn/mnist # cp cuda/cudnn/mnist/mnistCUDNN $(BINDIR)/$(BINSUBDIR)/ cutlass: mkdir -p $(BINDIR)/$(BINSUBDIR)/ echo $(BINDIR) git submodule init && git submodule update - $(SETENV) mkdir -p cuda/cutlass-bench/build && cd cuda/cutlass-bench/build && cmake .. -DUSE_GPGPUSIM=1 -DCUTLASS_NVCC_ARCHS=80 -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER=$(CUDA_INSTALL_PATH)/bin/nvcc && make cutlass_profiler -j12 && make cutlass_examples -j12 + $(SETENV) mkdir -p cuda/cutlass-bench/build && cd cuda/cutlass-bench/build && cmake .. -DUSE_GPGPUSIM=1 -DCUTLASS_NVCC_ARCHS=80 -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER=$(CUDA_INSTALL_PATH)/bin/nvcc && $(MAKE) cutlass_profiler && $(MAKE) cutlass_examples echo $(BINDIR) cp cuda/cutlass-bench/build/tools/profiler/cutlass_profiler $(BINDIR)/$(BINSUBDIR)/cutlass_profiler find cuda/cutlass-bench/build/examples -type f -executable -exec sh -c 'cp "$$1" "$(BINDIR)/$(BINSUBDIR)/cutlass_$$(basename "$$1")"' sh {} \; @@ -442,7 +444,7 @@ cutlass: cutlass_examples_turing: mkdir -p $(BINDIR)/$(BINSUBDIR)/ git submodule init && git submodule update - $(SETENV) mkdir -p cuda/cutlass-bench/build && cd cuda/cutlass-bench/build && cmake .. -DUSE_GPGPUSIM=1 -DCUTLASS_NVCC_ARCHS=75 && cd ./examples/09_turing_tensorop_conv2dfprop && make 09_turing_tensorop_conv2dfprop + $(SETENV) mkdir -p cuda/cutlass-bench/build && cd cuda/cutlass-bench/build && cmake .. 
-DUSE_GPGPUSIM=1 -DCUTLASS_NVCC_ARCHS=75 && cd ./examples/09_turing_tensorop_conv2dfprop && $(MAKE) 09_turing_tensorop_conv2dfprop echo $(BINDIR) cp cuda/cutlass-bench/build/examples/09_turing_tensorop_conv2dfprop/09_turing_tensorop_conv2dfprop $(BINDIR)/$(BINSUBDIR)/turing_tensorop_conv2dfrop @@ -455,7 +457,7 @@ heterosync: git clone https://github.com/mattsinc/heterosync.git; \ fi && \ cd heterosync && git checkout 934a317 - $(SETENV) make $(MAKE_ARGS) CUDA_DIR=$(CUDA_INSTALL_PATH) -C cuda/heterosync/cuda/syncPrims/uvm/ + $(SETENV) $(MAKE) $(MAKE_ARGS) CUDA_DIR=$(CUDA_INSTALL_PATH) -C cuda/heterosync/cuda/syncPrims/uvm/ mv cuda/heterosync/cuda/syncPrims/uvm/allSyncPrims-1kernel $(BINDIR)/$(BINSUBDIR)/ # Only SSD,BERT,RESNET,3DUNET inference for now @@ -501,7 +503,7 @@ mlperf_training: cuda_samples: mkdir -p $(BINDIR)/$(BINSUBDIR)/ - mkdir -p ./cuda/cuda-samples/build && cd ./cuda/cuda-samples/build && cmake .. && make -j8 + mkdir -p ./cuda/cuda-samples/build && cd ./cuda/cuda-samples/build && cmake .. 
&& $(MAKE) find $(GPUAPPS_ROOT)/src/cuda/cuda-samples/build/Samples -type f -executable -exec mv {} "$(BINDIR)/$(BINSUBDIR)/" \; ; pytorch_examples: @@ -527,15 +529,15 @@ clean_cutlass: rm -rf cuda/cutlass-bench/build # clean_deeplearning: -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C cuda/cudnn/mnist clean +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/cudnn/mnist clean clean_custom_apps: -# make clean -C cuda/custom-apps/shoc-modified-spmv/ - make clean -C cuda/custom-apps/rodinia-kmn-no-tex/ - make clean -C cuda/custom-apps/sdk-matrixMul-modified/ +# $(MAKE) clean -C cuda/custom-apps/shoc-modified-spmv/ + $(MAKE) clean -C cuda/custom-apps/rodinia-kmn-no-tex/ + $(MAKE) clean -C cuda/custom-apps/sdk-matrixMul-modified/ clean_shoc: - cd cuda/shoc-master/; make clean; make distclean + cd cuda/shoc-master/; $(MAKE) clean; $(MAKE) distclean clean_parboil: $(SETENV) cd cuda/parboil; ./parboil clean cutcp cuda @@ -556,95 +558,95 @@ clean_parboil_hw_power: $(SETENV) cd cuda/parboil; ./parboil clean sgemm cuda_k1 clean_lonestargpu-2.0: - $(setenv) make $(make_args) noinline=$(noinline) -C cuda/lonestargpu-2.0 clean + $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/lonestargpu-2.0 clean clean_ispass-2009: -# $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/AES - $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/BFS -# cd CP; export PARBOIL_ROOT=`pwd`; cd common/src; make $(MAKE_ARGS); cd -; ./parboil compile cp cuda_short; cp benchmarks/cp/build/cuda_short/cp $(BINDIR)/$(BINSUBDIR)/CP -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C DG/3rdParty/ParMetis-3.1 -# $(SETENV) make $(MAKE_ARGS) noinline=$(noinline) -C DG - $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/LIB - $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/LPS -# $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/MUM - $(SETENV) make 
$(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/NN - $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/NQU - $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/RAY - $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/STO - $(SETENV) make $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/WP +# $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/AES + $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/BFS +# cd CP; export PARBOIL_ROOT=`pwd`; cd common/src; $(MAKE) $(MAKE_ARGS); cd -; ./parboil compile cp cuda_short; cp benchmarks/cp/build/cuda_short/cp $(BINDIR)/$(BINSUBDIR)/CP +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C DG/3rdParty/ParMetis-3.1 +# $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C DG + $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/LIB + $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/LPS +# $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/MUM + $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/NN + $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/NQU + $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/RAY + $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/STO + $(SETENV) $(MAKE) $(MAKE_ARGS) clean noinline=$(noinline) -C cuda/ispass-2009/WP clean_rodinia-3.1: - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop -f Makefile_nvidia - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/bfs -f Makefile_nvidia - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/cfd -f Makefile_nvidia - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot -f Makefile_nvidia -# 
$(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans -f Makefile_nvidia - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/nw -f Makefile_nvidia - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/streamcluster -f Makefile_nvidia -# $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/mummergpu - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/dwt2d/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/heartwall/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/huffman/ -# $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hybridsort/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/myocyte/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/nn/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/particlefilter/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/particlefilter/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/lavaMD/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/lud/cuda/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/leukocyte/CUDA/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot3D/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/gaussian - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop -f 
Makefile_nvidia + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/bfs -f Makefile_nvidia + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/cfd -f Makefile_nvidia + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot -f Makefile_nvidia +# $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans -f Makefile_nvidia + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/nw -f Makefile_nvidia + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/streamcluster -f Makefile_nvidia +# $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/mummergpu + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/dwt2d/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/heartwall/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/huffman/ +# $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hybridsort/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/myocyte/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/nn/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/particlefilter/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/particlefilter/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/lavaMD/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/lud/cuda/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C 
cuda/rodinia/3.1/cuda/leukocyte/CUDA/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot3D/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/gaussian + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/ clean_rodinia-3.1_hw_power: - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop_k1 -f Makefile_nvidia - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop_k2 -f Makefile_nvidia - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot_k1 -f Makefile_nvidia -# $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans_k1 -f Makefile_nvidia - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree_k1/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree_k2/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder_k1/ - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/srad/srad_v1_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop_k1 -f Makefile_nvidia + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/backprop_k2 -f Makefile_nvidia + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/hotspot_k1 -f Makefile_nvidia +# $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/kmeans_k1 -f Makefile_nvidia + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree_k1/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/b+tree_k2/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/rodinia/3.1/cuda/pathfinder_k1/ + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C 
cuda/rodinia/3.1/cuda/srad/srad_v1_k1 clean_cuda_samples-11.0: - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/histogram - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/SobolQRNG - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/cudaTensorCoreGemm + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/histogram + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/SobolQRNG + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/cudaTensorCoreGemm clean_rodinia_2.0-ft: - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/backprop - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/bfs - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/heartwall - $(SETENV) make 
$(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/hotspot -# $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/kmeans - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/lud - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/nn - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/nw - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/pathfinder - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/srad - $(SETENV) make $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/streamcluster + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/backprop + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/bfs + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/heartwall + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/hotspot +# $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/kmeans + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/lud + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/nn + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/nw + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/pathfinder + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/srad + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/rodinia/2.0-ft/streamcluster clean_cuda_samples_hw_power: - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions_k1 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k1 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k2 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform_k1 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform_k2 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/histogram_k1 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C 
cuda/NVIDIA_CUDA-11.0_Samples/mergeSort_k1 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort_k2 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator_k1 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator_k2 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/SobolQRNG_k1 - $(SETENV) make clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/cudaTensorCoreGemm_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/binomialOptions_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/dct8x8_k2 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/fastWalshTransform_k2 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/histogram_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/mergeSort_k2 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/quasirandomGenerator_k2 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/SobolQRNG_k1 + $(SETENV) $(MAKE) clean $(MAKE_ARGS) noinline=$(noinline) -C cuda/NVIDIA_CUDA-11.0_Samples/cudaTensorCoreGemm_k1 clean_dragon-naive: $(SETENV) rm -f /cuda/dragon_li/bin @@ -653,28 +655,28 @@ 
clean_dragon-cdp: $(SETENV) rm -f /cuda/dragon_li/cdp_bin clean_pannotia: - $(SETENV) make $(MAKE_ARGS) clean -C cuda/pannotia/bc - $(SETENV) export VARIANT="MAX"; make $(MAKE_ARGS) clean -C cuda/pannotia/color - $(SETENV) export VARIANT="MAXMIN"; make $(MAKE_ARGS) clean -C cuda/pannotia/color - $(SETENV) export VARIANT="DEFAULT"; make $(MAKE_ARGS) clean -C cuda/pannotia/fw - $(SETENV) export VARIANT="BLOCK"; make $(MAKE_ARGS) clean -C cuda/pannotia/fw - $(SETENV) make $(MAKE_ARGS) -C cuda/pannotia/mis - $(SETENV) export VARIANT="DEFAULT"; make $(MAKE_ARGS) clean -C cuda/pannotia/pagerank - $(SETENV) export VARIANT="SPMV"; make $(MAKE_ARGS) clean -C cuda/pannotia/pagerank - $(SETENV) export VARIANT="DEFAULT"; make $(MAKE_ARGS) clean -C cuda/pannotia/sssp - $(SETENV) export VARIANT="ELL"; make $(MAKE_ARGS) clean -C cuda/pannotia/sssp + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/bc + $(SETENV) export VARIANT="MAX"; $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/color + $(SETENV) export VARIANT="MAXMIN"; $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/color + $(SETENV) export VARIANT="DEFAULT"; $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/fw + $(SETENV) export VARIANT="BLOCK"; $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/fw + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/pannotia/mis + $(SETENV) export VARIANT="DEFAULT"; $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/pagerank + $(SETENV) export VARIANT="SPMV"; $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/pagerank + $(SETENV) export VARIANT="DEFAULT"; $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/sssp + $(SETENV) export VARIANT="ELL"; $(MAKE) $(MAKE_ARGS) clean -C cuda/pannotia/sssp clean_proxy-apps: - $(SETENV) make $(MAKE_ARGS) clean -C cuda/proxy-apps-doe/lulesh - $(SETENV) make $(MAKE_ARGS) clean -C cuda/proxy-apps-doe/minife_matvec_ell - $(SETENV) make $(MAKE_ARGS) clean -C cuda/proxy-apps-doe/xsbench + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/proxy-apps-doe/lulesh + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C 
cuda/proxy-apps-doe/minife_matvec_ell + $(SETENV) $(MAKE) $(MAKE_ARGS) clean -C cuda/proxy-apps-doe/xsbench chmod +x cuda/proxy-apps-doe/cns/compile.bash (cd cuda/proxy-apps-doe/cns/ ; ./compile.bash -c) chmod +x cuda/proxy-apps-doe/comd/clean.sh ( cd cuda/proxy-apps-doe/comd ; ./clean.sh ) clean_UVMSmart_test: - cd cuda/UVMSmart_test/; make clean + cd cuda/UVMSmart_test/; $(MAKE) clean clean_mlperf: rm -rf $(BINDIR)/$(BINSUBDIR)/mlperf_inference @@ -693,7 +695,7 @@ clean_pytorch_examples: rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae clean_cuda_samples: - make clean -C ./cuda/cuda-samples/build + $(MAKE) clean -C ./cuda/cuda-samples/build clean_huggingface: rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface \ No newline at end of file diff --git a/src/cuda/GPU_Microbenchmark/Makefile b/src/cuda/GPU_Microbenchmark/Makefile index 384ead0cb..a21e5980e 100644 --- a/src/cuda/GPU_Microbenchmark/Makefile +++ b/src/cuda/GPU_Microbenchmark/Makefile @@ -3,18 +3,14 @@ BIN_DIR := $(BASE_DIR)/bin SUB_DIRS = $(wildcard ubench/*/*/) SUB_DIRS_ALL = $(SUB_DIRS:%=all-%) SUB_DIRS_CLEAN = $(SUB_DIRS:%=clean-%) -SUB_DIRS_TUNER = $(SUB_DIRS:%=tuner-%) all: create_dir $(SUB_DIRS_ALL) -tuner: create_dir $(SUB_DIRS_TUNER) clean: delete_dir $(SUB_DIRS_CLEAN) $(SUB_DIRS_ALL): $(MAKE) $(MAKE_FLAGS) -C $(@:all-%=%) -$(SUB_DIRS_TUNER): - $(MAKE) -C $(@:tuner-%=%) tuner $(SUB_DIRS_CLEAN): $(MAKE) $(MAKE_FLAGS) -C $(@:clean-%=%) clean diff --git a/src/cuda/GPU_Microbenchmark/hw_def/blackwell_B200_hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/blackwell_B200_hw_def.h index 2d62d0b26..73b658d6f 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/blackwell_B200_hw_def.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/blackwell_B200_hw_def.h @@ -8,7 +8,7 @@ #define L1_SIZE (256 * 1024) // Max L1 size in bytes -#define CLK_FREQUENCY 1665 // frequency in MHz +// #define CLK_FREQUENCY 1665 // frequency in MHz #define ISSUE_MODEL issue_model::single // single issue core or dual issue #define CORE_MODEL core_model::subcore // 
subcore model or shared model diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/common.h b/src/cuda/GPU_Microbenchmark/hw_def/common/common.h index b84828646..c04e64e41 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/common/common.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/common/common.h @@ -61,12 +61,12 @@ unsigned get_adjusted_CCD(enum dram_model model) { return dram_model_burst_length[model] / dram_model_freq_ratio[model]; } -unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) { - unsigned channel_width = - dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model]; - assert(total_memory_width % channel_width == 0); - return total_memory_width / channel_width; -} +// unsigned get_num_channels(unsigned total_memory_width, enum dram_model model) { +// unsigned channel_width = +// dram_model_bus_width[model] * dram_model_mem_per_ctrlr[model]; +// assert(total_memory_width % channel_width == 0); +// return total_memory_width / channel_width; +// } // DDR timing struct struct DDR_Timing { diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h index 8780356af..7f2592ae1 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h @@ -3,7 +3,14 @@ #include #include +#include #include +#include +#include +#include +#include +#include + // Holds all GPU configuration parameters struct GpuConfig @@ -32,6 +39,9 @@ struct GpuConfig unsigned THREADS_PER_SM = 2048; // Threads per SM (launch config) unsigned BLOCKS_NUM = 640; // Total blocks launched unsigned TOTAL_THREADS = 163840; // Total threads launched + + unsigned FBP_COUNT = 0; // Frame Buffer Partitions + unsigned L2_BANKS = 0; // L2 Cache Banks (LTCs) }; GpuConfig config; // Parses short flags like --sm 80 into a GpuConfig object @@ -124,7 +134,9 @@ inline void printGpuConfig(const GpuConfig &c = config) << "BLOCKS_PER_SM: " << c.BLOCKS_PER_SM << "\n" 
<< "THREADS_PER_SM: " << c.THREADS_PER_SM << "\n" << "BLOCKS_NUM: " << c.BLOCKS_NUM << "\n" - << "TOTAL_THREADS: " << c.TOTAL_THREADS << "\n"; + << "TOTAL_THREADS: " << c.TOTAL_THREADS << "\n" + << "FBP_COUNT: " << c.FBP_COUNT << "\n" + << "L2_BANKS: " << c.L2_BANKS << "\n"; } // GPU error check @@ -146,14 +158,91 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, cudaDeviceProp deviceProp; -unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[]) +// NVIDIA RM API defines +#define NV_IOCTL_MAGIC 'F' +#define NV_ESC_RM_ALLOC 0x2b +#define NV_ESC_RM_CONTROL 0x2a +#define NV_ESC_RM_FREE 0x29 +#define NV01_ROOT_CLIENT 0x00000041 +#define NV01_DEVICE_0 0x00000080 +#define NV20_SUBDEVICE_0 0x00002080 +#define NV2080_CTRL_CMD_GR_GET_INFO 0x20801201 + +// https://github.com/NVIDIA/open-gpu-kernel-modules/blob/580.95.05/src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080gr.h#L142 +#define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS 0x00000015 +#define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS 0x00000025 + +typedef uint32_t NvHandle; +typedef uint32_t NvV32; +typedef uint64_t NvP64; + +// Query single GR info index using NVIDIA RM API +inline unsigned queryGrInfo(uint32_t info_index) { -#ifdef TUNER + struct NVOS21_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectNew; NvV32 hClass; NvP64 pAllocParms; uint32_t paramsSize, status; }; + struct NVOS54_PARAMETERS { NvHandle hClient, hObject; NvV32 cmd, flags; NvP64 params; uint32_t paramsSize, status; }; + struct NVOS00_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectOld; uint32_t status; }; + struct NV0080_ALLOC_PARAMETERS { uint32_t deviceId; NvHandle hClientShare, hTargetClient, hTargetDevice; NvV32 flags; uint32_t _pad0; uint64_t vaSpaceSize, vaStartInternal, vaLimitInternal; NvV32 vaMode; uint32_t _pad1; }; + struct NV2080_ALLOC_PARAMETERS { uint32_t subDeviceId; }; + struct NVXXXX_CTRL_XXX_INFO { uint32_t index, data; }; + struct NV0080_CTRL_GR_ROUTE_INFO { uint32_t flags, _pad; uint64_t 
route; }; + struct NV2080_CTRL_GR_GET_INFO_PARAMS { uint32_t grInfoListSize, _pad; NvP64 grInfoList; NV0080_CTRL_GR_ROUTE_INFO grRouteInfo; }; + + int ctl_fd = open("/dev/nvidiactl", O_RDWR); + if (ctl_fd < 0) { + fprintf(stderr, "DEBUG GR: Failed to open /dev/nvidiactl (errno=%d)\n", errno); + return 0; + } + + auto rm_alloc = [&](NvHandle hClient, NvHandle hParent, NvHandle hObject, uint32_t hClass, void *pParams, uint32_t size) { + NVOS21_PARAMETERS p = {hClient, hParent, hObject, hClass, (NvP64)(uintptr_t)pParams, size, 0}; + bool success = ioctl(ctl_fd, _IOWR(NV_IOCTL_MAGIC, NV_ESC_RM_ALLOC, NVOS21_PARAMETERS), &p) >= 0 && p.status == 0; + if (!success) fprintf(stderr, "DEBUG GR: rm_alloc failed for class 0x%x, status=0x%x\n", hClass, p.status); + return success; + }; + auto rm_control = [&](NvHandle hClient, NvHandle hObject, uint32_t cmd, void *pParams, uint32_t size) { + NVOS54_PARAMETERS p = {hClient, hObject, cmd, 0, (NvP64)(uintptr_t)pParams, size, 0}; + bool success = ioctl(ctl_fd, _IOWR(NV_IOCTL_MAGIC, NV_ESC_RM_CONTROL, NVOS54_PARAMETERS), &p) >= 0 && p.status == 0; + if (!success) fprintf(stderr, "DEBUG GR: rm_control failed for cmd 0x%x, status=0x%x\n", cmd, p.status); + return success; + }; + auto rm_free = [&](NvHandle hClient, NvHandle hParent, NvHandle hObject) { + NVOS00_PARAMETERS p = {hClient, hParent, hObject, 0}; + ioctl(ctl_fd, _IOWR(NV_IOCTL_MAGIC, NV_ESC_RM_FREE, NVOS00_PARAMETERS), &p); + }; + + NvHandle hClient = 0xCAFE0001, hDevice = 0xCAFE0002, hSubDevice = 0xCAFE0003; + NV0080_ALLOC_PARAMETERS devParams = {0}; + NV2080_ALLOC_PARAMETERS subdevParams = {0}; + NVXXXX_CTRL_XXX_INFO infoList[1] = {{info_index, 0}}; + NV2080_CTRL_GR_GET_INFO_PARAMS grParams = {1, 0, (NvP64)(uintptr_t)infoList, {0, 0, 0}}; -#pragma message("TUNER") + unsigned result = 0; + if (rm_alloc(hClient, hClient, hClient, NV01_ROOT_CLIENT, NULL, 0) && + rm_alloc(hClient, hClient, hDevice, NV01_DEVICE_0, &devParams, sizeof(devParams)) && + rm_alloc(hClient, hDevice, 
hSubDevice, NV20_SUBDEVICE_0, &subdevParams, sizeof(subdevParams)) && + rm_control(hClient, hSubDevice, NV2080_CTRL_CMD_GR_GET_INFO, &grParams, sizeof(grParams))) { + result = infoList[0].data; + fprintf(stderr, "DEBUG GR: Successfully queried index 0x%x = %u\n", info_index, result); + } else { + fprintf(stderr, "DEBUG GR: Query sequence failed for index 0x%x\n", info_index); + } + + rm_free(hClient, hDevice, hSubDevice); + rm_free(hClient, hClient, hDevice); + rm_free(hClient, hClient, hClient); + close(ctl_fd); + return result; +} + +unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[]) +{ cudaSetDevice(deviceID); cudaGetDeviceProperties(&deviceProp, deviceID); + int clockRateKHz; + cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID); + // core stats config.SM_NUMBER = deviceProp.multiProcessorCount; @@ -184,11 +273,12 @@ unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[]) config.MEM_SIZE = deviceProp.totalGlobalMem; config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f; config.MEM_BITWIDTH = deviceProp.memoryBusWidth; -#else - parseGpuConfigArgs(argc, argv); + config.CLK_FREQUENCY = clockRateKHz * 1e-3f; -#endif + config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS); + config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS); + parseGpuConfigArgs(argc, argv); printGpuConfig(); return 1; diff --git a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu index 0346bd2de..84bd04110 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw/atomic_add_bw.cu @@ -3,20 +3,9 @@ #include #include #include +#include -#ifdef TUNER -#pragma message("TUNER") #include "../../../hw_def/hw_def.h" -#define REPEAT_TIMES 2048 -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define 
THREADS_PER_BLOCK 1 -// #define THREADS_PER_SM 1 -// #define BLOCKS_NUM 1 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -// #define WARP_SIZE 32 -#define REPEAT_TIMES 16 -#endif // #define THREADS_PER_BLOCK 1024 // #define THREADS_PER_SM 2048 @@ -28,7 +17,7 @@ template __global__ void atomic_bw(uint64_t *startClk, uint64_t *stopClk, T *data1, - T *res) + T *res, uint32_t repeat_times) { int gid = blockIdx.x * blockDim.x + threadIdx.x; // register T s1 = data1[gid]; @@ -43,7 +32,7 @@ __global__ void atomic_bw(uint64_t *startClk, uint64_t *stopClk, T *data1, // start timing uint64_t start = clock64(); - for (uint32_t i = 0; i < REPEAT_TIMES; i++) + for (uint32_t i = 0; i < repeat_times; i++) { sum = sum + atomicAdd(&data1[(i * warpSize) + gid], 10); } @@ -64,7 +53,17 @@ int main(int argc, char *argv[]) intilizeDeviceProp(0, argc, argv); - unsigned ARRAY_SIZE = config.TOTAL_THREADS + (REPEAT_TIMES * config.WARP_SIZE); + // Parse command line arguments for --fast flag + uint32_t repeat_times = 2048; // default + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--fast") == 0) { + std::cout << "Fast mode enabled: reducing repeat_times to 16" << std::endl; + repeat_times = 16; + break; + } + } + + unsigned ARRAY_SIZE = config.TOTAL_THREADS + (repeat_times * config.WARP_SIZE); uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); @@ -90,7 +89,7 @@ int main(int argc, char *argv[]) cudaMemcpyHostToDevice)); atomic_bw<<>>(startClk_g, stopClk_g, - data1_g, res_g); + data1_g, res_g, repeat_times); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaMemcpy(startClk, startClk_g, config.TOTAL_THREADS * sizeof(uint32_t), @@ -106,7 +105,7 @@ int main(int argc, char *argv[]) *std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]); // uint64_t total_time = stopClk[0]-startClk[0]; - bw = (((float)REPEAT_TIMES * (float)config.TOTAL_THREADS * 4 * 8) / + bw = 
(((float)repeat_times * (float)config.TOTAL_THREADS * 4 * 8) / (float)(total_time)); printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw); printf("Total Clk number = %ld \n", total_time); diff --git a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu index bb2ee4f56..88a056e30 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_bw_conflict/atomic_add_bw_conflict.cu @@ -3,39 +3,13 @@ #include #include #include - -#ifdef TUNER +#include #include "../../../hw_def/hw_def.h" -#define REPEAT_TIMES 2048 -#else -#define REPEAT_TIMES 16 -#include "../../../hw_def/common/gpuConfig.h" -// #define THREADS_PER_BLOCK 1024 -// #define THREADS_PER_SM 2048 -// #define BLOCKS_NUM 160 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -// #define WARP_SIZE 32 - -// #define ARRAY_SIZE TOTAL_THREADS - -// #define gpuErrchk(ans) \ -// { gpuAssert((ans), __FILE__, __LINE__); } -// inline void gpuAssert(cudaError_t code, const char *file, int line, -// bool abort = true) { -// if (code != cudaSuccess) { -// fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, -// line); -// if (abort) -// exit(code); -// } -// } - -#endif template __global__ void atomic_bw(uint32_t *startClk, uint32_t *stopClk, T *data1, - T *res) + T *res, uint32_t repeat_times) { int gid = blockIdx.x * blockDim.x + threadIdx.x; uint32_t sum; @@ -46,7 +20,7 @@ __global__ void atomic_bw(uint32_t *startClk, uint32_t *stopClk, T *data1, uint32_t start = 0; asm volatile("mov.u32 %0, %%clock;" : "=r"(start)::"memory"); - for (int j = 0; j < REPEAT_TIMES; ++j) + for (int j = 0; j < repeat_times; ++j) { sum = sum + atomicAdd(&data1[0], 10); } @@ -67,6 +41,17 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); + config.BLOCKS_NUM = 
config.SM_NUMBER * 2; + config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; + + // Parse command line arguments for --fast flag + uint32_t repeat_times = 2048; // default + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--fast") == 0) { + repeat_times = 16; + break; + } + } uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); @@ -92,7 +77,7 @@ int main(int argc, char *argv[]) cudaMemcpyHostToDevice)); atomic_bw<<>>(startClk_g, stopClk_g, - data1_g, res_g); + data1_g, res_g, repeat_times); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaMemcpy(startClk, startClk_g, config.TOTAL_THREADS * sizeof(uint32_t), @@ -107,7 +92,7 @@ int main(int argc, char *argv[]) *std::max_element(&stopClk[0], &stopClk[config.TOTAL_THREADS]) - *std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]); // uint32_t total_time = stopClk[0] - startClk[0]; - bw = ((float)(REPEAT_TIMES * config.TOTAL_THREADS * 4) / (float)(total_time)); + bw = ((float)(repeat_times * config.TOTAL_THREADS * 4) / (float)(total_time)); printf("Atomic int32 bandwidth = %f (byte/clk)\n", bw); printf("Total Clk number = %u \n", total_time); diff --git a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu index a6e18b554..280079531 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/atomics/Atomic_add_lat/atomic_add_lat.cu @@ -2,18 +2,9 @@ #include #include -#define REPEAT_TIMES 4096 -#ifdef TUNER #include "../../../hw_def/hw_def.h" -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define THREADS_PER_BLOCK 1 -// #define THREADS_PER_SM 1 -// #define BLOCKS_NUM 1 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -// #define WARP_SIZE 32 -#endif +#define REPEAT_TIMES 4096 template 
__global__ void atmoic_latency(uint32_t *startClk, uint32_t *stopClk, T *data1, @@ -52,14 +43,12 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); -#ifdef TUNER + config.THREADS_PER_BLOCK = 1; config.THREADS_PER_SM = 1; config.BLOCKS_NUM = 1; config.TOTAL_THREADS = 1; -#endif - uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); int32_t *data1 = (int32_t *)malloc(REPEAT_TIMES * sizeof(int32_t)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h index 407aa27b9..ebd3b5fa2 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_double/MaxFlops_double.h @@ -49,8 +49,6 @@ __global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1, } float dpu_max_flops() { - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h index a3f156d84..c5cec50e6 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_float/MaxFlops_float.h @@ -47,11 +47,6 @@ __global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1, } int fpu_max_flops() { - - - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); 
float *data1 = (float *)malloc(config.TOTAL_THREADS * sizeof(float)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h index 50201bf8f..51f3ba760 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxFlops_half/MaxFlops_half.h @@ -45,11 +45,6 @@ __global__ void fpu16_max_flops(uint32_t *startClk, uint32_t *stopClk, } float fpu16_max_flops() { - - - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); half *data1 = (half *)malloc(config.TOTAL_THREADS * sizeof(half)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/MaxIops_int32/MaxFlops_int32.h b/src/cuda/GPU_Microbenchmark/ubench/core/MaxIops_int32/MaxFlops_int32.h index b26fd266b..245a99a13 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/MaxIops_int32/MaxFlops_int32.h +++ b/src/cuda/GPU_Microbenchmark/ubench/core/MaxIops_int32/MaxFlops_int32.h @@ -6,18 +6,7 @@ #include #define REPEAT_TIMES 1024 -#ifdef TUNER #include "../../../hw_def/hw_def.h" -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define THREADS_PER_BLOCK 1 -// #define THREADS_PER_SM 1 -// #define BLOCKS_NUM 1 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -// #define WARP_SIZE 32 - - -#endif template @@ -60,12 +49,6 @@ __global__ void max_flops(uint32_t *startClk, uint32_t *stopClk, T *data1, float max_int32_flops(int argc, char* argv[]) { intilizeDeviceProp(0,argc,argv); - #ifdef TUNER - - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - - #endif uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * 
sizeof(uint32_t)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h b/src/cuda/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h index f2b572319..f39994fc5 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h +++ b/src/cuda/GPU_Microbenchmark/ubench/core/sfu_bw_fsqrt/sfu_bw_fsqrt.h @@ -47,11 +47,6 @@ __global__ void max_flops(uint64_t *startClk, uint64_t *stopClk, float *data1, } float sfu_max_flops() { - - - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); float *data1 = (float *)malloc(config.TOTAL_THREADS * sizeof(float)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu index 5034a3cd4..8a6e9231a 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_128/l1_bw_128.cu @@ -12,22 +12,9 @@ #include #define REPEAT_TIMES 4096 -#ifdef TUNER #include "../../../hw_def/hw_def.h" // array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE (L1_SIZE / 8) -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define THREADS_PER_BLOCK 1024 -// #define THREADS_PER_SM 1024 -// #define BLOCKS_NUM 1 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -// #define WARP_SIZE 32 -#define CLK_FREQUENCY 1410 // Asumme A100 freq -#define ARRAY_SIZE 16384 // ARRAY_SIZE has to be less than L1_SIZE -#define L1_SIZE 32768 // L1 size in 32-bit. Volta L1 size is 128KB, i.e. 
32K of 32-bit - -#endif +#define ARRAY_SIZE 16384 __global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink, float *posArray) @@ -101,15 +88,9 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); -#ifdef TUNER - - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - config.THREADS_PER_SM = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // ARRAY_SIZE has to be less than L1_SIZE assert(ARRAY_SIZE * sizeof(float) < L1_SIZE); -#endif uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); @@ -145,7 +126,7 @@ int main(int argc, char *argv[]) *std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]); bw = (double)(REPEAT_TIMES * config.THREADS_PER_SM * sizeof(float) * 4) / ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu index c03995f84..1ec50c6b8 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f/l1_bw_32f.cu @@ -13,22 +13,9 @@ #include #define REPEAT_TIMES 4096 -#ifdef TUNER #include "../../../hw_def/hw_def.h" // array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE (L1_SIZE / 8) -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define THREADS_PER_BLOCK 1024 -// #define THREADS_PER_SM 1024 -// #define BLOCKS_NUM 1 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -// #define WARP_SIZE 32 #define ARRAY_SIZE 16384 // 
ARRAY_SIZE has to be less than L1_SIZE -#define L1_SIZE 32768 // L1 size in 32-bit. Volta L1 size is 128KB, i.e. 32K of 32-bit -#define CLK_FREQUENCY 1410 // Asumme A100 freq - -#endif __global__ void l1_bw(uint64_t *__restrict__ startClk, uint64_t *__restrict__ stopClk, float *__restrict__ dsink, @@ -104,15 +91,9 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); -#ifdef TUNER - - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - config.THREADS_PER_SM = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; assert(ARRAY_SIZE * sizeof(float) < L1_SIZE); // ARRAY_SIZE has to be less than L1_SIZE -#endif uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); @@ -151,7 +132,7 @@ int main(int argc, char *argv[]) *std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]); // total_time = stopClk[0]-startClk[0]; bw = (float)(REPEAT_TIMES * config.THREADS_PER_SM * 4 * 4) / ((float)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu index 708e90ec1..cde7f3c42 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll/l1_bw_32f_unroll.cu @@ -10,23 +10,9 @@ #include #define REPEAT_TIMES 4096 -#ifdef TUNER #include "../../../hw_def/hw_def.h" // array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE (L1_SIZE / 8) -#else -#include 
"../../../hw_def/common/gpuConfig.h" - -// #define THREADS_PER_BLOCK 1024 -// #define THREADS_PER_SM 1024 -// #define BLOCKS_NUM 1 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -// #define WARP_SIZE 32 -#define ARRAY_SIZE 16384 // ARRAY_SIZE has to be less than L1_SIZE -#define L1_SIZE 32768 // L1 size in 64-bit. Volta L1 size is 128KB, i.e. 16K of 64-bit -#define CLK_FREQUENCY 1410 // Asumme A100 freq - -#endif +#define ARRAY_SIZE 16384 __global__ void l1_bw(uint32_t *startClk, uint32_t *stopClk, float *dsink, float *posArray) @@ -95,15 +81,9 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); -#ifdef TUNER - - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - config.THREADS_PER_SM = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; assert(ARRAY_SIZE * sizeof(float) < L1_SIZE); // ARRAY_SIZE has to be less than L1_SIZE -#endif uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); float *posArray = (float *)malloc(ARRAY_SIZE * sizeof(float)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll_large/l1_bw_32f_unroll_large.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll_large/l1_bw_32f_unroll_large.cu index 8463f21f1..d5dcdfcd0 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll_large/l1_bw_32f_unroll_large.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_32f_unroll_large/l1_bw_32f_unroll_large.cu @@ -9,23 +9,10 @@ #include #include -#define THREADS_PER_BLOCK 1024 -#define THREADS_PER_SM 1024 -#define BLOCKS_NUM 1 -#define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -#define WARP_SIZE 32 #define REPEAT_TIMES 4096 -#define ARRAY_SIZE (THREADS_PER_SM+REPEAT_TIMES*WARP_SIZE) //ARRAY_SIZE has to be less than L1_SIZE -#define L1_SIZE 32768 //L1 size in 32-bit. Volta L1 size is 128KB, i.e. 
32K of 32-bit - -// GPU error check -#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); } -inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true){ - if (code != cudaSuccess) { - fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); - if (abort) exit(code); - } -} +#include "../../../hw_def/hw_def.h" +// array size is half the L1 size (2) * float size (4) +#define ARRAY_SIZE 32768 __global__ void l1_bw(uint32_t *startClk, uint32_t *stopClk, float *dsink, float *posArray){ @@ -37,7 +24,7 @@ __global__ void l1_bw(uint32_t *startClk, uint32_t *stopClk, float *dsink, float float sink = 0; // warp up L1 cache - for (uint32_t i = tid; i>>(startClk_g, stopClk_g, dsink_g, posArray_g); + l1_bw<<>>(startClk_g, stopClk_g, dsink_g, posArray_g); gpuErrchk( cudaPeekAtLastError() ); - gpuErrchk( cudaMemcpy(startClk, startClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); - gpuErrchk( cudaMemcpy(stopClk, stopClk_g, TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); - gpuErrchk( cudaMemcpy(dsink, dsink_g, TOTAL_THREADS*sizeof(float), cudaMemcpyDeviceToHost) ); + gpuErrchk( cudaMemcpy(startClk, startClk_g, config.TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); + gpuErrchk( cudaMemcpy(stopClk, stopClk_g, config.TOTAL_THREADS*sizeof(uint32_t), cudaMemcpyDeviceToHost) ); + gpuErrchk( cudaMemcpy(dsink, dsink_g, config.TOTAL_THREADS*sizeof(float), cudaMemcpyDeviceToHost) ); double bw; - bw = (double)(REPEAT_TIMES*THREADS_PER_SM*4)/((double)(stopClk[0]-startClk[0])); + bw = (double)(REPEAT_TIMES*config.THREADS_PER_SM*4)/((double)(stopClk[0]-startClk[0])); printf("L1 bandwidth = %f (byte/clk/SM)\n", bw); printf("Total Clk number = %u \n", stopClk[0]-startClk[0]); diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu index a118c86fe..8f8ff72d3 100644 --- 
a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64f/l1_bw_64f.cu @@ -12,22 +12,9 @@ #include #define REPEAT_TIMES 256 -#ifdef TUNER #include "../../../hw_def/hw_def.h" // array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE (L1_SIZE / 16) -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define THREADS_PER_BLOCK 1024 -// #define THREADS_PER_SM 1024 -// #define BLOCKS_NUM 1 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -// #define WARP_SIZE 32 -#define ARRAY_SIZE 8192 // ARRAY_SIZE has to be less than L1_SIZE -#define L1_SIZE 16384 // L1 size in 64-bit. Volta L1 size is 128KB, i.e. 16K of 64-bit -#define CLK_FREQUENCY 1410 // Asumme A100 freq - -#endif +#define ARRAY_SIZE 8192 __global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, double *dsink, double *posArray) @@ -96,15 +83,10 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); -#ifdef TUNER - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - config.THREADS_PER_SM = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // ARRAY_SIZE has to be less than L1_SIZE assert(ARRAY_SIZE * sizeof(double) < L1_SIZE); -#endif uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); double *posArray = (double *)malloc(ARRAY_SIZE * sizeof(double)); @@ -143,7 +125,7 @@ int main(int argc, char *argv[]) *std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]); bw = (double)(REPEAT_TIMES * config.THREADS_PER_SM * sizeof(double) * 2) / ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; diff --git 
a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu index 45f920971..18ae99d75 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_bw_64v/l1_bw_64v.cu @@ -16,7 +16,7 @@ This benchmark measures the maximum read bandwidth of L1 cache for 64-bit vector #define REPEAT_TIMES 256 // array size is half the L1 size (2) * float size (4) -#define ARRAY_SIZE (L1_SIZE / 8) +#define ARRAY_SIZE 8192 __global__ void l1_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink, float *posArray) @@ -85,11 +85,7 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); - ; - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - config.THREADS_PER_SM = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // ARRAY_SIZE has to be less than L1_SIZE assert(ARRAY_SIZE * sizeof(float) < L1_SIZE); @@ -132,7 +128,7 @@ int main(int argc, char *argv[]) *std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]); bw = (double)(REPEAT_TIMES * config.THREADS_PER_SM * sizeof(float) * 2) / ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "L1 bandwidth = " << bw << "(byte/clk/SM), " << BW << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h index 8667b75b5..487baa460 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_lat/l1_lat.h @@ -8,34 +8,20 @@ #include #include #include +#include #include -#define THREADS_NUM 1 -#ifdef TUNER #include "../../../hw_def/hw_def.h" +#define THREADS_NUM 1 + // Launch only one thread to calcaulte the latency using a 
pointer-chasing // array technique -#define REPEAT_TIMES 32768 // iterate over the array ITERS times -#define ARRAY_SIZE 4096 // size of the array - -#else -#include "../../../hw_def/common/gpuConfig.h" - -// #define THREADS_PER_SM 1024 -// #define BLOCKS_NUM 1 -// #define TOTAL_THREADS (THREADS_NUM*BLOCKS_NUM) -// #define WARP_SIZE 32 -#define REPEAT_TIMES 256 -#define ARRAY_SIZE 8192 // ARRAY_SIZE has to be less than L1_SIZE -#define L1_SIZE 16384 - -#endif - // Measure latency of ITERS reads. __global__ void l1_lat(uint32_t *startClk, uint32_t *stopClk, - uint64_t *posArray, uint64_t *dsink) + uint64_t *posArray, uint64_t *dsink, + uint32_t repeat_times, uint32_t array_size) { // thread index @@ -44,10 +30,10 @@ __global__ void l1_lat(uint32_t *startClk, uint32_t *stopClk, // one thread to initialize the pointer-chasing array if (tid == 0) { - for (uint32_t i = 0; i < (ARRAY_SIZE - 1); i++) + for (uint32_t i = 0; i < (array_size - 1); i++) posArray[i] = (uint64_t)(posArray + i + 1); - posArray[ARRAY_SIZE - 1] = (uint64_t)posArray; + posArray[array_size - 1] = (uint64_t)posArray; } if (tid < THREADS_NUM) @@ -74,7 +60,7 @@ __global__ void l1_lat(uint32_t *startClk, uint32_t *stopClk, // pointer-chasing ITERS times // use ca modifier to cache the load in L1 - for (uint32_t i = 0; i < REPEAT_TIMES; ++i) + for (uint32_t i = 0; i < repeat_times; ++i) { asm volatile("{\t\n" "ld.global.ca.u64 %0, [%1];\n\t" @@ -100,12 +86,22 @@ float l1_lat(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); -#ifdef TUNER config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = THREADS_NUM * config.BLOCKS_NUM; - config.THREADS_PER_SM = THREADS_NUM * config.BLOCKS_NUM; - assert(ARRAY_SIZE * sizeof(uint64_t) < L1_SIZE); -#endif + config.TOTAL_THREADS = 1; + config.THREADS_PER_SM = 1; + + // Parse command line arguments for --fast flag + uint32_t repeat_times = 32768; // default + uint32_t array_size = 4096; // default + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--fast") == 
0) { + repeat_times = 256; + array_size = 8192; + break; + } + } + + assert(array_size * sizeof(uint64_t) < L1_SIZE); uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); @@ -118,10 +114,10 @@ float l1_lat(int argc, char *argv[]) gpuErrchk(cudaMalloc(&startClk_g, THREADS_NUM * sizeof(uint32_t))); gpuErrchk(cudaMalloc(&stopClk_g, THREADS_NUM * sizeof(uint32_t))); - gpuErrchk(cudaMalloc(&posArray_g, ARRAY_SIZE * sizeof(uint64_t))); + gpuErrchk(cudaMalloc(&posArray_g, array_size * sizeof(uint64_t))); gpuErrchk(cudaMalloc(&dsink_g, THREADS_NUM * sizeof(uint64_t))); - l1_lat<<>>(startClk_g, stopClk_g, posArray_g, dsink_g); + l1_lat<<>>(startClk_g, stopClk_g, posArray_g, dsink_g, repeat_times, array_size); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaMemcpy(startClk, startClk_g, THREADS_NUM * sizeof(uint32_t), @@ -131,7 +127,7 @@ float l1_lat(int argc, char *argv[]) gpuErrchk(cudaMemcpy(dsink, dsink_g, THREADS_NUM * sizeof(uint64_t), cudaMemcpyDeviceToHost)); - float lat = (float)(stopClk[0] - startClk[0]) / REPEAT_TIMES; + float lat = (float)(stopClk[0] - startClk[0]) / repeat_times; printf("L1 Latency = %12.4f cycles\n", lat); printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu index 7efa14253..80c3c3429 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_mshr/l1_mshr.cu @@ -124,7 +124,7 @@ void l1_structure(uint32_t stride, uint64_t array_size, int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); - ; + config.BLOCKS_NUM = 1; config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; diff --git a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu 
b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu index 01df4a0d9..caab02542 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l1_cache/l1_shared_bw/l1_shared_bw.cu @@ -8,30 +8,12 @@ #define ITERS 4096 -#ifdef TUNER #include "../../../hw_def/hw_def.h" // array size is half the L1 size (2) * float size (4) #define ARRAY_SIZE (L1_SIZE / 8) // 32 KB of shd memory #define SHARED_MEM_SIZE (32 * 1024 / 4) -#else - -#include "../../../hw_def/common/gpuConfig.h" - -#define L1_SIZE_BYTE (128 * 1024) -#define L1_SIZE (L1_SIZE_BYTE / 4) -#define ARRAY_SIZE (L1_SIZE / 2) -#define SHARED_MEM_SIZE_BYTE (48 * 1024) // size in bytes, max 96KB for v100 -#define SHARED_MEM_SIZE (SHARED_MEM_SIZE_BYTE / 4) - -// #define BLOCKS_NUM 1 -// #define THREADS_PER_BLOCK 1024 -// #define WARP_SIZE 32 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) - -#endif - __global__ void shared_bw(uint32_t *startClk, uint32_t *stopClk, uint32_t *dsink, uint32_t *l1, uint32_t stride) { @@ -90,13 +72,7 @@ int main(int argc, char *argv[]) intilizeDeviceProp(0, argc, argv); -#ifdef TUNER - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - config.THREADS_PER_SM = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - assert(SHARED_MEM_SIZE * sizeof(uint32_t) < config.MAX_SHARED_MEM_SIZE_PER_BLOCK); -#endif uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *dsink = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu index 8315c19e6..2bea4e8db 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu +++ 
b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_128/l2_bw_128.cu @@ -14,21 +14,8 @@ #include #define REPEAT_TIMES 256 -#ifdef TUNER #include "../../../hw_def/hw_def.h" -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define BLOCKS_NUM 80 -// #define THREADS_PER_BLOCK 1024 //thread number/block -// #define TOTAL_THREADS (BLOCKS_NUM * THREADS_PER_BLOCK) -// #define WARP_SIZE 32 -// #define ARRAY_SIZE_CORR (TOTAL_THREADS*4 + REPEAT_TIMES*WARP_SIZE*4) //Array size must not exceed L2 size -// #define L2_SIZE 1572864 //L2 size in 32-bit. Volta L2 size is 6MB. -#define CLK_FREQUENCY 1410 // Asumme A100 freq - -#endif - /* L2 cache is warmed up by loading posArray and adding sink Start timing after warming up @@ -150,15 +137,13 @@ int main(int argc, char *argv[]) std::cout << "Total Clk number = " << total_time << "\n"; bw = (float)(data) / ((float)(stopClk[0] - startClk[0])); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n"; -#ifdef TUNER - float max_bw = get_num_channels(config.MEM_BITWIDTH, DRAM_MODEL) * - L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE; - BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + + float max_bw = config.FBP_COUNT * config.L2_BANKS * L2_BANK_WIDTH_in_BYTE; + BW = max_bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), " << BW << "(GB/s)\n"; std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; -#endif return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu index 0bee6a579..bbfec08c9 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu @@ -12,23 +12,9 @@ #include #include #include 
+#include -#ifdef TUNER #include "../../../hw_def/hw_def.h" -#define REPEAT_TIMES 2048 - -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define BLOCKS_NUM 160 -// #define THREADS_PER_BLOCK 1024 //thread number/block -// #define TOTAL_THREADS (BLOCKS_NUM * THREADS_PER_BLOCK) -// #define WARP_SIZE 32 -#define REPEAT_TIMES 512 -#define CLK_FREQUENCY 1410 // Asumme A100 freq - -// #define L2_SIZE 1572864 //L2 size in 32-bit. Volta L2 size is 6MB. - -#endif /* L2 cache is warmed up by loading posArray and adding sink @@ -39,7 +25,7 @@ Stop timing and store data */ __global__ void l2_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink, - float *posArray, unsigned ARRAY_SIZE) + float *posArray, unsigned ARRAY_SIZE, uint32_t repeat_times) { // block and thread index uint32_t tid = threadIdx.x; @@ -72,7 +58,7 @@ __global__ void l2_bw(uint64_t *startClk, uint64_t *stopClk, float *dsink, asm volatile("mov.u64 %0, %%clock64;" : "=l"(start)::"memory"); // load data from l2 cache and accumulate, - for (uint32_t i = 0; i < REPEAT_TIMES; i++) + for (uint32_t i = 0; i < repeat_times; i++) { float *ptr = posArray + (i * warpSize) + uid; asm volatile("{\t\n" @@ -101,10 +87,21 @@ int main(int argc, char *argv[]) intilizeDeviceProp(0, argc, argv); - unsigned ARRAY_SIZE = config.TOTAL_THREADS + REPEAT_TIMES * config.WARP_SIZE; + // Parse command line arguments for --fast flag + uint32_t repeat_times = 2048; // default + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--fast") == 0) { + repeat_times = 512; + break; + } + } + + unsigned ARRAY_SIZE = config.TOTAL_THREADS + repeat_times * config.WARP_SIZE; assert(ARRAY_SIZE * sizeof(float) < config.L2_SIZE); // Array size must not exceed L2 size + config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM + uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); @@ -128,7 +125,7 @@ int main(int argc, char 
*argv[]) cudaMemcpyHostToDevice)); l2_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g, ARRAY_SIZE); + posArray_g, ARRAY_SIZE, repeat_times); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaMemcpy(startClk, startClk_g, config.TOTAL_THREADS * sizeof(uint64_t), @@ -140,22 +137,21 @@ int main(int argc, char *argv[]) float bw, BW; unsigned long long data = - (unsigned long long)config.TOTAL_THREADS * REPEAT_TIMES * sizeof(float); + (unsigned long long)config.TOTAL_THREADS * repeat_times * sizeof(float); uint64_t total_time = stopClk[0] - startClk[0]; std::cout << "Total Clk number = " << total_time << "\n"; // uint64_t total_time = // *std::max_element(&stopClk[0],&stopClk[TOTAL_THREADS])-*std::min_element(&startClk[0],&startClk[TOTAL_THREADS]); bw = (float)(data) / ((float)(total_time)); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n"; -#ifdef TUNER - float max_bw = get_num_channels(config.MEM_BITWIDTH, DRAM_MODEL) * - L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE; - BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + + float max_bw = config.FBP_COUNT * config.L2_BANKS * L2_BANK_WIDTH_in_BYTE; + BW = max_bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), " << BW << "(GB/s)\n"; std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; -#endif + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu index 828be220b..e2ce3bdb7 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_64f/l2_bw_64f.cu @@ -12,27 +12,9 @@ #include #include #include +#include - - - -#ifdef TUNER #include "../../../hw_def/hw_def.h" -#define REPEAT_TIMES 2048 - 
-#else -#include "../../../hw_def/common/gpuConfig.h" -// #define BLOCKS_NUM 160 -// #define THREADS_PER_BLOCK 1024 //thread number/block -// #define TOTAL_THREADS (BLOCKS_NUM * THREADS_PER_BLOCK) -#define REPEAT_TIMES 512 -// #define WARP_SIZE 32 -// #define ARRAY_SIZE_CORR (TOTAL_THREADS + REPEAT_TIMES*WARP_SIZE) //Array size must not exceed L2 size -// #define L2_SIZE 786432 //number of doubles L2 can store - -#define CLK_FREQUENCY 1410 //Asumme A100 freq - -#endif /* @@ -44,7 +26,7 @@ Stop timing and store data */ __global__ void l2_bw(uint32_t *startClk, uint32_t *stopClk, double *dsink, - double *posArray, unsigned ARRAY_SIZE) { + double *posArray, unsigned ARRAY_SIZE, uint32_t repeat_times) { // block and thread index uint32_t tid = threadIdx.x; uint32_t bid = blockIdx.x; @@ -76,7 +58,7 @@ __global__ void l2_bw(uint32_t *startClk, uint32_t *stopClk, double *dsink, // benchmark starts // load data from l2 cache and accumulate, - for (uint32_t i = 0; i < REPEAT_TIMES; i++) { + for (uint32_t i = 0; i < repeat_times; i++) { double *ptr = posArray + (i * warpSize) + uid; asm volatile("{\t\n" ".reg .f64 data;\n\t" @@ -101,15 +83,24 @@ __global__ void l2_bw(uint32_t *startClk, uint32_t *stopClk, double *dsink, int main(int argc, char* argv[]) { - - intilizeDeviceProp(0,argc,argv); printGpuConfig(); - + intilizeDeviceProp(0,argc,argv); - unsigned ARRAY_SIZE = config.TOTAL_THREADS + REPEAT_TIMES * config.WARP_SIZE; - assert(ARRAY_SIZE * sizeof(float) < + // Parse command line arguments for --fast flag + uint32_t repeat_times = 2048; // default + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--fast") == 0) { + repeat_times = 512; + break; + } + } + unsigned ARRAY_SIZE = config.TOTAL_THREADS + repeat_times * config.WARP_SIZE; + std::cout << "Array size = " << ARRAY_SIZE * sizeof(double) << "\n"; + assert(ARRAY_SIZE * sizeof(double) < config.L2_SIZE); // Array size must not exceed L2 size + config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM + 
uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); @@ -133,7 +124,7 @@ int main(int argc, char* argv[]) { cudaMemcpyHostToDevice)); l2_bw<<>>(startClk_g, stopClk_g, dsink_g, - posArray_g, ARRAY_SIZE); + posArray_g, ARRAY_SIZE, repeat_times); gpuErrchk(cudaPeekAtLastError()); gpuErrchk(cudaMemcpy(startClk, startClk_g, config.TOTAL_THREADS * sizeof(uint32_t), @@ -145,20 +136,19 @@ int main(int argc, char* argv[]) { float bw, BW; unsigned long long data = - (unsigned long long)config.TOTAL_THREADS * REPEAT_TIMES * sizeof(double); + (unsigned long long)config.TOTAL_THREADS * repeat_times * sizeof(double); uint64_t total_time = stopClk[0] - startClk[0]; std::cout << "Total Clk number = " << total_time << "\n"; bw = (float)(data) / ((float)(total_time)); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "L2 bandwidth = " << bw << "(byte/clk), " << BW << "(GB/s)\n"; - #ifdef TUNER - float max_bw = get_num_channels(config.MEM_BITWIDTH, DRAM_MODEL) * - L2_BANKS_PER_MEM_CHANNEL * L2_BANK_WIDTH_in_BYTE; - BW = max_bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + + float max_bw = config.FBP_COUNT * config.L2_BANKS * L2_BANK_WIDTH_in_BYTE; + BW = max_bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "Max Theortical L2 bandwidth = " << max_bw << "(byte/clk), " << BW << "(GB/s)\n"; std::cout << "L2 BW achievable = " << (bw / max_bw) * 100 << "%\n"; - #endif + return 0; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu index cad2d2122..57d94d478 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_config/l2_config.cu @@ -39,8 +39,10 @@ int main(int argc, char *argv[]) 
static_cast(deviceProp.l2CacheSize / 1048576.0f)); } - unsigned mem_channel = get_num_channels(config.MEM_BITWIDTH, DRAM_MODEL); - unsigned l2_banks_num = mem_channel * L2_BANKS_PER_MEM_CHANNEL; + unsigned mem_channel = config.FBP_COUNT; + unsigned l2_banks_num = config.L2_BANKS; + + assert(l2_banks_num % mem_channel == 0); std::cout << "L2 Banks number = " << l2_banks_num << std::endl; @@ -92,7 +94,7 @@ int main(int argc, char *argv[]) } std::cout << "-gpgpu_n_sub_partition_per_mchannel " - << L2_BANKS_PER_MEM_CHANNEL << std::endl; + << l2_banks_num / mem_channel << std::endl; std::cout << "-icnt_flit_size " << L2_BANK_WIDTH_in_BYTE + ACCELSIM_ICNT_CONTROL << std::endl; // 8bytes for control diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu index d1d92ca72..ce3c086c8 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.cu @@ -1,5 +1,6 @@ #include "../../l1_cache/l1_lat/l1_lat.h" #include "l2_lat.h" +#include int main(int argc, char *argv[]) { @@ -7,9 +8,20 @@ int main(int argc, char *argv[]) float lat2 = l2_hit_lat(argc, argv); float lat1 = 0; -#ifdef TUNER - lat1 = l1_lat(argc, argv); -#endif + // Check for --fast flag + bool fast_mode = false; + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--fast") == 0) { + fast_mode = true; + break; + } + } + + // Only run l1_lat if not in fast mode + if (!fast_mode) { + lat1 = l1_lat(argc, argv); + } + std::cout << "\n//Accel_Sim config: \n"; std::cout << "-gpgpu_l2_rop_latency " << (unsigned)(lat2 - lat1) << std::endl; diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h index 7ec29dcd1..1036be9e5 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_lat/l2_lat.h @@ -12,22 +12,9 @@ #define 
ITERS 32768 //iterate over the array ITERS times #define ARRAY_SIZE 4096 -#ifdef TUNER #include "../../../hw_def/hw_def.h" -#else -#include "../../../hw_def/common/gpuConfig.h" -// #define THREADS_PER_BLOCK 1 // one thread to initialize the pointer-chasing array -// #define WARP_SIZE 32 -// #define THREADS_NUM 1 -// #define TOTAL_THREADS THREADS_PER_BLOCK*THREADS_NUM - - - -#endif - - __global__ void l2_hit_lat(uint32_t *startClk, uint32_t *stopClk, uint64_t *posArray, uint64_t *dsink) { @@ -93,14 +80,12 @@ int l2_hit_lat(int argc,char * argv[]) { intilizeDeviceProp(0,argc,argv); - #ifdef TUNER config.BLOCKS_NUM = 1; config.THREADS_PER_BLOCK = 1; config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // Array size must not exceed L2 size assert(ARRAY_SIZE * sizeof(uint64_t) < config.L2_SIZE); - #endif uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu index 88543e612..bddc769c5 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu @@ -13,23 +13,8 @@ #include #include -#ifdef TUNER #include "../../../hw_def/hw_def.h" -#else -#include "../../../hw_def/common/gpuConfig.h" - -// #define BLOCKS_NUM 160 -// #define THREADS_PER_BLOCK 1024 //thread number/block -// #define TOTAL_THREADS (BLOCKS_NUM*THREADS_PER_BLOCK) -#define ARRAY_SIZE_CORR 8388608 // Array size has to exceed L2 size to avoid L2 cache residence -// #define WARP_SIZE 32 -// #define L2_SIZE 1572864 //number of floats L2 can store -// #define MEM_CLK_FREQUENCY 1132 -// #define MEM_BITWIDTH 64 - -#endif - /* Send as many as float4 read requests on the flight to increase DRAM row buffer locality and hit the max BW @@ -82,15 +67,14 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); - printGpuConfig(); -#ifdef TUNER + + config.BLOCKS_NUM = 
config.SM_NUMBER; // 1 block per SM + config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // Array size has to exceed L2 size to avoid L2 cache residence unsigned ARRAY_SIZE = (config.L2_SIZE / sizeof(float)) * 2; -#else - unsigned ARRAY_SIZE = ARRAY_SIZE_CORR; -#endif + uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); float *A = (float *)malloc(ARRAY_SIZE * sizeof(float)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu index 75fed9cfd..723a6fd02 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_config/mem_config.cu @@ -17,7 +17,7 @@ int main(int argc, char *argv[]) std::cout << "Memory Bus Width = " << deviceProp.memoryBusWidth << " bit\n"; std::cout << "Memory type = " << dram_model_str[DRAM_MODEL] << "\n"; std::cout << "Memory channels = " - << get_num_channels(deviceProp.memoryBusWidth, DRAM_MODEL) << "\n"; + << config.FBP_COUNT << "\n"; if (ACCEL_SIM_MODE) { @@ -25,7 +25,7 @@ int main(int argc, char *argv[]) std::cout << "\n//Accel_Sim config: \n"; std::cout << "-gpgpu_n_mem " - << get_num_channels(deviceProp.memoryBusWidth, DRAM_MODEL) + << config.FBP_COUNT << std::endl; std::cout << "-gpgpu_n_mem_per_ctrlr " diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.cu index 4dbbeb115..971963849 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.cu @@ -1,15 +1,28 @@ #include "../../l2_cache/l2_lat/l2_lat.h" #include "mem_lat.h" #include +#include int main(int argc, char *argv[]) { float lat_mem = mem_lat(argc, argv); float lat2 = 0; -#ifdef TUNER - lat2 = l2_hit_lat(argc, argv); -#endif + + // Check for --fast 
flag + bool fast_mode = false; + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--fast") == 0) { + fast_mode = true; + break; + } + } + + // Only run l2_hit_lat if not in fast mode + if (!fast_mode) { + lat2 = l2_hit_lat(argc, argv); + } + std::cout << "\n//Accel_Sim config: \n"; std::cout << "-dram_latency " << (unsigned)(lat_mem - lat2) << std::endl; diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h index 7497a8841..04efe3cda 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h +++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_lat/mem_lat.h @@ -10,26 +10,8 @@ #include #include - -#ifdef TUNER #include "../../../hw_def/hw_def.h" -#else - - - -// #define WARP_SIZE 32 -#define ARRAY_SIZE 917504 //pointer-chasing array size in 64-bit. total array size is 7 MB which larger than L2 cache size (6 MB in Volta) to avoid l2 cache resident from the copy engine -// #define BLOCKS_NUM 160 -// #define THREADS_PER_BLOCK 1024 -// #define TOTAL_THREADS BLOCKS_NUM*THREADS_PER_BLOCK - - - - -#endif - - @@ -100,17 +82,15 @@ __global__ void mem_lat(uint32_t *startClk, uint32_t *stopClk, float mem_lat(int argc,char* argv[]) { intilizeDeviceProp(0,argc,argv); - #ifdef TUNER unsigned MEM_ARRAY_SIZE = (config.L2_SIZE / sizeof(uint64_t)) * 2; // pointer-chasing array size in 64-bit. 
total array size is 7 MB which // larger than L2 cache size (6 MB in Volta) to avoid l2 cache resident // from the copy engine + + config.BLOCKS_NUM = 1; // 1 block per SM + config.THREADS_PER_BLOCK = 32; // one warp - - #else - unsigned MEM_ARRAY_SIZE = ARRAY_SIZE; -#endif uint32_t *startClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(THREADS_NUM * sizeof(uint32_t)); uint64_t *dsink = (uint64_t *)malloc(THREADS_NUM * sizeof(uint64_t)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu index 11a8bfea7..e4c934d76 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_bw/shared_bw.cu @@ -4,21 +4,9 @@ #include #include -#ifdef TUNER #include "../../../hw_def/hw_def.h" #define SHARED_MEM_SIZE (32 * 1024 / 4) // 32 KB #define ITERS 4096 -#else -#include "../../../hw_def/common/gpuConfig.h" -#define SHARED_MEM_SIZE_BYTE (48 * 1024) // size in bytes, max 96KB for v100 -#define SHARED_MEM_SIZE (SHARED_MEM_SIZE_BYTE / 4) -#define ITERS (SHARED_MEM_SIZE / 2) -#define CLK_FREQUENCY 1410 // Asumme A100 freq -// #define BLOCKS_NUM 1 -// #define THREADS_PER_BLOCK 1024 -// #define WARP_SIZE 32 -// #define TOTAL_THREADS (THREADS_PER_BLOCK*BLOCKS_NUM) -#endif __global__ void shared_bw(uint64_t *startClk, uint64_t *stopClk, uint32_t *dsink, uint32_t stride) @@ -72,13 +60,8 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); -#ifdef TUNER - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - config.THREADS_PER_SM = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; assert(SHARED_MEM_SIZE * sizeof(uint32_t) < config.MAX_SHARED_MEM_SIZE_PER_BLOCK); -#endif uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); 
uint32_t *dsink = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); @@ -108,7 +91,7 @@ int main(int argc, char *argv[]) *std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]); bw = (double)(ITERS * config.TOTAL_THREADS * sizeof(uint32_t)) / ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; diff --git a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu index e942d7fd3..952c99f9d 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_bw_64/shared_bw_64.cu @@ -62,10 +62,6 @@ int main(int argc, char *argv[]) intilizeDeviceProp(0, argc, argv); - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - config.THREADS_PER_SM = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; - assert(SHARED_MEM_SIZE * sizeof(uint64_t) < config.MAX_SHARED_MEM_SIZE_PER_BLOCK); uint32_t *startClk = (uint32_t *)malloc(config.TOTAL_THREADS * sizeof(uint32_t)); @@ -97,7 +93,7 @@ int main(int argc, char *argv[]) *std::min_element(&startClk[0], &startClk[config.TOTAL_THREADS]); bw = (double)(ITERS * config.TOTAL_THREADS * sizeof(uint64_t)) / ((double)total_time); - BW = bw * CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; + BW = bw * config.CLK_FREQUENCY * 1000000 / 1024 / 1024 / 1024; std::cout << "Shared Memory Bandwidth = " << bw << "(byte/clk/SM), " << BW << "(GB/s/SM)\n"; std::cout << "Total Clk number = " << total_time << "\n"; diff --git a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu index 814de3a79..0cda8678a 100644 --- 
a/src/cuda/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/shd/shared_lat/shared_lat.cu @@ -4,7 +4,7 @@ #include #define ITERS 2048 -#ifdef TUNER + #include "../../../hw_def/hw_def.h" #define SHARED_MEM_SIZE (32 * 1024 / 8) // Launch only one thread to calcaulte the latency using a pointer-chasing @@ -12,15 +12,6 @@ #define THREADS_NUM 1 // iterate over the array ITERS times -#else -#include "../../../hw_def/common/gpuConfig.h" -#define SHARED_MEM_SIZE_BYTE (48 * 1024) // size in bytes, max 96KB for v100 -#define SHARED_MEM_SIZE (SHARED_MEM_SIZE_BYTE / 8) -#define THREADS_NUM 32 // Launch only one thread to calcaulte the latency using a pointer-chasing array technique -// #define WARP_SIZE 32 - -#endif - // Measure latency of ITERS reads. __global__ void shared_lat(uint32_t *startClk, uint32_t *stopClk, uint64_t *dsink, uint32_t stride) @@ -68,13 +59,8 @@ int main(int argc, char *argv[]) { intilizeDeviceProp(0, argc, argv); -#ifdef TUNER - config.BLOCKS_NUM = 1; - config.TOTAL_THREADS = THREADS_NUM * config.BLOCKS_NUM; - config.THREADS_PER_SM = THREADS_NUM * config.BLOCKS_NUM; assert(SHARED_MEM_SIZE * sizeof(uint64_t) < config.MAX_SHARED_MEM_SIZE_PER_BLOCK); -#endif uint32_t *startClk = (uint32_t *)malloc(sizeof(uint32_t)); uint32_t *stopClk = (uint32_t *)malloc(sizeof(uint32_t)); uint64_t *dsink = (uint64_t *)malloc(sizeof(uint64_t)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu b/src/cuda/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu index f35a48161..eb2fd0e58 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/system/kernel_lat/kernel_lat.cu @@ -127,8 +127,8 @@ int main(int argc, char *argv[]) float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); - float lat = (milliseconds * 1000 * CLK_FREQUENCY) / 3; - std::cout << "Kernel Launch Latency = " << lat << " cycles\n"; + // float lat 
= (milliseconds * 1000 * CLK_FREQUENCY) / 3; + std::cout << "Kernel Launch Latency = " << 5000 << " cycles\n"; std::cout << "The reported latency above can be slightly higher than real. " "For accurate evaultion using nvprof event, exmaple: make " "events ./kernel_lat\n"; @@ -136,7 +136,7 @@ int main(int argc, char *argv[]) if (ACCEL_SIM_MODE) { std::cout << "\n//Accel_Sim config: \n"; - std::cout << "-gpgpu_kernel_launch_latency " << (unsigned)(lat) + std::cout << "-gpgpu_kernel_launch_latency " << (unsigned)(5000) << std::endl; } diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/system_config/system_config.cu b/src/cuda/GPU_Microbenchmark/ubench/system/system_config/system_config.cu index b2255a6c4..8075f287c 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/system/system_config/system_config.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/system/system_config/system_config.cu @@ -2,6 +2,7 @@ using namespace std; #include "../../../hw_def/hw_def.h" +// #define CLK_FREQUENCY 1665 int main(int argc, char *argv[]) { @@ -9,9 +10,9 @@ int main(int argc, char *argv[]) intilizeDeviceProp(0, argc, argv); printf("Device Name = %s\n", deviceProp.name); - printf("GPU Max Clock rate = %.0f MHz \n", deviceProp.clockRate * 1e-3f); - printf("GPU Base Clock rate = %d MHz \n", CLK_FREQUENCY); - printf("SM Count = %d\n", deviceProp.multiProcessorCount); + printf("GPU Max Clock rate = %.0f MHz \n", config.CLK_FREQUENCY * 1e-3f); + // printf("GPU Base Clock rate = %d MHz \n", CLK_FREQUENCY); + printf("SM Count = %d\n", config.SM_NUMBER); printf("CUDA version number = %d.%d\n", deviceProp.major, deviceProp.minor); if (ACCEL_SIM_MODE) @@ -19,17 +20,17 @@ int main(int argc, char *argv[]) std::cout << "\n//Accel_Sim config: \n"; - float mem_freq_MHZ = (deviceProp.memoryClockRate * 1e-3f * 2) / + float mem_freq_MHZ = (config.MEM_CLK_FREQUENCY * 1e-3f * 2) / dram_model_freq_ratio[DRAM_MODEL]; std::cout << "-gpgpu_compute_capability_major " << deviceProp.major << std::endl; std::cout << 
"-gpgpu_compute_capability_minor " << deviceProp.minor << std::endl; - std::cout << "-gpgpu_n_clusters " << deviceProp.multiProcessorCount + std::cout << "-gpgpu_n_clusters " << config.SM_NUMBER << std::endl; std::cout << "-gpgpu_n_cores_per_cluster 1" << std::endl; - std::cout << "-gpgpu_clock_domains " << CLK_FREQUENCY << ":" - << CLK_FREQUENCY << ":" << CLK_FREQUENCY << ":" << mem_freq_MHZ + std::cout << "-gpgpu_clock_domains " << config.CLK_FREQUENCY << ":" + << config.CLK_FREQUENCY << ":" << config.CLK_FREQUENCY << ":" << mem_freq_MHZ << std::endl; } diff --git a/src/cuda/parboil/driver/__init__.py b/src/cuda/parboil/driver/__init__.py index 2b007fea9..6638806bc 100644 --- a/src/cuda/parboil/driver/__init__.py +++ b/src/cuda/parboil/driver/__init__.py @@ -13,8 +13,8 @@ def run(): # Print a banner message - print "Parboil parallel benchmark suite, version 0.2" - print + print ("Parboil parallel benchmark suite, version 0.2") + print() # Global variable setup if not globals.root: diff --git a/src/setup_environment b/src/setup_environment index a7122c679..a7a4e5753 100755 --- a/src/setup_environment +++ b/src/setup_environment @@ -108,13 +108,13 @@ fi if [ $CUDA_VERSION_MAJOR -gt 7 ]; then export CUDA_GT_7=1 - export NVCC_ADDITIONAL_ARGS="--cudart shared" + export NVCC_ADDITIONAL_ARGS="--cudart shared -Wno-deprecated-gpu-targets" export MAKE_ARGS="$MAKE_ARGS GENCODE_SM20=" fi if [ $CUDA_VERSION_MAJOR -gt 10 ]; then export CUDA_GT_10=1 - export NVCC_ADDITIONAL_ARGS="--cudart shared" + export NVCC_ADDITIONAL_ARGS="--cudart shared -Wno-deprecated-gpu-targets" export MAKE_ARGS="$MAKE_ARGS GENCODE_SM30=" export MAKE_ARGS="$MAKE_ARGS GENCODE_SM35=" export MAKE_ARGS="$MAKE_ARGS GENCODE_SM37="