diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 30bc747d7e..62600a9904 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,6 +15,7 @@ on:
 # A workflow run is made up of one or more jobs that can run sequentially or in parallel
 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  PR_HEAD_REPO_FULL_NAME: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
 
 jobs:
   check-format:
@@ -64,7 +65,7 @@ jobs:
           # Try to checkout the same branch from the same owner's fork first
           if [[ ${{ github.event_name }} == 'pull_request' ]]; then
-            current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
+            current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
           else
             current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
           fi
@@ -116,7 +117,7 @@ jobs:
           source ./env-setup/12.8_env_setup.sh
           rm -rf ./statistics-archive
           git clone --quiet git@github.com:accel-sim/statistics-archive.git
-          BRANCH_NAME=${{ github.repository }}/$BRANCH_NAME
+          BRANCH_NAME=$PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME
           # either create a new branch or check it out if it already exists
           git -C ./statistics-archive checkout $BRANCH_NAME 2>/dev/null || git -C ./statistics-archive checkout -b $BRANCH_NAME
           ./util/job_launching/get_stats.py -k -K -R -B GPU_Microbenchmark -C QV100-SASS -A | tee v100-ubench-sass-local.csv
@@ -142,12 +143,10 @@ jobs:
             | tee ampere-ubench-sass-latest2.csv && mv ampere-ubench-sass-local.csv ./statistics-archive/ubench/ampere-ubench-sass-latest.csv
           ./util/plotting/merge-stats.py -R -c ./statistics-archive/ubench/ampere-a100-ubench-sass-latest.csv,ampere-a100-ubench-sass-local.csv \
             | tee ampere-a100-ubench-sass-latest2.csv && mv ampere-a100-ubench-sass-local.csv ./statistics-archive/ubench/ampere-a100-ubench-sass-latest.csv
-          if [[ $GITHUB_EVENT_NAME == 'push' ]]; then
-            git -C ./statistics-archive add --all
-            git -C ./statistics-archive commit \
-              -m "CI automated checkin $BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
-            git -C ./statistics-archive push -u origin $BRANCH_NAME
-          fi
+          git -C ./statistics-archive add --all
+          git -C ./statistics-archive commit \
+            -m "CI automated checkin $BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
+          git -C ./statistics-archive push -u origin $BRANCH_NAME
       - name: Correlate Ubench
         run: |
           source ./env-setup/12.8_env_setup.sh
@@ -165,12 +164,12 @@ jobs:
           ssh ghci@tgrogers-pc01 mkdir -p /home/ghci/accel-sim/correl/git_$BRANCH_NAME"_"${{ github.run_number }}"_"${{ github.run_attempt}}/
           rsync --delete -r ./util/plotting/correl-html/ ghci@tgrogers-pc01:/home/ghci/accel-sim/correl/git_$BRANCH_NAME"_"${{ github.run_number }}"_"${{ github.run_attempt}}/
+          git -C ./statistics-archive reset --soft HEAD~1
+          git -C ./statistics-archive add --all
+          git -C ./statistics-archive commit \
+            -m "CI automated checkin $PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
+          git -C ./statistics-archive push -f -u origin $PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME
           if [[ ${{ github.event_name }} == 'push' ]]; then
-            git -C ./statistics-archive reset --soft HEAD~1
-            git -C ./statistics-archive add --all
-            git -C ./statistics-archive commit \
-              -m "CI automated checkin ${{ github.repository }}/$BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
-            git -C ./statistics-archive push -f -u origin ${{ github.repository }}/$BRANCH_NAME
             rm -rf /scratch/tgrogers-disk01/a/tgrghci/ci/lastSuccess/${{ github.repository }}/$BRANCH_NAME
             mkdir -p /scratch/tgrogers-disk01/a/tgrghci/ci/lastSuccess/${{ github.repository }}/$BRANCH_NAME
@@ -202,7 +201,7 @@ jobs:
           # Try to checkout the same branch from the same owner's fork first
           if [[ ${{ github.event_name }} == 'pull_request' ]]; then
-            current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
+            current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
           else
             current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
           fi
@@ -274,7 +273,7 @@ jobs:
           # Try to checkout the same branch from the same owner's fork first
           if [[ ${{ github.event_name }} == 'pull_request' ]]; then
-            current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
+            current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
           else
             current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
           fi
@@ -380,7 +379,7 @@ jobs:
           # stats are only archived on pushes. So the repo in the report url needs to be the head repo of the PR
           # The stats htmls are generated by CI triggered by pushes to the head repo.
           # Not so clean. But works for now. The htmls are not that important anyway.
-          export REPORT_URL="https://rawcdn.githack.com/accel-sim/statistics-archive/${{ github.event.pull_request.head.repo.full_name }}/$BRANCH_NAME/ubench/"
+          export REPORT_URL="https://rawcdn.githack.com/accel-sim/statistics-archive/$PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME/ubench/"
           python3 .github/scripts/send_ci_email.py -t success
         fi
diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml
index d962d8c3fc..363c8c39bf 100644
--- a/.github/workflows/weekly.yml
+++ b/.github/workflows/weekly.yml
@@ -53,6 +53,7 @@ jobs:
           source ./gpu-app-collection/src/setup_environment
           rm -rf ./hw_run/
           ./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
+          ./util/tracer_nvbit/run_hw.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
           rm -rf /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
           mkdir -p /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
           mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run
diff --git a/util/hw_stats/run_hw.py b/util/hw_stats/run_hw.py
index d90e3f5421..fc4717cf32 100755
--- a/util/hw_stats/run_hw.py
+++ b/util/hw_stats/run_hw.py
@@ -234,7 +234,7 @@
             "l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_miss.sum,idc__requests.sum,idc__requests_lookup_hit.sum,"
"sm__sass_inst_executed_op_shared_ld.sum,sm__sass_inst_executed_op_shared_st.sum,lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum,lts__t_sectors_srcunit_tex_op_write_lookup_miss.sum,sm__pipe_alu_cycles_active.sum,sm__pipe_fma_cycles_active.sum,sm__pipe_fp64_cycles_active.sum,sm__pipe_shared_cycles_active.sum,sm__pipe_tensor_cycles_active.sum,sm__pipe_tensor_op_hmma_cycles_active.sum,sm__cycles_active.sum,sm__cycles_active.avg,sm__cycles_elapsed.avg,sm__sass_thread_inst_executed_op_integer_pred_on.sum,sm__sass_thread_inst_executed_ops_dadd_dmul_dfma_pred_on.sum,sm__sass_thread_inst_executed_ops_fadd_fmul_ffma_pred_on.sum,sm__sass_thread_inst_executed_ops_hadd_hmul_hfma_pred_on.sum,sm__inst_executed_pipe_alu.sum,sm__inst_executed_pipe_fma.sum,sm__inst_executed_pipe_fp16.sum,sm__inst_executed_pipe_fp64.sum,sm__inst_executed_pipe_tensor.sum,sm__inst_executed_pipe_tex.sum,sm__inst_executed_pipe_xu.sum,sm__inst_executed_pipe_lsu.sum," "sm__sass_thread_inst_executed_op_fp16_pred_on.sum,sm__sass_thread_inst_executed_op_fp32_pred_on.sum,sm__sass_thread_inst_executed_op_fp64_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_inst_executed_op_memory_128b.sum,sm__sass_inst_executed_op_memory_64b.sum,sm__sass_inst_executed_op_memory_32b.sum,sm__sass_inst_executed_op_memory_16b.sum,sm__sass_inst_executed_op_memory_8b.sum,smsp__thread_inst_executed_per_inst_executed.ratio,sm__sass_thread_inst_executed.sum" - " --csv --page raw --target-processes all " + " --csv --page raw --target-processes all -f " + kernel_number + " -o " + os.path.join(this_run_dir, "ncu_stats") @@ -306,7 +306,7 @@ ) elif options.nsight_profiler: profile_command = ( - "ncu --target-processes all --metrics gpc__cycles_elapsed.avg --csv " + "ncu --target-processes all --metrics gpc__cycles_elapsed.avg --csv -f " + kernel_number + " -o " + os.path.join(this_run_dir, "ncu_cycles.{0}".format(i)) diff --git a/util/job_launching/apps/define-all-apps.yml b/util/job_launching/apps/define-all-apps.yml index 3d0a6e4168..411f18a59f 100644 --- a/util/job_launching/apps/define-all-apps.yml +++ b/util/job_launching/apps/define-all-apps.yml @@ -54,40 +54,40 @@ GPU_Microbenchmark: data_dirs: "$GPUAPPS_ROOT/data_dirs/" execs: - l1_bw_32f: - - args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32 + - args: --tpb 1024 --tpsm 1024 --blocks 1 accel-sim-mem: 1G - l1_bw_64f: - - args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32 + - args: --tpb 1024 --tpsm 1024 --blocks 1 accel-sim-mem: 1G - l1_bw_128: - - args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32 + - args: --tpb 1024 --tpsm 1024 --blocks 1 accel-sim-mem: 2G - l1_lat: - - args: --blocks 1 --ws 32 + - args: --blocks 1 --fast accel-sim-mem: 1G - l2_bw_32f: - - args: --tpb 1024 --tpsm 1024 --blocks 160 --ws 32 + - args: --tpb 1024 --tpsm 1024 --fast accel-sim-mem: 6G - l2_bw_64f: - - args: --tpb 1024 --tpsm 1024 --blocks 160 --l2 786432 --ws 32 + - args: --tpb 1024 --tpsm 1024 --fast accel-sim-mem: 6G # - l2_bw_128: # - args: # accel-sim-mem: 1G - l2_lat: - - args: --tpb 1 --tpsm 1 --blocks 1 --l2 786432 --ws 32 + - args: --fast accel-sim-mem: 1G - mem_bw: - - args: --tpb 1024 --tpsm 1024 --blocks 80 --l2 1572864 --ws 32 --memclk 1132 --membw 64 + - args: --tpb 1024 --tpsm 1024 accel-sim-mem: 2G - mem_lat: - - args: --tpb 1024 --tpsm 1024 --blocks 80 --l2 1572864 --ws 32 --memclk 1132 --membw 64 + - args: --fast accel-sim-mem: 1G - shared_bw: - - args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32 + - args: --tpb 1024 --tpsm 1024 --blocks 1 
         accel-sim-mem: 2G
     - shared_lat:
-      - args: --blocks 1 --ws 32
+      - args: --blocks 1
         accel-sim-mem: 1G
     - shared_bank_conflicts:
       ## argument 1 kernel has conflicts
@@ -97,16 +97,16 @@ GPU_Microbenchmark:
       - args: 2
         accel-sim-mem: 1G
     - MaxIops_int32:
-      - args: --tpb 1024 --blocks 1 --ws 32
+      - args: --tpb 1024 --blocks 1
         accel-sim-mem: 1G
     - l1_shared_bw:
-      - args: --tpb 1024 --blocks 1 --ws 32
+      - args: --tpb 1024 --blocks 1
         accel-sim-mem: 1G
     - l1_bw_32f_unroll:
-      - args: --tpb 1024 --blocks 1 --ws 32
+      - args: --tpb 1024 --blocks 1
         accel-sim-mem: 1G
     - l1_bw_32f_unroll_large:
-      - args:
+      - args: --tpb 1024 --blocks 1
         accel-sim-mem: 1G
 
 GPU_Atomic:
@@ -114,10 +114,10 @@ GPU_Atomic:
   data_dirs: "$GPUAPPS_ROOT/data_dirs/"
   execs:
     - atomic_add_bw:
-      - args: --tpb 1 --tpsm 1 --blocks 1 --ws 32
+      - args: --tpb 1 --tpsm 1 --blocks 1 --fast
         accel-sim-mem: 1G
     - atomic_add_bw_conflict:
-      - args: --tpb 1024 --tpsm 2048 --blocks 160 --ws 32
+      - args: --tpb 1024 --tpsm 2048 --fast
         accel-sim-mem: 1G
     - atomic_add_bw_profile:
       - args: 16
@@ -1067,6 +1067,27 @@ huggingface:
       - args:
         accel-sim-mem: 10G
 
+tma:
+  exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/tma"
+  data_dirs: ""
+  execs:
+    - tma_tensor_test:
+      - args: -w 1024 -h 1024 -o UTMAPF
+      - args: -w 1024 -h 1024 -o UTMALDG
+      - args: -w 1024 -h 1024 -o UTMASTG
+      - args: -w 1024 -h 1024 -o UTMAREDG
+      - args: -w 1024 -h 1024 -o REGULAR_LOAD
+        accel-sim-mem: 10G
+    - tma_bulk_test:
+      - args: -n 1024 -o UBLKPF
+      - args: -n 1024 -o UBLKCP_S_G
+      - args: -n 1024 -o UBLKCP_G_S
+      - args: -n 1024 -o UBLKRED_G_S
+      - args: -n 262144 -o UBLKPF
+      - args: -n 262144 -o UBLKCP_S_G
+      - args: -n 262144 -o UBLKCP_G_S
+      - args: -n 262144 -o UBLKRED_G_S
+        accel-sim-mem: 10G
 vllm:
   exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/vllm"
diff --git a/util/tuner/run_all.sh b/util/tuner/run_all.sh
index 93dcf16e0a..57ef770689 100755
--- a/util/tuner/run_all.sh
+++ b/util/tuner/run_all.sh
@@ -1,17 +1,95 @@
 #!/bin/bash
 # THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"
+source ./gpu-app-collection-partial/src/setup_environment
 SCRIPT_DIR="./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/"
 echo "Running make in $SCRIPT_DIR"
-make -C "$SCRIPT_DIR" tuner -j || { echo "make failed"; exit 1; }
+make -C "$SCRIPT_DIR" -j || { echo "make failed"; exit 1; }
 cd ${SCRIPT_DIR}/bin/
-for f in ./*; do
-  if [[ "$f" == *_corr ]]; then
-    continue
-  fi
-
-  echo "running $f microbenchmark"
-  $f
-  echo "/////////////////////////////////"
-done
\ No newline at end of file
+
+# List of configuration benchmarks that output lines starting with "-"
+# These are used by tuner.py to generate Accel-Sim configuration
+
+# System config
+echo "running system_config"
+./system_config
+echo "/////////////////////////////////"
+
+# Core config
+echo "running core_config"
+./core_config
+echo "/////////////////////////////////"
+
+echo "running config_dpu"
+./config_dpu
+echo "/////////////////////////////////"
+
+echo "running config_fpu"
+./config_fpu
+echo "/////////////////////////////////"
+
+echo "running config_int"
+./config_int
+echo "/////////////////////////////////"
+
+echo "running config_sfu"
+./config_sfu
+echo "/////////////////////////////////"
+
+echo "running config_tensor"
+./config_tensor
+echo "/////////////////////////////////"
+
+echo "running config_udp"
+./config_udp
+echo "/////////////////////////////////"
+
+echo "running regfile_bw"
+./regfile_bw
+echo "/////////////////////////////////"
+
+# L1 cache config
+echo "running l1_config"
+./l1_config
+echo "/////////////////////////////////"
"/////////////////////////////////" + +echo "running l1_lat with args: --blocks 1" +./l1_lat --blocks 1 +echo "/////////////////////////////////" + +# L2 cache config +echo "running l2_config" +./l2_config +echo "/////////////////////////////////" + +echo "running l2_copy_engine" +./l2_copy_engine +echo "/////////////////////////////////" + +echo "running l2_lat" +./l2_lat +echo "/////////////////////////////////" + +# Memory config +echo "running mem_config" +./mem_config +echo "/////////////////////////////////" + +echo "running mem_lat" +./mem_lat +echo "/////////////////////////////////" + +# Shared memory config +echo "running shd_config" +./shd_config +echo "/////////////////////////////////" + +echo "running shared_lat with args: --blocks 1" +./shared_lat --blocks 1 +echo "/////////////////////////////////" + +# Kernel latency +echo "running kernel_lat" +./kernel_lat +echo "/////////////////////////////////" \ No newline at end of file