Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 15 additions & 16 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ on:
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
PR_HEAD_REPO_FULL_NAME: ${{ github.event.pull_request.head.repo.full_name || github.repository }}

jobs:
check-format:
Expand Down Expand Up @@ -64,7 +65,7 @@ jobs:

# Try to checkout the same branch from the same owner's fork first
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
else
current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
fi
Expand Down Expand Up @@ -116,7 +117,7 @@ jobs:
source ./env-setup/12.8_env_setup.sh
rm -rf ./statistics-archive
git clone --quiet [email protected]:accel-sim/statistics-archive.git
BRANCH_NAME=${{ github.repository }}/$BRANCH_NAME
BRANCH_NAME=$PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME
# either create a new branch or check it out if it already exists
git -C ./statistics-archive checkout $BRANCH_NAME 2>/dev/null || git -C ./statistics-archive checkout -b $BRANCH_NAME
./util/job_launching/get_stats.py -k -K -R -B GPU_Microbenchmark -C QV100-SASS -A | tee v100-ubench-sass-local.csv
Expand All @@ -142,12 +143,10 @@ jobs:
| tee ampere-ubench-sass-latest2.csv && mv ampere-ubench-sass-local.csv ./statistics-archive/ubench/ampere-ubench-sass-latest.csv
./util/plotting/merge-stats.py -R -c ./statistics-archive/ubench/ampere-a100-ubench-sass-latest.csv,ampere-a100-ubench-sass-local.csv \
| tee ampere-a100-ubench-sass-latest2.csv && mv ampere-a100-ubench-sass-local.csv ./statistics-archive/ubench/ampere-a100-ubench-sass-latest.csv
if [[ $GITHUB_EVENT_NAME == 'push' ]]; then
git -C ./statistics-archive add --all
git -C ./statistics-archive commit \
-m "CI automated checkin $BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
git -C ./statistics-archive push -u origin $BRANCH_NAME
fi
git -C ./statistics-archive add --all
git -C ./statistics-archive commit \
-m "CI automated checkin $BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
git -C ./statistics-archive push -u origin $BRANCH_NAME
- name: Correlate Ubench
run: |
source ./env-setup/12.8_env_setup.sh
Expand All @@ -165,12 +164,12 @@ jobs:

ssh ghci@tgrogers-pc01 mkdir -p /home/ghci/accel-sim/correl/git_$BRANCH_NAME"_"${{ github.run_number }}"_"${{ github.run_attempt}}/
rsync --delete -r ./util/plotting/correl-html/ ghci@tgrogers-pc01:/home/ghci/accel-sim/correl/git_$BRANCH_NAME"_"${{ github.run_number }}"_"${{ github.run_attempt}}/
git -C ./statistics-archive reset --soft HEAD~1
git -C ./statistics-archive add --all
git -C ./statistics-archive commit \
-m "CI automated checkin $PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
git -C ./statistics-archive push -f -u origin $PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME
if [[ ${{ github.event_name }} == 'push' ]]; then
git -C ./statistics-archive reset --soft HEAD~1
git -C ./statistics-archive add --all
git -C ./statistics-archive commit \
-m "CI automated checkin ${{ github.repository }}/$BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
git -C ./statistics-archive push -f -u origin ${{ github.repository }}/$BRANCH_NAME

rm -rf /scratch/tgrogers-disk01/a/tgrghci/ci/lastSuccess/${{ github.repository }}/$BRANCH_NAME
mkdir -p /scratch/tgrogers-disk01/a/tgrghci/ci/lastSuccess/${{ github.repository }}/$BRANCH_NAME
Expand Down Expand Up @@ -202,7 +201,7 @@ jobs:

# Try to checkout the same branch from the same owner's fork first
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
else
current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
fi
Expand Down Expand Up @@ -274,7 +273,7 @@ jobs:

# Try to checkout the same branch from the same owner's fork first
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
else
current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
fi
Expand Down Expand Up @@ -380,7 +379,7 @@ jobs:
# stats are only archived on pushes. So the repo in the report url needs to be the head repo of the PR
# The stats htmls are generated by CI triggered by pushes to the head repo.
# Not so clean. But works for now. The htmls are not that important anyway.
export REPORT_URL="https://rawcdn.githack.com/accel-sim/statistics-archive/${{ github.event.pull_request.head.repo.full_name }}/$BRANCH_NAME/ubench/"
export REPORT_URL="https://rawcdn.githack.com/accel-sim/statistics-archive/$PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME/ubench/"
python3 .github/scripts/send_ci_email.py -t success
fi

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/weekly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ jobs:
source ./gpu-app-collection/src/setup_environment
rm -rf ./hw_run/
./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
./util/tracer_nvbit/run_hw.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
rm -rf /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
mkdir -p /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run
Expand Down
4 changes: 2 additions & 2 deletions util/hw_stats/run_hw.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@
"l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_miss.sum,idc__requests.sum,idc__requests_lookup_hit.sum,"
"sm__sass_inst_executed_op_shared_ld.sum,sm__sass_inst_executed_op_shared_st.sum,lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum,lts__t_sectors_srcunit_tex_op_write_lookup_miss.sum,sm__pipe_alu_cycles_active.sum,sm__pipe_fma_cycles_active.sum,sm__pipe_fp64_cycles_active.sum,sm__pipe_shared_cycles_active.sum,sm__pipe_tensor_cycles_active.sum,sm__pipe_tensor_op_hmma_cycles_active.sum,sm__cycles_active.sum,sm__cycles_active.avg,sm__cycles_elapsed.avg,sm__sass_thread_inst_executed_op_integer_pred_on.sum,sm__sass_thread_inst_executed_ops_dadd_dmul_dfma_pred_on.sum,sm__sass_thread_inst_executed_ops_fadd_fmul_ffma_pred_on.sum,sm__sass_thread_inst_executed_ops_hadd_hmul_hfma_pred_on.sum,sm__inst_executed_pipe_alu.sum,sm__inst_executed_pipe_fma.sum,sm__inst_executed_pipe_fp16.sum,sm__inst_executed_pipe_fp64.sum,sm__inst_executed_pipe_tensor.sum,sm__inst_executed_pipe_tex.sum,sm__inst_executed_pipe_xu.sum,sm__inst_executed_pipe_lsu.sum,"
"sm__sass_thread_inst_executed_op_fp16_pred_on.sum,sm__sass_thread_inst_executed_op_fp32_pred_on.sum,sm__sass_thread_inst_executed_op_fp64_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_inst_executed_op_memory_128b.sum,sm__sass_inst_executed_op_memory_64b.sum,sm__sass_inst_executed_op_memory_32b.sum,sm__sass_inst_executed_op_memory_16b.sum,sm__sass_inst_executed_op_memory_8b.sum,smsp__thread_inst_executed_per_inst_executed.ratio,sm__sass_thread_inst_executed.sum"
" --csv --page raw --target-processes all "
" --csv --page raw --target-processes all -f "
+ kernel_number
+ " -o "
+ os.path.join(this_run_dir, "ncu_stats")
Expand Down Expand Up @@ -306,7 +306,7 @@
)
elif options.nsight_profiler:
profile_command = (
"ncu --target-processes all --metrics gpc__cycles_elapsed.avg --csv "
"ncu --target-processes all --metrics gpc__cycles_elapsed.avg --csv -f "
+ kernel_number
+ " -o "
+ os.path.join(this_run_dir, "ncu_cycles.{0}".format(i))
Expand Down
55 changes: 38 additions & 17 deletions util/job_launching/apps/define-all-apps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,40 +54,40 @@ GPU_Microbenchmark:
data_dirs: "$GPUAPPS_ROOT/data_dirs/"
execs:
- l1_bw_32f:
- args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32
- args: --tpb 1024 --tpsm 1024 --blocks 1
accel-sim-mem: 1G
- l1_bw_64f:
- args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32
- args: --tpb 1024 --tpsm 1024 --blocks 1
accel-sim-mem: 1G
- l1_bw_128:
- args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32
- args: --tpb 1024 --tpsm 1024 --blocks 1
accel-sim-mem: 2G
- l1_lat:
- args: --blocks 1 --ws 32
- args: --blocks 1 --fast
accel-sim-mem: 1G
- l2_bw_32f:
- args: --tpb 1024 --tpsm 1024 --blocks 160 --ws 32
- args: --tpb 1024 --tpsm 1024 --fast
accel-sim-mem: 6G
- l2_bw_64f:
- args: --tpb 1024 --tpsm 1024 --blocks 160 --l2 786432 --ws 32
- args: --tpb 1024 --tpsm 1024 --fast
accel-sim-mem: 6G
# - l2_bw_128:
# - args:
# accel-sim-mem: 1G
- l2_lat:
- args: --tpb 1 --tpsm 1 --blocks 1 --l2 786432 --ws 32
- args: --fast
accel-sim-mem: 1G
- mem_bw:
- args: --tpb 1024 --tpsm 1024 --blocks 80 --l2 1572864 --ws 32 --memclk 1132 --membw 64
- args: --tpb 1024 --tpsm 1024
accel-sim-mem: 2G
- mem_lat:
- args: --tpb 1024 --tpsm 1024 --blocks 80 --l2 1572864 --ws 32 --memclk 1132 --membw 64
- args: --fast
accel-sim-mem: 1G
- shared_bw:
- args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32
- args: --tpb 1024 --tpsm 1024 --blocks 1
accel-sim-mem: 2G
- shared_lat:
- args: --blocks 1 --ws 32
- args: --blocks 1
accel-sim-mem: 1G
- shared_bank_conflicts:
## argument 1 kernel has conflicts
Expand All @@ -97,27 +97,27 @@ GPU_Microbenchmark:
- args: 2
accel-sim-mem: 1G
- MaxIops_int32:
- args: --tpb 1024 --blocks 1 --ws 32
- args: --tpb 1024 --blocks 1
accel-sim-mem: 1G
- l1_shared_bw:
- args: --tpb 1024 --blocks 1 --ws 32
- args: --tpb 1024 --blocks 1
accel-sim-mem: 1G
- l1_bw_32f_unroll:
- args: --tpb 1024 --blocks 1 --ws 32
- args: --tpb 1024 --blocks 1
accel-sim-mem: 1G
- l1_bw_32f_unroll_large:
- args:
- args: --tpb 1024 --blocks 1
accel-sim-mem: 1G

GPU_Atomic:
exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/"
data_dirs: "$GPUAPPS_ROOT/data_dirs/"
execs:
- atomic_add_bw:
- args: --tpb 1 --tpsm 1 --blocks 1 --ws 32
- args: --tpb 1 --tpsm 1 --blocks 1 --fast
accel-sim-mem: 1G
- atomic_add_bw_conflict:
- args: --tpb 1024 --tpsm 2048 --blocks 160 --ws 32
- args: --tpb 1024 --tpsm 2048 --fast
accel-sim-mem: 1G
- atomic_add_bw_profile:
- args: 16
Expand Down Expand Up @@ -1067,6 +1067,27 @@ huggingface:
- args:
accel-sim-mem: 10G

tma:
exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/tma"
data_dirs: ""
execs:
- tma_tensor_test:
- args: -w 1024 -h 1024 -o UTMAPF
- args: -w 1024 -h 1024 -o UTMALDG
- args: -w 1024 -h 1024 -o UTMASTG
- args: -w 1024 -h 1024 -o UTMAREDG
- args: -w 1024 -h 1024 -o REGULAR_LOAD
accel-sim-mem: 10G
- tma_bulk_test:
- args: -n 1024 -o UBLKPF
- args: -n 1024 -o UBLKCP_S_G
- args: -n 1024 -o UBLKCP_G_S
- args: -n 1024 -o UBLKRED_G_S
- args: -n 262144 -o UBLKPF
- args: -n 262144 -o UBLKCP_S_G
- args: -n 262144 -o UBLKCP_G_S
- args: -n 262144 -o UBLKRED_G_S
accel-sim-mem: 10G

vllm:
exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/vllm"
Expand Down
98 changes: 88 additions & 10 deletions util/tuner/run_all.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,95 @@
#!/bin/bash
# Build the GPU_Microbenchmark suite and run every tuner microbenchmark,
# then run the specific configuration benchmarks whose "-"-prefixed output
# lines are consumed by tuner.py to generate an Accel-Sim configuration.

# THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"
source ./gpu-app-collection-partial/src/setup_environment
SCRIPT_DIR="./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/"
echo "Running make in $SCRIPT_DIR"
make -C "$SCRIPT_DIR" tuner -j || { echo "make failed"; exit 1; }
make -C "$SCRIPT_DIR" -j || { echo "make failed"; exit 1; }

# Abort if the bin directory is missing — otherwise the loop below would
# execute whatever happens to be in the current directory.
cd "${SCRIPT_DIR}/bin/" || { echo "cd to ${SCRIPT_DIR}/bin/ failed"; exit 1; }

# First pass: run every built microbenchmark except the *_corr correlation
# binaries (those are driven separately by the correlation flow).
for f in ./*; do
    if [[ "$f" == *_corr ]]; then
        continue
    fi

    echo "running $f microbenchmark"
    "$f"
    echo "/////////////////////////////////"
done

# run_bench NAME [ARGS...]
# Echo a banner, execute ./NAME with the given args, then print a separator.
# The banner intentionally matches the historical log format so downstream
# log scraping keeps working.
run_bench() {
    local name="$1"
    shift
    if (( $# > 0 )); then
        echo "running $name with args: $*"
    else
        echo "running $name"
    fi
    "./$name" "$@"
    echo "/////////////////////////////////"
}

# List of configuration benchmarks that output lines starting with "-"
# These are used by tuner.py to generate Accel-Sim configuration

# System config
run_bench system_config

# Core config
run_bench core_config
run_bench config_dpu
run_bench config_fpu
run_bench config_int
run_bench config_sfu
run_bench config_tensor
run_bench config_udp
run_bench regfile_bw

# L1 cache config
run_bench l1_config
run_bench l1_lat --blocks 1

# L2 cache config
run_bench l2_config
run_bench l2_copy_engine
run_bench l2_lat

# Memory config
run_bench mem_config
run_bench mem_lat

# Shared memory config
run_bench shd_config
run_bench shared_lat --blocks 1

# Kernel latency
run_bench kernel_lat
Loading