Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 15 additions & 16 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ on:
# A workflow run is made up of one or more jobs that can run sequentially or in parallel
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
PR_HEAD_REPO_FULL_NAME: ${{ github.event.pull_request.head.repo.full_name || github.repository }}

jobs:
check-format:
Expand Down Expand Up @@ -64,7 +65,7 @@ jobs:

# Try to checkout the same branch from the same owner's fork first
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
else
current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
fi
Expand Down Expand Up @@ -116,7 +117,7 @@ jobs:
source ./env-setup/12.8_env_setup.sh
rm -rf ./statistics-archive
git clone --quiet [email protected]:accel-sim/statistics-archive.git
BRANCH_NAME=${{ github.repository }}/$BRANCH_NAME
BRANCH_NAME=$PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME
# either create a new branch or check it out if it already exists
git -C ./statistics-archive checkout $BRANCH_NAME 2>/dev/null || git -C ./statistics-archive checkout -b $BRANCH_NAME
./util/job_launching/get_stats.py -k -K -R -B GPU_Microbenchmark -C QV100-SASS -A | tee v100-ubench-sass-local.csv
Expand All @@ -142,12 +143,10 @@ jobs:
| tee ampere-ubench-sass-latest2.csv && mv ampere-ubench-sass-local.csv ./statistics-archive/ubench/ampere-ubench-sass-latest.csv
./util/plotting/merge-stats.py -R -c ./statistics-archive/ubench/ampere-a100-ubench-sass-latest.csv,ampere-a100-ubench-sass-local.csv \
| tee ampere-a100-ubench-sass-latest2.csv && mv ampere-a100-ubench-sass-local.csv ./statistics-archive/ubench/ampere-a100-ubench-sass-latest.csv
if [[ $GITHUB_EVENT_NAME == 'push' ]]; then
git -C ./statistics-archive add --all
git -C ./statistics-archive commit \
-m "CI automated checkin $BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
git -C ./statistics-archive push -u origin $BRANCH_NAME
fi
git -C ./statistics-archive add --all
git -C ./statistics-archive commit \
-m "CI automated checkin $BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
git -C ./statistics-archive push -u origin $BRANCH_NAME
- name: Correlate Ubench
run: |
source ./env-setup/12.8_env_setup.sh
Expand All @@ -165,12 +164,12 @@ jobs:

ssh ghci@tgrogers-pc01 mkdir -p /home/ghci/accel-sim/correl/git_$BRANCH_NAME"_"${{ github.run_number }}"_"${{ github.run_attempt}}/
rsync --delete -r ./util/plotting/correl-html/ ghci@tgrogers-pc01:/home/ghci/accel-sim/correl/git_$BRANCH_NAME"_"${{ github.run_number }}"_"${{ github.run_attempt}}/
git -C ./statistics-archive reset --soft HEAD~1
git -C ./statistics-archive add --all
git -C ./statistics-archive commit \
-m "CI automated checkin $PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
git -C ./statistics-archive push -f -u origin $PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME
if [[ ${{ github.event_name }} == 'push' ]]; then
git -C ./statistics-archive reset --soft HEAD~1
git -C ./statistics-archive add --all
git -C ./statistics-archive commit \
-m "CI automated checkin ${{ github.repository }}/$BRANCH_NAME Build:${{ github.run_number }}"_"${{ github.run_attempt}}" || echo "No Changes."
git -C ./statistics-archive push -f -u origin ${{ github.repository }}/$BRANCH_NAME

rm -rf /scratch/tgrogers-disk01/a/tgrghci/ci/lastSuccess/${{ github.repository }}/$BRANCH_NAME
mkdir -p /scratch/tgrogers-disk01/a/tgrghci/ci/lastSuccess/${{ github.repository }}/$BRANCH_NAME
Expand Down Expand Up @@ -202,7 +201,7 @@ jobs:

# Try to checkout the same branch from the same owner's fork first
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
else
current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
fi
Expand Down Expand Up @@ -274,7 +273,7 @@ jobs:

# Try to checkout the same branch from the same owner's fork first
if [[ ${{ github.event_name }} == 'pull_request' ]]; then
current_owner=$(echo ${{ github.event.pull_request.head.repo.full_name }} | cut -d'/' -f1)
current_owner=$(echo $PR_HEAD_REPO_FULL_NAME | cut -d'/' -f1)
else
current_owner=$(echo ${{ github.repository }} | cut -d'/' -f1)
fi
Expand Down Expand Up @@ -380,7 +379,7 @@ jobs:
# stats are only archived on pushes. So the repo in the report url needs to be the head repo of the PR
# The stats htmls are generated by CI triggered by pushes to the head repo.
# Not so clean. But works for now. The htmls are not that important anyway.
export REPORT_URL="https://rawcdn.githack.com/accel-sim/statistics-archive/${{ github.event.pull_request.head.repo.full_name }}/$BRANCH_NAME/ubench/"
export REPORT_URL="https://rawcdn.githack.com/accel-sim/statistics-archive/$PR_HEAD_REPO_FULL_NAME/$BRANCH_NAME/ubench/"
python3 .github/scripts/send_ci_email.py -t success
fi

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/weekly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ jobs:
source ./gpu-app-collection/src/setup_environment
rm -rf ./hw_run/
./util/tracer_nvbit/run_hw_trace.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
./util/tracer_nvbit/run_hw.py -B rodinia_2.0-ft,rodinia-3.1,GPU_Microbenchmark -D 7
rm -rf /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
mkdir -p /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces
mv ./hw_run /scratch/tgrogers-disk01/a/common/for-sharing/$USER/nightly-traces/hw_run
Expand Down
4 changes: 2 additions & 2 deletions util/hw_stats/run_hw.py
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@
"l1tex__t_sectors_pipe_lsu_mem_global_op_st_lookup_miss.sum,idc__requests.sum,idc__requests_lookup_hit.sum,"
"sm__sass_inst_executed_op_shared_ld.sum,sm__sass_inst_executed_op_shared_st.sum,lts__t_sectors_srcunit_tex_op_read_lookup_miss.sum,lts__t_sectors_srcunit_tex_op_write_lookup_miss.sum,sm__pipe_alu_cycles_active.sum,sm__pipe_fma_cycles_active.sum,sm__pipe_fp64_cycles_active.sum,sm__pipe_shared_cycles_active.sum,sm__pipe_tensor_cycles_active.sum,sm__pipe_tensor_op_hmma_cycles_active.sum,sm__cycles_active.sum,sm__cycles_active.avg,sm__cycles_elapsed.avg,sm__sass_thread_inst_executed_op_integer_pred_on.sum,sm__sass_thread_inst_executed_ops_dadd_dmul_dfma_pred_on.sum,sm__sass_thread_inst_executed_ops_fadd_fmul_ffma_pred_on.sum,sm__sass_thread_inst_executed_ops_hadd_hmul_hfma_pred_on.sum,sm__inst_executed_pipe_alu.sum,sm__inst_executed_pipe_fma.sum,sm__inst_executed_pipe_fp16.sum,sm__inst_executed_pipe_fp64.sum,sm__inst_executed_pipe_tensor.sum,sm__inst_executed_pipe_tex.sum,sm__inst_executed_pipe_xu.sum,sm__inst_executed_pipe_lsu.sum,"
"sm__sass_thread_inst_executed_op_fp16_pred_on.sum,sm__sass_thread_inst_executed_op_fp32_pred_on.sum,sm__sass_thread_inst_executed_op_fp64_pred_on.sum,sm__sass_thread_inst_executed_op_dmul_pred_on.sum,sm__sass_thread_inst_executed_op_dfma_pred_on.sum,sm__sass_inst_executed_op_memory_128b.sum,sm__sass_inst_executed_op_memory_64b.sum,sm__sass_inst_executed_op_memory_32b.sum,sm__sass_inst_executed_op_memory_16b.sum,sm__sass_inst_executed_op_memory_8b.sum,smsp__thread_inst_executed_per_inst_executed.ratio,sm__sass_thread_inst_executed.sum"
" --csv --page raw --target-processes all "
" --csv --page raw --target-processes all -f "
+ kernel_number
+ " -o "
+ os.path.join(this_run_dir, "ncu_stats")
Expand Down Expand Up @@ -306,7 +306,7 @@
)
elif options.nsight_profiler:
profile_command = (
"ncu --target-processes all --metrics gpc__cycles_elapsed.avg --csv "
"ncu --target-processes all --metrics gpc__cycles_elapsed.avg --csv -f "
+ kernel_number
+ " -o "
+ os.path.join(this_run_dir, "ncu_cycles.{0}".format(i))
Expand Down
55 changes: 38 additions & 17 deletions util/job_launching/apps/define-all-apps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,40 +54,40 @@ GPU_Microbenchmark:
data_dirs: "$GPUAPPS_ROOT/data_dirs/"
execs:
- l1_bw_32f:
- args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32
- args: --tpb 1024 --tpsm 1024 --blocks 1
accel-sim-mem: 1G
- l1_bw_64f:
- args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32
- args: --tpb 1024 --tpsm 1024 --blocks 1
accel-sim-mem: 1G
- l1_bw_128:
- args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32
- args: --tpb 1024 --tpsm 1024 --blocks 1
accel-sim-mem: 2G
- l1_lat:
- args: --blocks 1 --ws 32
- args: --blocks 1 --fast
accel-sim-mem: 1G
- l2_bw_32f:
- args: --tpb 1024 --tpsm 1024 --blocks 160 --ws 32
- args: --tpb 1024 --tpsm 1024 --fast
accel-sim-mem: 6G
- l2_bw_64f:
- args: --tpb 1024 --tpsm 1024 --blocks 160 --l2 786432 --ws 32
- args: --tpb 1024 --tpsm 1024 --fast
accel-sim-mem: 6G
# - l2_bw_128:
# - args:
# accel-sim-mem: 1G
- l2_lat:
- args: --tpb 1 --tpsm 1 --blocks 1 --l2 786432 --ws 32
- args: --fast
accel-sim-mem: 1G
- mem_bw:
- args: --tpb 1024 --tpsm 1024 --blocks 80 --l2 1572864 --ws 32 --memclk 1132 --membw 64
- args: --tpb 1024 --tpsm 1024
accel-sim-mem: 2G
- mem_lat:
- args: --tpb 1024 --tpsm 1024 --blocks 80 --l2 1572864 --ws 32 --memclk 1132 --membw 64
- args: --fast
accel-sim-mem: 1G
- shared_bw:
- args: --tpb 1024 --tpsm 1024 --blocks 1 --ws 32
- args: --tpb 1024 --tpsm 1024 --blocks 1
accel-sim-mem: 2G
- shared_lat:
- args: --blocks 1 --ws 32
- args: --blocks 1
accel-sim-mem: 1G
- shared_bank_conflicts:
## argument 1 kernel has conflicts
Expand All @@ -97,27 +97,27 @@ GPU_Microbenchmark:
- args: 2
accel-sim-mem: 1G
- MaxIops_int32:
- args: --tpb 1024 --blocks 1 --ws 32
- args: --tpb 1024 --blocks 1
accel-sim-mem: 1G
- l1_shared_bw:
- args: --tpb 1024 --blocks 1 --ws 32
- args: --tpb 1024 --blocks 1
accel-sim-mem: 1G
- l1_bw_32f_unroll:
- args: --tpb 1024 --blocks 1 --ws 32
- args: --tpb 1024 --blocks 1
accel-sim-mem: 1G
- l1_bw_32f_unroll_large:
- args:
- args: --tpb 1024 --blocks 1
accel-sim-mem: 1G

GPU_Atomic:
exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/"
data_dirs: "$GPUAPPS_ROOT/data_dirs/"
execs:
- atomic_add_bw:
- args: --tpb 1 --tpsm 1 --blocks 1 --ws 32
- args: --tpb 1 --tpsm 1 --blocks 1 --fast
accel-sim-mem: 1G
- atomic_add_bw_conflict:
- args: --tpb 1024 --tpsm 2048 --blocks 160 --ws 32
- args: --tpb 1024 --tpsm 2048 --fast
accel-sim-mem: 1G
- atomic_add_bw_profile:
- args: 16
Expand Down Expand Up @@ -1067,6 +1067,27 @@ huggingface:
- args:
accel-sim-mem: 10G

tma:
exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/tma"
data_dirs: ""
execs:
- tma_tensor_test:
- args: -w 1024 -h 1024 -o UTMAPF
- args: -w 1024 -h 1024 -o UTMALDG
- args: -w 1024 -h 1024 -o UTMASTG
- args: -w 1024 -h 1024 -o UTMAREDG
- args: -w 1024 -h 1024 -o REGULAR_LOAD
accel-sim-mem: 10G
- tma_bulk_test:
- args: -n 1024 -o UBLKPF
- args: -n 1024 -o UBLKCP_S_G
- args: -n 1024 -o UBLKCP_G_S
- args: -n 1024 -o UBLKRED_G_S
- args: -n 262144 -o UBLKPF
- args: -n 262144 -o UBLKCP_S_G
- args: -n 262144 -o UBLKCP_G_S
- args: -n 262144 -o UBLKRED_G_S
accel-sim-mem: 10G

vllm:
exec_dir: "$GPUAPPS_ROOT/bin/$CUDA_VERSION/release/vllm"
Expand Down
98 changes: 88 additions & 10 deletions util/tuner/run_all.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,95 @@
#!/bin/bash
# Build the GPU_Microbenchmark suite and run every tuner microbenchmark,
# then run the specific configuration benchmarks whose "-"-prefixed output
# lines are consumed by tuner.py to generate an Accel-Sim configuration.

# THIS_DIR="$( cd "$( dirname "$BASH_SOURCE" )" && pwd )"
source ./gpu-app-collection-partial/src/setup_environment
SCRIPT_DIR="./gpu-app-collection-partial/src/cuda/GPU_Microbenchmark/"
echo "Running make in $SCRIPT_DIR"
make -C "$SCRIPT_DIR" tuner -j || { echo "make failed"; exit 1; }
make -C "$SCRIPT_DIR" -j || { echo "make failed"; exit 1; }

# Abort if the bin directory is missing — otherwise the loop below would
# execute whatever happens to be in the current directory.
cd "${SCRIPT_DIR}/bin/" || { echo "cd to ${SCRIPT_DIR}/bin/ failed"; exit 1; }

# First pass: run every built microbenchmark except the *_corr correlation
# binaries (those are driven separately by the correlation flow).
for f in ./*; do
    if [[ "$f" == *_corr ]]; then
        continue
    fi

    echo "running $f microbenchmark"
    "$f"
    echo "/////////////////////////////////"
done

# run_bench NAME [ARGS...]
# Echo a banner, execute ./NAME with the given args, then print a separator.
# The banner intentionally matches the historical log format so downstream
# log scraping keeps working.
run_bench() {
    local name="$1"
    shift
    if (( $# > 0 )); then
        echo "running $name with args: $*"
    else
        echo "running $name"
    fi
    "./$name" "$@"
    echo "/////////////////////////////////"
}

# List of configuration benchmarks that output lines starting with "-"
# These are used by tuner.py to generate Accel-Sim configuration

# System config
run_bench system_config

# Core config
run_bench core_config
run_bench config_dpu
run_bench config_fpu
run_bench config_int
run_bench config_sfu
run_bench config_tensor
run_bench config_udp
run_bench regfile_bw

# L1 cache config
run_bench l1_config
run_bench l1_lat --blocks 1

# L2 cache config
run_bench l2_config
run_bench l2_copy_engine
run_bench l2_lat

# Memory config
run_bench mem_config
run_bench mem_lat

# Shared memory config
run_bench shd_config
run_bench shared_lat --blocks 1

# Kernel latency
run_bench kernel_lat
Loading