Skip to content

Commit 19e2fd2

Browse files
committed
chore(gpu): add compute-sanitizer run on H100
1 parent c5db64e commit 19e2fd2

File tree

1 file changed

+151
-0
lines changed

1 file changed

+151
-0
lines changed
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# Run the GPU memory sanitizer (compute-sanitizer) tests for tfhe-cuda-backend on a Hyperstack H100 instance
2+
name: gpu_memory_sanitizer
3+
4+
# Environment variables read by the jobs of this workflow.
env:
5+
CARGO_TERM_COLOR: always
6+
# Link to this workflow run; embedded in the Slack messages sent below.
ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
7+
RUSTFLAGS: "-C target-cpu=native"
8+
RUST_BACKTRACE: "full"
9+
# 8388608 = 8 * 1024 * 1024 (8 MiB, assuming the value is expressed in bytes)
RUST_MIN_STACK: "8388608"
10+
SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }}
11+
SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png
12+
SLACK_USERNAME: ${{ secrets.BOT_USERNAME }}
13+
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
14+
SLACKIFY_MARKDOWN: true
15+
# NOTE(review): not referenced anywhere in the visible part of this workflow.
IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }}
16+
# Empty by default; the slack-notify job overwrites it through GITHUB_ENV
# when the run was triggered by a pull request.
PULL_REQUEST_MD_LINK: ""
17+
# Falls back to the default GITHUB_TOKEN when REPO_CHECKOUT_TOKEN is not set.
CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }}
18+
# Secrets will be available only to zama-ai organization members.
# Evaluates to 'true' when JOB_SECRET is non-empty; steps compare against it
# to choose between the remote instance and the GitHub runner.
19+
SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }}
20+
# Runner name handed to the test job when secrets are unavailable
# (written to the job output by the "Start GitHub instance" step).
EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04"
21+
22+
# Workflow triggers.
on:
23+
# Runs when a label is added to a pull request; the setup-instance job then
# checks that the label is 'approved'.
24+
pull_request:
25+
types: [ labeled ]
26+
# Also allows running this workflow manually from the Actions tab.
workflow_dispatch:
27+
28+
# Restrict the workflow token to read-only access to repository contents.
permissions:
29+
contents: read
30+
31+
# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning
32+
33+
jobs:
34+
# Selects the runner for the test job: a remote Hyperstack single-H100 instance
# when secrets are available, otherwise the EXTERNAL_CONTRIBUTION_RUNNER value.
setup-instance:
35+
name: gpu_memory_sanitizer/setup-instance
36+
runs-on: ubuntu-latest
37+
# Always run for non-pull-request events (manual dispatch); for pull requests,
# run only when the label that was just added is 'approved'.
if: github.event_name != 'pull_request' ||
38+
(github.event.action == 'labeled' && github.event.label.name == 'approved')
39+
outputs:
40+
# Label of the remote instance if one was started, otherwise the runner
# name emitted by the GitHub-instance step.
runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }}
41+
steps:
42+
# Only possible when the secrets needed by the action are available.
- name: Start remote instance
43+
id: start-remote-instance
44+
if: env.SECRETS_AVAILABLE == 'true'
45+
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
46+
with:
47+
mode: start
48+
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
49+
slab-url: ${{ secrets.SLAB_BASE_URL }}
50+
job-secret: ${{ secrets.JOB_SECRET }}
51+
backend: hyperstack
52+
profile: single-h100
53+
54+
# Fallback for pull requests from forked repositories, where secrets are
# unavailable: no remote instance is started; the runner named by
# EXTERNAL_CONTRIBUTION_RUNNER is published as the job output instead.
55+
- name: Start GitHub instance
56+
id: start-github-instance
57+
if: env.SECRETS_AVAILABLE == 'false'
58+
run: |
59+
echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT"
60+
61+
# Runs the GPU memory sanitizer make target on the runner selected by
# setup-instance.
cuda-tests-linux:
62+
name: gpu_memory_sanitizer/cuda-tests-linux
63+
needs: [ setup-instance ]
64+
# On pull requests, run only if setup-instance was not skipped, i.e. the
# 'approved' label gate passed.
if: github.event_name != 'pull_request' ||
65+
(github.event_name == 'pull_request' && needs.setup-instance.result != 'skipped')
66+
# One run per workflow ref at a time; a newer run cancels the in-progress one
# unless the ref is main.
concurrency:
67+
group: ${{ github.workflow_ref }}
68+
cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
69+
runs-on: ${{ needs.setup-instance.outputs.runner-name }}
70+
# 240 minutes = 4 hours.
timeout-minutes: 240
71+
strategy:
72+
fail-fast: false
73+
# explicit include-based build matrix, of known valid options
74+
matrix:
75+
include:
76+
- os: ubuntu-22.04
77+
cuda: "12.8"
78+
gcc: 11
79+
steps:
80+
- name: Checkout tfhe-rs
81+
uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8
82+
with:
83+
persist-credentials: 'false'
84+
token: ${{ env.CHECKOUT_TOKEN }}
85+
86+
# Local composite action; github-instance is 'true' when the job runs on
# the fallback GitHub runner rather than the remote instance.
- name: Setup Hyperstack dependencies
87+
uses: ./.github/actions/gpu_setup
88+
with:
89+
cuda-version: ${{ matrix.cuda }}
90+
gcc-version: ${{ matrix.gcc }}
91+
github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }}
92+
93+
# Diagnostic only: prints where compute-sanitizer lives under /usr; the
# output is not consumed by any later step.
# NOTE(review): find exits successfully even when nothing matches, so this
# step does not verify that the tool is installed — confirm that is intended.
- name: Find tools
94+
run: |
95+
find /usr -executable -name "compute-sanitizer"
96+
97+
- name: Install latest stable
98+
uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases
99+
with:
100+
toolchain: stable
101+
102+
- name: Run memory sanitizer
103+
run: |
104+
make test_high_level_api_gpu_sanitizer
105+
106+
# Posts a Slack message when a needed job failed and the test job was not
# skipped.
slack-notify:
107+
name: gpu_memory_sanitizer/slack-notify
108+
needs: [ setup-instance, cuda-tests-linux ]
109+
runs-on: ubuntu-latest
110+
# NOTE(review): failure() is itself a status-check function, so always()
# looks redundant in this expression — confirm before simplifying.
if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}
111+
# A failure to notify must not fail the workflow.
continue-on-error: true
112+
steps:
113+
# Exports PULL_REQUEST_MD_LINK (a Markdown link to the pull request) so the
# next step can include it in the message.
- name: Set pull-request URL
114+
if: env.SECRETS_AVAILABLE == 'true' && github.event_name == 'pull_request'
115+
run: |
116+
echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}"
117+
env:
118+
PR_BASE_URL: ${{ vars.PR_BASE_URL }}
119+
PR_NUMBER: ${{ github.event.pull_request.number }}
120+
121+
# Skipped when secrets (including the Slack webhook) are unavailable.
- name: Send message
122+
if: env.SECRETS_AVAILABLE == 'true'
123+
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
124+
env:
125+
SLACK_COLOR: ${{ needs.cuda-tests-linux.result }}
126+
SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))"
127+
128+
# Stops the remote instance started by setup-instance once the test job has
# finished, whatever its result.
teardown-instance:
129+
name: gpu_memory_sanitizer/teardown-instance
130+
# always() keeps this job from being skipped when the test job fails or is
# cancelled; it still requires setup-instance to have succeeded.
if: ${{ always() && needs.setup-instance.result == 'success' }}
131+
needs: [ setup-instance, cuda-tests-linux ]
132+
runs-on: ubuntu-latest
133+
steps:
134+
# Nothing to stop when the fallback GitHub runner was used (secrets
# unavailable), so the step is skipped in that case.
- name: Stop remote instance
135+
id: stop-instance
136+
if: env.SECRETS_AVAILABLE == 'true'
137+
uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac
138+
with:
139+
mode: stop
140+
github-token: ${{ secrets.SLAB_ACTION_TOKEN }}
141+
slab-url: ${{ secrets.SLAB_BASE_URL }}
142+
job-secret: ${{ secrets.JOB_SECRET }}
143+
label: ${{ needs.setup-instance.outputs.runner-name }}
144+
145+
# Reports a failed teardown; continue-on-error keeps a Slack failure from
# masking the original error.
# NOTE(review): the message says "cuda-tests" although this is the
# gpu_memory_sanitizer workflow — possibly copied from another workflow;
# confirm the intended label.
- name: Slack Notification
146+
if: ${{ failure() }}
147+
continue-on-error: true
148+
uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661
149+
env:
150+
SLACK_COLOR: ${{ job.status }}
151+
SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})"

0 commit comments

Comments
 (0)