|
| 1 | +# Run the tfhe-cuda-backend GPU memory sanitizer tests on a Hyperstack instance (or on a GitHub runner when secrets are unavailable) |
| 2 | +name: gpu_memory_sanitizer |
| 3 | + |
| 4 | +env: |
| 5 | + CARGO_TERM_COLOR: always |
| 6 | + ACTION_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} |
| 7 | + RUSTFLAGS: "-C target-cpu=native" |
| 8 | + RUST_BACKTRACE: "full" |
| 9 | + RUST_MIN_STACK: "8388608" |
| 10 | + SLACK_CHANNEL: ${{ secrets.SLACK_CHANNEL }} |
| 11 | + SLACK_ICON: https://pbs.twimg.com/profile_images/1274014582265298945/OjBKP9kn_400x400.png |
| 12 | + SLACK_USERNAME: ${{ secrets.BOT_USERNAME }} |
| 13 | + SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }} |
| 14 | + SLACKIFY_MARKDOWN: true |
| 15 | + IS_PULL_REQUEST: ${{ github.event_name == 'pull_request' }} |
| 16 | + PULL_REQUEST_MD_LINK: "" |
| 17 | + CHECKOUT_TOKEN: ${{ secrets.REPO_CHECKOUT_TOKEN || secrets.GITHUB_TOKEN }} |
| 18 | + # Secrets are only available to zama-ai organization members; when JOB_SECRET is empty, SECRETS_AVAILABLE is 'false' and the jobs fall back to EXTERNAL_CONTRIBUTION_RUNNER |
| 19 | + SECRETS_AVAILABLE: ${{ secrets.JOB_SECRET != '' }} |
| 20 | + EXTERNAL_CONTRIBUTION_RUNNER: "gpu_ubuntu-22.04" |
| 21 | + |
| 22 | +on: |
| 23 | + # Runs when a pull request is labeled; workflow_dispatch also allows running this workflow manually from the Actions tab. |
| 24 | + pull_request: |
| 25 | + types: [ labeled ] |
| 26 | + workflow_dispatch: |
| 27 | + |
| 28 | +permissions: |
| 29 | + contents: read |
| 30 | + |
| 31 | +# zizmor: ignore[concurrency-limits] concurrency is managed after instance setup to ensure safe provisioning |
| 32 | + |
| 33 | +jobs: |
| 34 | +  setup-instance: |
| 35 | +    name: gpu_memory_sanitizer/setup-instance |
| 36 | +    # Always run for non-pull-request triggers; for a pull request, run only when the 'approved' label is added. |
| 37 | +    if: ${{ github.event_name != 'pull_request' || (github.event.action == 'labeled' && github.event.label.name == 'approved') }} |
| 38 | +    runs-on: ubuntu-latest |
| 39 | +    outputs: |
| 40 | +      runner-name: ${{ steps.start-remote-instance.outputs.label || steps.start-github-instance.outputs.runner_group }} |
| 41 | +    steps: |
| 42 | +      - id: start-remote-instance |
| 43 | +        name: Start remote instance |
| 44 | +        if: ${{ env.SECRETS_AVAILABLE == 'true' }} |
| 45 | +        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac |
| 46 | +        with: |
| 47 | +          mode: start |
| 48 | +          backend: hyperstack |
| 49 | +          profile: single-h100 |
| 50 | +          slab-url: ${{ secrets.SLAB_BASE_URL }} |
| 51 | +          job-secret: ${{ secrets.JOB_SECRET }} |
| 52 | +          github-token: ${{ secrets.SLAB_ACTION_TOKEN }} |
| 53 | + |
| 54 | +      # Fallback when secrets are unavailable (pull requests from forked repositories): use the runner named by EXTERNAL_CONTRIBUTION_RUNNER. |
| 55 | +      - id: start-github-instance |
| 56 | +        name: Start GitHub instance |
| 57 | +        if: ${{ env.SECRETS_AVAILABLE == 'false' }} |
| 58 | +        run: | |
| 59 | +          echo "runner_group=${EXTERNAL_CONTRIBUTION_RUNNER}" >> "$GITHUB_OUTPUT" |
| 60 | +
|
| 61 | +  cuda-tests-linux: |
| 62 | +    name: gpu_memory_sanitizer/cuda-tests-linux |
| 63 | +    needs: setup-instance |
| 64 | +    # For pull requests, run only if setup-instance was not skipped; other triggers are not subject to that check. |
| 65 | +    if: ${{ github.event_name != 'pull_request' || needs.setup-instance.result != 'skipped' }} |
| 66 | +    runs-on: ${{ needs.setup-instance.outputs.runner-name }} |
| 67 | +    timeout-minutes: 240 |
| 68 | +    concurrency: |
| 69 | +      cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} |
| 70 | +      group: ${{ github.workflow_ref }} |
| 71 | +    strategy: |
| 72 | +      fail-fast: false |
| 73 | +      # Include-based build matrix listing only known valid option combinations. |
| 74 | +      matrix: |
| 75 | +        include: |
| 76 | +          - cuda: "12.8" |
| 77 | +            gcc: 11 |
| 78 | +            os: ubuntu-22.04 |
| 79 | +    steps: |
| 80 | +      - name: Checkout tfhe-rs |
| 81 | +        uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 |
| 82 | +        with: |
| 83 | +          token: ${{ env.CHECKOUT_TOKEN }} |
| 84 | +          persist-credentials: 'false' |
| 85 | + |
| 86 | +      - name: Setup Hyperstack dependencies |
| 87 | +        uses: ./.github/actions/gpu_setup |
| 88 | +        with: |
| 89 | +          gcc-version: ${{ matrix.gcc }} |
| 90 | +          cuda-version: ${{ matrix.cuda }} |
| 91 | +          github-instance: ${{ env.SECRETS_AVAILABLE == 'false' }} |
| 92 | + |
| 93 | +      - name: Find tools |
| 94 | +        run: | |
| 95 | +          find /usr -executable -name "compute-sanitizer" |
| 96 | +
|
| 97 | +      - name: Install latest stable |
| 98 | +        uses: dtolnay/rust-toolchain@b3b07ba8b418998c39fb20f53e8b695cdcc8de1b # zizmor: ignore[stale-action-refs] this action doesn't create releases |
| 99 | +        with: |
| 100 | +          toolchain: stable |
| 101 | + |
| 102 | +      - name: Run memory sanitizer |
| 103 | +        run: | |
| 104 | +          make test_high_level_api_gpu_sanitizer |
| 105 | +
|
| 106 | +  slack-notify: |
| 107 | +    name: gpu_memory_sanitizer/slack-notify |
| 108 | +    runs-on: ubuntu-latest |
| 109 | +    needs: [ setup-instance, cuda-tests-linux ] |
| 110 | +    continue-on-error: true  # a failing notification must not fail the workflow run |
| 111 | +    if: ${{ always() && needs.cuda-tests-linux.result != 'skipped' && failure() }}  # notify only when a needed job failed and the tests were not skipped |
| 112 | +    steps: |
| 113 | +      - name: Set pull-request URL |
| 114 | +        if: ${{ github.event_name == 'pull_request' && env.SECRETS_AVAILABLE == 'true' }} |
| 115 | +        env: |
| 116 | +          PR_NUMBER: ${{ github.event.pull_request.number }} |
| 117 | +          PR_BASE_URL: ${{ vars.PR_BASE_URL }} |
| 118 | +        run: | |
| 119 | +          echo "PULL_REQUEST_MD_LINK=[pull-request](${PR_BASE_URL}${PR_NUMBER}), " >> "${GITHUB_ENV}" |
| 120 | + |
| 121 | +      - name: Send message |
| 122 | +        if: ${{ env.SECRETS_AVAILABLE == 'true' }} |
| 123 | +        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 |
| 124 | +        env: |
| 125 | +          SLACK_MESSAGE: "GPU Memory Checks tests finished with status: ${{ needs.cuda-tests-linux.result }}. (${{ env.PULL_REQUEST_MD_LINK }}[action run](${{ env.ACTION_RUN_URL }}))" |
| 126 | +          SLACK_COLOR: ${{ needs.cuda-tests-linux.result }} |
| 127 | + |
| 128 | +  teardown-instance: |
| 129 | +    name: gpu_memory_sanitizer/teardown-instance |
| 130 | +    needs: [ setup-instance, cuda-tests-linux ] |
| 131 | +    if: always() && needs.setup-instance.result == 'success'  # runs whatever the test outcome, provided setup-instance succeeded |
| 132 | +    runs-on: ubuntu-latest |
| 133 | +    steps: |
| 134 | +      - id: stop-instance |
| 135 | +        name: Stop remote instance |
| 136 | +        if: ${{ env.SECRETS_AVAILABLE == 'true' }} |
| 137 | +        uses: zama-ai/slab-github-runner@79939325c3c429837c10d6041e4fd8589d328bac |
| 138 | +        with: |
| 139 | +          mode: stop |
| 140 | +          label: ${{ needs.setup-instance.outputs.runner-name }} |
| 141 | +          slab-url: ${{ secrets.SLAB_BASE_URL }} |
| 142 | +          job-secret: ${{ secrets.JOB_SECRET }} |
| 143 | +          github-token: ${{ secrets.SLAB_ACTION_TOKEN }} |
| 144 | + |
| 145 | +      - name: Slack Notification |
| 146 | +        uses: rtCamp/action-slack-notify@e31e87e03dd19038e411e38ae27cbad084a90661 |
| 147 | +        if: failure()  # only when an earlier step of this job failed |
| 148 | +        continue-on-error: true |
| 149 | +        env: |
| 150 | +          SLACK_MESSAGE: "Instance teardown (cuda-tests) finished with status: ${{ job.status }}. (${{ env.ACTION_RUN_URL }})" |
| 151 | +          SLACK_COLOR: ${{ job.status }} |
0 commit comments