Monitor Self-Hosted Runners #9685
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: triggers plus the settings shared by every step.
name: Monitor Self-Hosted Runners

on:
  # push:  # uncomment for testing if on a branch in the azure_scripts repo
  schedule:
    - cron: '*/15 * * * *'  # every 15 minutes
    - cron: '0 9 * * 1'  # weekly report every Monday at 9 AM UTC
  workflow_dispatch:
    inputs:
      send_weekly_report:
        type: boolean
        description: 'Send a weekly report to Zulip'
        required: false
        default: false

env:
  # Where notifications are posted
  ZULIP_SERVER: 'https://leanprover.zulipchat.com'
  ZULIP_CHANNEL: 'CI admins'
  # Files carried from run to run through the Actions cache
  STATE_FILE: 'runner-state.json'
  STATS_FILE: 'runner-stats.json'
  # Runs on master use the versioned cache keys; any other ref uses the "-testing" keys
  CACHE_KEY: ${{ github.ref == 'refs/heads/master' && 'runner-monitor-state-v4' || 'runner-monitor-state-testing' }}
  STATS_CACHE_KEY: ${{ github.ref == 'refs/heads/master' && 'runner-monitor-stats-v4' || 'runner-monitor-stats-testing' }}
| jobs: | |
| monitor-runners: | |
| runs-on: ubuntu-latest | |
| steps: | |
# Fetch the runner state file saved by an earlier run. The primary key ends in
# this run's id; restore-keys supplies the shared prefix used as the fallback.
- name: Restore previous state
  id: cache-restore
  uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809  # v4.2.4
  with:
    key: ${{ env.CACHE_KEY }}-${{ github.run_id }}
    restore-keys: ${{ env.CACHE_KEY }}
    path: ${{ env.STATE_FILE }}
# Fetch the statistics file saved by an earlier run. The primary key ends in
# this run's id; restore-keys supplies the shared prefix used as the fallback.
- name: Restore previous stats
  id: stats-cache-restore
  uses: actions/cache/restore@0400d5f644dc74513175e3cd8d07132dd4860809  # v4.2.4
  with:
    key: ${{ env.STATS_CACHE_KEY }}-${{ github.run_id }}
    restore-keys: ${{ env.STATS_CACHE_KEY }}
    path: ${{ env.STATS_FILE }}
# Seed an empty state document when the restore step reported no cache-hit value.
- name: Create empty state
  if: ${{ steps.cache-restore.outputs.cache-hit == '' }}
  run: |
    echo "No previous state file found, creating empty state"
    # STATE_FILE comes from the workflow-level env block
    printf '%s\n' '{"last_run": "", "runners": {}}' > "$STATE_FILE"
# Seed an empty statistics document when the restore step reported no cache-hit value.
- name: Create empty stats
  if: ${{ steps.stats-cache-restore.outputs.cache-hit == '' }}
  run: |
    echo "No previous stats file found, creating empty stats"
    # STATS_FILE comes from the workflow-level env block
    printf '%s\n' '{"runners": {}, "last_cleanup": ""}' > "$STATS_FILE"
# Query the organization's self-hosted runners, update the per-runner offline
# counters (state file) and the 7-day state history (stats file), and build the
# Zulip status message.
# Step outputs: is_weekly_report, should_notify, message.
# Side effect: leaves runners_response.json behind for the label management step.
- name: Check self-hosted runners
  id: check-runners
  run: |
    current_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
    echo "Current time: $current_time"
    echo "Fetching organization runners..."
    # Get all self-hosted runners for the organization (the MONITOR_RUNNERS_GITHUB_TOKEN requires admin:org permissions).
    # per_page=100 is the largest page the API serves; the default page size stops at 30 runners.
    response=$(curl -s -H "Authorization: token ${{ secrets.MONITOR_RUNNERS_GITHUB_TOKEN }}" \
      -H "Accept: application/vnd.github+json" \
      "https://api.github.com/orgs/${{ github.repository_owner }}/actions/runners?per_page=100")
    # Save response for label management step
    echo "$response" > runners_response.json
    # Stop with an explicit error when the API did not return a runner list
    # (bad token, rate limit, empty body). Nothing has been written to the
    # state or stats files yet, so the previous data is kept untouched.
    if ! echo "$response" | jq -e '.runners | type == "array"' > /dev/null 2>&1; then
      echo "::error::GitHub API did not return a list of runners"
      echo "Response: $response"
      exit 1
    fi
    # Load previous state and stats
    previous_state=$(cat "${{ env.STATE_FILE }}")
    previous_stats=$(cat "${{ env.STATS_FILE }}")
    echo "::group::previous_state"
    echo "$previous_state"
    echo "::endgroup::"
    echo "::group::previous_stats"
    echo "$previous_stats"
    echo "::endgroup::"
    # Initialize new stats structure
    new_stats=$(echo "$previous_stats" | jq --arg time "$current_time" '.last_cleanup = $time')
    # Clean up stats older than 7 days
    seven_days_ago=$(date -u -d "7 days ago" +"%Y-%m-%dT%H:%M:%SZ")
    new_stats=$(echo "$new_stats" | jq --arg cutoff "$seven_days_ago" '
      .runners = (.runners // {} |
        to_entries |
        map(select(.value.history) |
          .value.history = (.value.history | map(select(.timestamp >= $cutoff)))
        ) |
        from_entries
      )
    ')
    echo "::group::new_stats after 7-day cleanup"
    echo "$new_stats"
    echo "::endgroup::"
    # Process current runners.
    # The loop is the tail of a pipeline and therefore runs in a subshell:
    # results are handed to the rest of the script through *.tmp files,
    # not through shell variables.
    echo "$response" | jq -r '.runners[] | "\(.name)|\(.status)|\(.busy)|\(.labels | map(.name) | join(","))"' | while IFS='|' read -r name status busy labels; do
      if [ -z "$name" ]; then continue; fi
      echo "::group::Processing runner: $name"
      # Determine runner state (Idle, Active, Offline)
      if [ "$status" != "online" ]; then
        runner_state="Offline"
      elif [ "$busy" = "true" ]; then
        runner_state="Active"
      else
        runner_state="Idle"
      fi
      echo "runner_state: $runner_state"
      # Get previous state for this runner
      prev_status=$(echo "$previous_state" | jq -r --arg name "$name" '.runners[$name].status // "unknown"')
      prev_consecutive=$(echo "$previous_state" | jq -r --arg name "$name" '.runners[$name].consecutive_offline // 0')
      echo "prev_status: $prev_status"
      echo "prev_consecutive: $prev_consecutive"
      if [ "$status" != "online" ]; then
        # Runner is offline
        if [ "$prev_status" = "online" ] || [ "$prev_status" = "unknown" ]; then
          # First time offline
          consecutive_offline=1
          echo "NEWLY_OFFLINE: $name (labels: $labels)" | tee -a newly_offline.tmp
        else
          # Still offline, increment counter
          consecutive_offline=$((prev_consecutive + 1))
          if [ "$consecutive_offline" -ge 2 ]; then
            echo "PERSISTENT_OFFLINE: $name,$consecutive_offline,$labels" | tee -a persistent_offline.tmp
          fi
        fi
      else
        # Runner is online
        consecutive_offline=0
        if [ "$prev_status" = "offline" ] && [ "$prev_consecutive" -ge 2 ]; then
          # Runner came back online
          echo "BACK_ONLINE: $name,$prev_consecutive,$labels" | tee -a back_online.tmp
        fi
      fi
      # Update state for this runner
      echo "updated state:"
      echo "$name,$status,$consecutive_offline,$labels" | tee -a current_runners.tmp
      # Update stats with current state
      echo "updated stats:"
      echo "$name,$runner_state,$labels" | tee -a current_runner_states.tmp
      echo "::endgroup::"
    done
    # Update stats file with all current runner states.
    # The file only exists when at least one runner was listed, so guard the
    # read the same way the state loop below does.
    if [ -f "current_runner_states.tmp" ]; then
      # labels is the last field, so it keeps the commas of the label list
      while IFS=',' read -r name state labels; do
        if [ -n "$name" ]; then
          new_stats=$(echo "$new_stats" | jq --arg name "$name" --arg state "$state" --arg time "$current_time" --arg labels "$labels" '
            .runners[$name] = (.runners[$name] // {"history": [], "labels": ""}) |
            .runners[$name].labels = $labels |
            .runners[$name].history += [{"timestamp": $time, "state": $state}]
          ')
        fi
      done < current_runner_states.tmp
    fi
    echo "::group::new_stats after processing"
    echo "$new_stats"
    echo "::endgroup::"
    # Save updated stats
    echo "$new_stats" > "${{ env.STATS_FILE }}"
    # Build new state file (runners that are no longer listed are dropped)
    new_state=$(echo "$previous_state" | jq --arg time "$current_time" '.last_run = $time | .runners = {}')
    if [ -f "current_runners.tmp" ]; then
      while IFS=',' read -r name status consecutive labels; do
        if [ -n "$name" ]; then
          new_state=$(echo "$new_state" | jq --arg name "$name" --arg status "$status" --argjson consecutive "$consecutive" --arg labels "$labels" \
            '.runners[$name] = {"status": $status, "consecutive_offline": $consecutive, "labels": $labels}')
        fi
      done < current_runners.tmp
      echo "::group::Formatted current runners info"
      cat current_runners.tmp
      echo "::endgroup::"
    fi
    # Save new state
    echo "$new_state" > "${{ env.STATE_FILE }}"
    # Prepare notification messages
    notification_message=""
    # Check for runners that came back online
    if [ -s "back_online.tmp" ]; then
      notification_message+="✅ **[Runners](https://github.com/organizations/${{ github.repository_owner }}/settings/actions/runners) back online:**\n\n"
      while IFS=',' read -r name prev_consecutive labels; do
        # Strip the log marker written in front of the runner name
        name=${name#BACK_ONLINE: }
        if [ -n "$labels" ]; then
          notification_message+="- \`$name\` (was offline for ${prev_consecutive} checks, labels: \`$labels\`)\n"
        else
          notification_message+="- \`$name\` (was offline for ${prev_consecutive} checks, no labels)\n"
        fi
      done < back_online.tmp
      notification_message+="\n"
    fi
    # Check for persistently offline runners (≥2 consecutive runs)
    if [ -s "persistent_offline.tmp" ]; then
      notification_message+="⚠️ **[Runners](https://github.com/organizations/${{ github.repository_owner }}/settings/actions/runners) offline for multiple checks:**\n\n"
      while IFS=',' read -r name consecutive labels; do
        # Strip the log marker written in front of the runner name
        name=${name#PERSISTENT_OFFLINE: }
        if [ -n "$labels" ]; then
          notification_message+="- \`$name\` (${consecutive} consecutive checks, labels: \`$labels\`)\n"
        else
          notification_message+="- \`$name\` (${consecutive} consecutive checks, no labels)\n"
        fi
      done < persistent_offline.tmp
      notification_message+="\n"
    fi
    # Check if this is a weekly report trigger
    is_weekly_report="false"
    if [ "${{ github.event.schedule }}" = "0 9 * * 1" ] || [ "${{ toJSON(inputs.send_weekly_report) }}" = "true" ]; then
      is_weekly_report="true"
    fi
    echo "is_weekly_report=$is_weekly_report" >> "$GITHUB_OUTPUT"
    # Set should_notify output
    if [ -n "$notification_message" ]; then
      echo "should_notify=true" >> "$GITHUB_OUTPUT"
    else
      echo "should_notify=false" >> "$GITHUB_OUTPUT"
    fi
    # Save message to output (multi-line value, delimited by EOF)
    {
      echo "message<<EOF"
      echo -e "$notification_message"
      echo "EOF"
    } >> "$GITHUB_OUTPUT"
    # Clean up temp files
    rm -f newly_offline.tmp persistent_offline.tmp back_online.tmp current_runners.tmp current_runner_states.tmp
# Ask the bors instance whether any batch is currently active.
# Step output: result ("true" / "false").
# Whenever the answer cannot be determined (request fails, body is not
# parsable JSON) the step reports "true", i.e. it assumes bors is active.
- name: Find out if there are active bors batches
  id: bors_active
  run: |
    response=$(curl -sf "https://mathlib-bors-ca18eefec4cb.herokuapp.com/api/active-batches") || { echo "result=true" >> "$GITHUB_OUTPUT"; exit 0; }
    # Same fallback as for a failed request: without it a jq parse error
    # would abort the step (the shell runs with -e) and skip label management.
    length=$(echo "$response" | jq '.batch_ids | length') || { echo "result=true" >> "$GITHUB_OUTPUT"; exit 0; }
    if [ "$length" -gt 0 ]; then
      echo "result=true" >> "$GITHUB_OUTPUT"
    else
      echo "result=false" >> "$GITHUB_OUTPUT"
    fi
# Keep the custom runner labels in line with the bors status:
#   * every runner carries the 'bors' label;
#   * while bors is active, at least one runner has no 'pr' label;
#   * while bors is inactive, every runner carries the 'pr' label.
# Reads runners_response.json (written by the check-runners step) and deletes it.
# Step outputs: label_summary, label_errors, has_label_errors.
- name: Manage runner labels based on bors status
  id: manage-labels
  run: |
    bors_active="${{ steps.bors_active.outputs.result }}"
    echo "Bors active: $bors_active"
    # Load the runners response
    response=$(cat runners_response.json)
    # Initialize outputs
    label_summary=""
    label_errors=""
    # Helper function to add a label to a runner.
    # Args: runner id, runner name, label.
    # Appends to label_summary on success and to label_errors on failure;
    # returns 0 / 1 accordingly.
    add_label() {
      local runner_id=$1
      local runner_name=$2
      local label=$3
      local update_response
      update_response=$(curl -s -X POST \
        -H "Authorization: token ${{ secrets.MONITOR_RUNNERS_GITHUB_TOKEN }}" \
        -H "Accept: application/vnd.github+json" \
        "https://api.github.com/orgs/${{ github.repository_owner }}/actions/runners/$runner_id/labels" \
        -d "{\"labels\":[\"$label\"]}") || update_response=""
      # A successful call answers with the runner's label list
      if echo "$update_response" | jq -e '.labels' > /dev/null 2>&1; then
        label_summary+="🏷️ Added \`$label\` label to runner \`$runner_name\`\n"
        return 0
      else
        echo "ERROR: Failed to add label $label to runner $runner_name"
        echo "Response: $update_response"
        label_errors+="❌ Failed to add \`$label\` label to runner \`$runner_name\`\n"
        return 1
      fi
    }
    # Helper function to remove a label from a runner.
    # Args: runner id, runner name, label.
    # Appends to label_summary on success and to label_errors on failure;
    # returns 0 / 1 accordingly.
    remove_label() {
      local runner_id=$1
      local runner_name=$2
      local label=$3
      local body_file http_code update_response
      body_file=$(mktemp)
      # Success is judged by the HTTP status: "curl -s" exits 0 on 4xx
      # answers too, and "$?" after "local var=$(...)" would only report the
      # status of "local" itself.
      http_code=$(curl -s -o "$body_file" -w '%{http_code}' -X DELETE \
        -H "Authorization: token ${{ secrets.MONITOR_RUNNERS_GITHUB_TOKEN }}" \
        -H "Accept: application/vnd.github+json" \
        "https://api.github.com/orgs/${{ github.repository_owner }}/actions/runners/$runner_id/labels/$label") || http_code="000"
      update_response=$(cat "$body_file")
      rm -f "$body_file"
      # Any 2xx status counts as success; 404/422 (or 000 for a transport failure) is an error
      if [[ "$http_code" == 2* ]]; then
        label_summary+="🏷️ Removed \`$label\` label from runner \`$runner_name\`\n"
        return 0
      else
        echo "ERROR: Failed to remove label $label from runner $runner_name (HTTP $http_code)"
        echo "Response: $update_response"
        label_errors+="❌ Failed to remove \`$label\` label from runner \`$runner_name\`\n"
        return 1
      fi
    }
    # NOTE: every helper call below ends in "|| true". The shell runs with -e,
    # so a bare non-zero return would end the script before label_errors and
    # has_label_errors are written, and the Zulip error notification would
    # never be sent. Failures are already recorded in label_errors.
    # Extract runner data (id, name, custom labels)
    echo "Extracting runner data..."
    runner_data=$(echo "$response" | jq -r '.runners[] | "\(.id)|\(.name)|\([.labels[] | select(.type == "custom") | .name] | join(","))"')
    if [ -z "$runner_data" ]; then
      echo "ERROR: No runners found"
      label_errors+="❌ **Label Management Error:** No runners found in organization\n\n"
    else
      # Ensure all runners have 'bors' label
      echo "Checking if all runners have 'bors' label..."
      while IFS='|' read -r runner_id runner_name labels; do
        # Check if 'bors' is present (commas on both sides force a whole-label match)
        if [[ ",$labels," != *",bors,"* ]]; then
          echo "Adding 'bors' label to runner: $runner_name"
          add_label "$runner_id" "$runner_name" "bors" || true
        fi
      done <<< "$runner_data"
      # Now manage 'pr' label based on bors status
      if [ "$bors_active" = "true" ]; then
        echo "Managing 'pr' labels (bors active - one runner should NOT have 'pr')..."
        # Check if there's already a runner without 'pr'
        # NOTE(review): this also matches offline runners - confirm that is intended
        runner_without_pr=$(echo "$response" | jq -r '
          .runners[] |
          select(
            ([.labels[] | select(.type == "custom") | .name] | contains(["pr"]) | not)
          ) |
          .name
        ' | head -n 1)
        if [ -n "$runner_without_pr" ]; then
          echo "Runner '$runner_without_pr' already lacks 'pr' label - no changes needed"
          label_summary+="✅ Runner \`$runner_without_pr\` already lacks \`pr\` label (no changes needed)\n"
        else
          echo "All runners have 'pr' - selecting one to remove it from"
          # Find an idle runner first, fall back to any online runner
          selected_runner=$(echo "$response" | jq -r '.runners[] | select(.status == "online" and .busy == false) | "\(.id)|\(.name)"' | head -n 1)
          if [ -z "$selected_runner" ]; then
            echo "No idle runner found, selecting any online runner"
            selected_runner=$(echo "$response" | jq -r '.runners[] | select(.status == "online") | "\(.id)|\(.name)"' | head -n 1)
          fi
          if [ -z "$selected_runner" ]; then
            echo "ERROR: No online runners found"
            label_errors+="❌ **Label Management Error:** No online runners available to remove \`pr\` label from\n\n"
          else
            IFS='|' read -r runner_id runner_name <<< "$selected_runner"
            echo "Removing 'pr' label from runner: $runner_name"
            remove_label "$runner_id" "$runner_name" "pr" || true
          fi
        fi
      else
        echo "Managing 'pr' labels (bors inactive - all runners should have 'pr')..."
        all_have_pr=true
        while IFS='|' read -r runner_id runner_name labels; do
          # Check if 'pr' is present
          if [[ ",$labels," != *",pr,"* ]]; then
            all_have_pr=false
            echo "Adding 'pr' label to runner: $runner_name"
            add_label "$runner_id" "$runner_name" "pr" || true
          fi
        done <<< "$runner_data"
        if [ "$all_have_pr" = true ]; then
          echo "All runners already have 'pr' label"
          label_summary+="✅ All runners already have \`pr\` label\n"
        fi
      fi
    fi
    # Save summary to output (for logging)
    {
      echo "label_summary<<EOF"
      echo -e "$label_summary"
      echo "EOF"
    } >> "$GITHUB_OUTPUT"
    # Only set error output if there are actual errors
    if [ -n "$label_errors" ]; then
      {
        echo "label_errors<<EOF"
        echo -e "$label_errors"
        echo "EOF"
        echo "has_label_errors=true"
      } >> "$GITHUB_OUTPUT"
    else
      {
        echo "label_errors="
        echo "has_label_errors=false"
      } >> "$GITHUB_OUTPUT"
    fi
    # Clean up
    rm -f runners_response.json
| # Build the statistics report from the stats file and expose it as the weekly_message step output. | |
| # The report is generated on every run; the condition on the "Send weekly report on Zulip" step decides whether it is posted. | |
| - name: Generate stats report | |
| id: weekly-stats | |
| run: | | |
| echo "Generating statistics report..." | |
| # Load stats | |
| stats=$(cat "${{ env.STATS_FILE }}") | |
| # Get list of runners that have data | |
| runners=$(echo "$stats" | jq -r '.runners | keys[]' | sort) | |
| # Create temporary file for weekly report content | |
| weekly_report_file="weekly_report.tmp" | |
| # Write report header | |
| # (tee's stdout is redirected here, so the header goes to the report file only and is not echoed to the log; this also truncates any existing file) | |
| tee > "$weekly_report_file" << 'EOF' | |
| 📊 **Weekly Runner Statistics Report** | |
| EOF | |
| echo "*Period: Last 7 days • Generated: $(date -u +'%Y-%m-%d %H:%M UTC')*" | tee -a "$weekly_report_file" | |
| echo "" | tee -a "$weekly_report_file" | |
| if [ -z "$runners" ]; then | |
| echo "No runner data available for the past 7 days." | tee -a "$weekly_report_file" | |
| else | |
| # Write table header | |
| cat >> "$weekly_report_file" << 'EOF' | |
| | Runner | Idle | Active | Offline | Labels | | |
| |--------|------|---------|---------|--------| | |
| EOF | |
| # Process each runner and write to temp file | |
| # (the loop is the tail of a pipeline; its rows reach the rest of the script through the report file, appended with tee -a, which also echoes them to the log) | |
| echo "$runners" | while read -r runner; do | |
| if [ -z "$runner" ]; then continue; fi | |
| echo "Processing stats for runner: $runner" | |
| # Calculate percentages using jq | |
| # (each percentage is the share of this runner's history entries in that state, rounded to two decimals; the result is one line with four pipe-separated fields) | |
| runner_stats=$(echo "$stats" | jq -r --arg runner "$runner" ' | |
| .runners[$runner] as $data | | |
| ($data.history | length) as $total | | |
| if $total == 0 then | |
| "0.0|0.0|0.0|\($data.labels // "")" | |
| else | |
| ($data.history | map(select(.state == "Idle")) | length) as $idle | | |
| ($data.history | map(select(.state == "Active")) | length) as $active | | |
| ($data.history | map(select(.state == "Offline")) | length) as $offline | | |
| (($idle * 100.0 / $total) | .*100 | round / 100) as $idle_pct | | |
| (($active * 100.0 / $total) | .*100 | round / 100) as $active_pct | | |
| (($offline * 100.0 / $total) | .*100 | round / 100) as $offline_pct | | |
| "\($idle_pct)|\($active_pct)|\($offline_pct)|\($data.labels // "")" | |
| end | |
| ') | |
| # labels is read last, so it receives the remainder of the line | |
| IFS='|' read -r idle_pct active_pct offline_pct labels <<< "$runner_stats" | |
| # Format labels for display | |
| if [ -z "$labels" ] || [ "$labels" = "null" ]; then | |
| labels_display="-" | |
| else | |
| labels_display="\`$labels\`" | |
| fi | |
| echo "| \`$runner\` | ${idle_pct}% | ${active_pct}% | ${offline_pct}% | $labels_display |" | tee -a "$weekly_report_file" | |
| done | |
| # Calculate concurrent state statistics | |
| echo "Calculating concurrent state statistics..." | |
| concurrent_stats=$(echo "$stats" | jq -r ' | |
| . as $root | | |
| # Get all unique timestamps across all runners | |
| [.runners[].history[].timestamp] | unique | sort as $timestamps | | |
| # For each timestamp, check the state of each runner. | |
| # If a runner does not have a state recorded at a specific timestamp, | |
| # we use the most recent state before that timestamp, | |
| # or default to "Offline" if no prior state exists. | |
| $timestamps | map(. as $ts | | |
| [$root.runners[] | .history | map(select(.timestamp <= $ts)) | last | .state // "Offline"] | | |
| { | |
| timestamp: $ts, | |
| all_idle: (all(. == "Idle")), | |
| all_busy: (all(. == "Active")), | |
| states: . | |
| } | |
| ) | | |
| # Calculate percentages | |
| length as $total | | |
| if $total == 0 then | |
| "0.0|0.0" | |
| else | |
| (map(select(.all_idle)) | length) as $all_idle_count | | |
| (map(select(.all_busy)) | length) as $all_busy_count | | |
| (($all_idle_count * 100.0 / $total) | .*100 | round / 100) as $all_idle_pct | | |
| (($all_busy_count * 100.0 / $total) | .*100 | round / 100) as $all_busy_pct | | |
| "\($all_idle_pct)|\($all_busy_pct)" | |
| end | |
| ') | |
| IFS='|' read -r all_idle_pct all_busy_pct <<< "$concurrent_stats" | |
| # Add concurrent statistics to report | |
| # (unquoted heredoc delimiter: the ${...} references in the text are expanded by the shell) | |
| cat >> "$weekly_report_file" << EOF | |
| **Overall Statistics:** | |
| - **All runners idle**: ${all_idle_pct}% of monitoring periods | |
| - **All runners busy**: ${all_busy_pct}% of monitoring periods | |
| EOF | |
| # Add legend and footer | |
| # (unquoted heredoc delimiter again: the $(...) in the last line runs jq to count all history entries; tee's stdout is redirected, so this text is appended to the file without being echoed to the log) | |
| tee >> "$weekly_report_file" << EOF | |
| **Legend:** | |
| • **Idle**: Runner online but not executing jobs | |
| • **Active**: Runner online and executing jobs | |
| • **Offline**: Runner not responding | |
| *Statistics based on $(echo "$stats" | jq -r '[.runners[].history[]] | length') data points collected every 15 minutes.* | |
| EOF | |
| fi | |
| # Publish the whole report as the multi-line step output weekly_message (EOF is the delimiter) | |
| echo "weekly_message<<EOF" >> $GITHUB_OUTPUT | |
| cat "$weekly_report_file" >> $GITHUB_OUTPUT | |
| echo "EOF" >> $GITHUB_OUTPUT | |
| # Clean up temp file | |
| rm -f "$weekly_report_file" | |
# Post the status message, but only when the check step set should_notify to 'true'.
- name: Send status message on Zulip
  if: ${{ steps.check-runners.outputs.should_notify == 'true' }}
  uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5  # v1.0.2
  with:
    organization-url: ${{ env.ZULIP_SERVER }}
    email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
    api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
    type: stream
    to: ${{ env.ZULIP_CHANNEL }}
    topic: Runner Status
    content: |
      ${{ steps.check-runners.outputs.message }}
# Post the collected label errors, but only when the label step set has_label_errors to 'true'.
- name: Send label management notification on Zulip
  if: ${{ steps.manage-labels.outputs.has_label_errors == 'true' }}
  uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5  # v1.0.2
  with:
    organization-url: ${{ env.ZULIP_SERVER }}
    email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
    api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
    type: stream
    to: ${{ env.ZULIP_CHANNEL }}
    topic: Runner Status
    content: |
      ${{ steps.manage-labels.outputs.label_errors }}
# Post the statistics report, but only when the check step set is_weekly_report to 'true'.
- name: Send weekly report on Zulip
  if: ${{ steps.check-runners.outputs.is_weekly_report == 'true' }}
  uses: zulip/github-actions-zulip/send-message@e4c8f27c732ba9bd98ac6be0583096dea82feea5  # v1.0.2
  with:
    organization-url: ${{ env.ZULIP_SERVER }}
    email: ${{ secrets.ZULIP_MONITOR_RUNNERS_BOT_EMAIL }}
    api-key: ${{ secrets.ZULIP_MONITOR_RUNNERS_API_KEY }}
    type: stream
    to: ${{ env.ZULIP_CHANNEL }}
    topic: Weekly Runner Report
    content: |
      ${{ steps.weekly-stats.outputs.weekly_message }}
# Store the state file under a key unique to this run. Runs even after a
# failed step, but never for manually dispatched runs.
- name: Save state to cache
  if: ${{ always() && github.event_name != 'workflow_dispatch' }}
  uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809  # v4.2.4
  with:
    key: ${{ env.CACHE_KEY }}-${{ github.run_id }}
    path: ${{ env.STATE_FILE }}
# Store the stats file under a key unique to this run. Runs even after a
# failed step, but never for manually dispatched runs.
- name: Save stats to cache
  if: ${{ always() && github.event_name != 'workflow_dispatch' }}
  uses: actions/cache/save@0400d5f644dc74513175e3cd8d07132dd4860809  # v4.2.4
  with:
    key: ${{ env.STATS_CACHE_KEY }}-${{ github.run_id }}
    path: ${{ env.STATS_FILE }}
# Attach both JSON files to the run as an artifact, regardless of earlier step results.
- name: upload files as artifact
  if: ${{ always() }}
  uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4.6.2
  with:
    path: |
      ${{ env.STATE_FILE }}
      ${{ env.STATS_FILE }}
    name: monitor_runners_artifact
# Print a human-readable recap of the run to the job log: both JSON files,
# a few counts, the label summary, the statistics report, and which
# notifications the earlier steps asked for.
- name: Log summary
  run: |
    # STATE_FILE and STATS_FILE come from the workflow-level env block
    echo "=== Runner Monitor Summary ==="
    echo "::group::State file contents:"
    cat "$STATE_FILE" | jq .
    echo "::endgroup::"
    echo "::group::Stats file contents:"
    cat "$STATS_FILE" | jq .
    echo "::endgroup::"
    echo ""
    echo "=== Statistics Summary ==="
    tracked=$(cat "$STATS_FILE" | jq '.runners | keys | length')
    echo "Runners tracked: $tracked"
    samples=$(cat "$STATS_FILE" | jq '[.runners[].history[]] | length')
    echo "Total data points: $samples"
    echo ""
    echo "=== Label Management Summary ==="
    echo "Bors active: ${{ steps.bors_active.outputs.result }}"
    # Quoted delimiter: the substituted text is printed verbatim by the shell
    cat << 'EOF'
    ${{ steps.manage-labels.outputs.label_summary }}
    EOF
    case "${{ steps.manage-labels.outputs.has_label_errors }}" in
      true)
        echo "⚠️ Label management errors occurred (see Zulip notification)"
        ;;
    esac
    echo ""
    echo "::group::=== 7-Day Statistics Report ==="
    cat << 'EOF'
    ${{ steps.weekly-stats.outputs.weekly_message }}
    EOF
    echo "::endgroup::"
    # Both outcomes start with an empty separator line
    echo ""
    case "${{ steps.check-runners.outputs.should_notify }}" in
      true)
        echo "📢 Status notifications sent to Zulip"
        ;;
      *)
        echo "✅ No status notifications needed - all runners stable"
        ;;
    esac
    case "${{ steps.check-runners.outputs.is_weekly_report }}" in
      true)
        echo "📊 Weekly report sent to Zulip"
        ;;
      *)
        echo "📊 Weekly report generated but not sent (not scheduled weekly run)"
        ;;
    esac
# Separate job that runs the keepalive action on scheduled triggers only.
# It is the only job granted write access to Actions.
# NOTE(review): presumably this stops GitHub from disabling the schedule after
# a period of repository inactivity - confirm against the action's README.
workflow-keepalive:
  if: ${{ github.event_name == 'schedule' }}
  permissions:
    actions: write
  runs-on: ubuntu-latest
  steps:
    - uses: liskin/gh-workflow-keepalive@f72ff1a1336129f29bf0166c0fd0ca6cf1bcb38c  # v1.2.1