# Workflow file for run #147 — "Benchmark for codex using z-ai/glm-4.6"
name: Manual Benchmark Run

# Manually-triggered benchmark runner: installs the selected agent CLI,
# runs the TypeScript exercise benchmark, and publishes a job summary
# plus result artifacts.
on:
  workflow_dispatch:
    inputs:
      agent:
        description: Agent to run
        required: true
        default: claude
        type: choice
        options:
          - claude
          - codex
          - gemini
          - opencode
          - goose
          - qwen
          - aider
          - cursor
          - copilot
      provider:
        description: Provider (openai, anthropic, google, openrouter, etc.)
        required: true
        default: anthropic
        type: choice
        options:
          - openai
          - groq
          - cerebras
          - anthropic
          - google
          - openrouter
          - dashscope
          - xai
          - deepseek
          - moonshot
          - zai
      model:
        description: Model to use (e.g., claude-sonnet-4-20250514)
        required: true
        default: claude-3-5-haiku-20241022
      exercise:
        description: "Exercise selection: number (first N), comma list, or name"
        required: false
        default: ""
      custom_instruction:
        description: "Additional instruction text appended to the prompt"
        required: false
        default: ""
      verbose:
        description: Verbose console output
        required: false
        default: false
        type: boolean

run-name: 'Benchmark for ${{ github.event.inputs.agent }} using ${{ github.event.inputs.model }}'

jobs:
  run-benchmark:
    name: Run Benchmark
    runs-on: ubuntu-latest
    permissions:
      contents: read
    timeout-minutes: 120
    env:
      RESULT_DIR: ./data/results
      OUTPUT_DIR: ./results
      USE_DOCKER: 'false'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Setup Node (for corepack/yarn)
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Enable corepack
        run: corepack enable

      - uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Install dependencies
        run: bun install --frozen-lockfile

      # `agent` is a choice-restricted input, so direct ${{ }} interpolation
      # here cannot inject arbitrary shell.
      - name: Install agent CLI (local mode)
        if: ${{ env.USE_DOCKER == 'false' }}
        run: |
          set -euo pipefail
          agent="${{ github.event.inputs.agent }}"
          echo "Installing CLI for agent: $agent"
          case "$agent" in
            claude)
              npm install -g @anthropic-ai/claude-code
              mkdir -p "$HOME/.claude"
              ;;
            codex)
              npm install -g @openai/codex
              # Provide default AGENTS.md (optional)
              echo "Solve this TypeScript exercise. Read the test file to understand requirements and implement the solution." > AGENTS.md
              ;;
            goose)
              CONFIGURE=false curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash
              echo "HOME=$HOME" >> "$GITHUB_ENV"
              echo "PATH=$HOME/.local/bin:$PATH" >> "$GITHUB_ENV"
              ;;
            aider)
              curl -LsSf https://aider.chat/install.sh | sh
              echo "PATH=$HOME/.local/bin:$PATH" >> "$GITHUB_ENV"
              ;;
            gemini)
              npm install -g @google/gemini-cli
              ;;
            qwen)
              npm install -g @qwen-code/qwen-code
              ;;
            opencode)
              npm install -g opencode-ai
              ;;
            cursor)
              curl -fsS https://cursor.com/install | bash
              echo "PATH=$HOME/.cursor/bin:$PATH" >> "$GITHUB_ENV"
              ;;
            copilot)
              npm install -g @github/copilot
              ;;
            *)
              echo "No installer defined for agent: $agent" >&2
              ;;
          esac

      - name: Setup Codex config.toml
        if: ${{ github.event.inputs.agent == 'codex' }}
        run: |
          set -euo pipefail
          mkdir -p "$HOME/.codex"
          CONFIG_PATH="$HOME/.codex/config.toml"
          echo "Writing Codex config to $CONFIG_PATH"
          # Quoted 'EOF' keeps the heredoc body literal (no shell expansion).
          cat > "$CONFIG_PATH" << 'EOF'
          # Auto-generated by CI (CLI provides model/provider)
          [model_providers.openai]
          name = "OpenAI"
          base_url = "https://api.openai.com/v1"
          env_key = "OPENAI_API_KEY"
          [model_providers.anthropic]
          name = "Anthropic"
          base_url = "https://api.anthropic.com/v1"
          env_key = "ANTHROPIC_API_KEY"
          [model_providers.cerebras]
          name = "Cerebras"
          base_url = "https://api.cerebras.ai/v1"
          env_key = "CEREBRAS_API_KEY"
          [model_providers.groq]
          name = "Groq"
          base_url = "https://api.groq.com/openai/v1"
          env_key = "GROQ_API_KEY"
          [model_providers.openrouter]
          name = "OpenRouter"
          base_url = "https://openrouter.ai/api/v1"
          env_key = "OPENROUTER_API_KEY"
          EOF

      - name: Check local agent availability
        if: ${{ env.USE_DOCKER == 'false' }}
        run: |
          set -euo pipefail
          agent="${{ github.event.inputs.agent }}"
          # Map each agent to its executable name. Note: store bare command
          # names only — `command -v` takes a single name; the previous
          # "codex --version" / "copilot --version" values made the lookup
          # fail unconditionally and always emitted the not-found warning.
          case "$agent" in
            claude) cmd=claude ;;
            goose) cmd=goose ;;
            aider) cmd=aider ;;
            codex) cmd=codex ;;
            gemini) cmd=gemini ;;
            opencode) cmd=opencode ;;
            qwen) cmd=qwen ;;
            cursor) cmd=cursor-agent ;;
            copilot) cmd=copilot ;;
            *) cmd="" ;;
          esac
          if [ -z "$cmd" ]; then
            echo "Unknown agent: $agent" >&2
            exit 1
          fi
          if command -v "$cmd" >/dev/null 2>&1; then
            echo "Agent CLI '$agent' found."
          else
            echo "::warning::Agent CLI '$agent' not found for local mode. Install it in this workflow or set USE_DOCKER to true." >&2
          fi

      - name: Run benchmark
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          CEREBRAS_API_KEY: ${{ secrets.CEREBRAS_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          DASHSCOPE_API_KEY: ${{ secrets.DASHSCOPE_API_KEY }}
          XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
          ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
          CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
          MOONSHOT_API_KEY: ${{ secrets.MOONSHOT_API_KEY }}
          ZAI_API_KEY: ${{ secrets.ZAI_API_KEY }}
          # Free-text inputs go through env vars instead of direct ${{ }}
          # interpolation into the script, preventing shell injection.
          AGENT: ${{ github.event.inputs.agent }}
          MODEL: ${{ github.event.inputs.model }}
          PROVIDER: ${{ github.event.inputs.provider }}
          EXERCISE: ${{ github.event.inputs.exercise }}
          VERBOSE: ${{ github.event.inputs.verbose }}
          CUSTOM_INSTRUCTION: ${{ github.event.inputs.custom_instruction }}
        run: |
          set -euo pipefail
          mkdir -p "$RESULT_DIR" "$OUTPUT_DIR"
          CMD=(bun src/index.ts \
            --agent "$AGENT" \
            --model "$MODEL" \
            --provider "$PROVIDER" \
            --output-dir "$OUTPUT_DIR")
          # Toggle Docker execution when USE_DOCKER is true
          if [ "$USE_DOCKER" = "true" ]; then
            CMD+=(--docker)
          fi
          # Persist results for GitHub Actions runs to power summaries
          CMD+=(--save-result --result-dir "$RESULT_DIR")
          # Exercise selection (number, comma list, or name)
          if [ -n "${EXERCISE:-}" ]; then
            CMD+=(--exercise "$EXERCISE")
          fi
          # Verbose output toggle
          if [ "$VERBOSE" = "true" ]; then
            CMD+=(--verbose)
          fi
          # Custom instruction (append to prompt)
          if [ -n "${CUSTOM_INSTRUCTION:-}" ]; then
            CMD+=(--custom-instruction "${CUSTOM_INSTRUCTION}")
          fi
          echo "Running: ${CMD[*]}"
          "${CMD[@]}" | tee benchmark-summary.txt

      # Fixed artifact name; safe-name step not needed
      - name: Write Job Summary
        if: always()
        env:
          # Same injection hardening as above: never splice free-text inputs
          # directly into the shell body.
          AGENT: ${{ github.event.inputs.agent }}
          MODEL: ${{ github.event.inputs.model }}
          PROVIDER: ${{ github.event.inputs.provider }}
          CUSTOM_INSTRUCTION: ${{ github.event.inputs.custom_instruction }}
        run: |
          {
            echo "## Benchmark Results"
            echo
            echo "- Agent: $AGENT"
            echo "- Model: $MODEL"
            echo "- Provider: $PROVIDER"
            if [ -n "${CUSTOM_INSTRUCTION:-}" ]; then
              PREVIEW=$(printf %s "$CUSTOM_INSTRUCTION" | tr '\n' ' ' | sed -E 's/\s+/ /g' | cut -c1-140)
              echo "- Custom Instruction: ${PREVIEW}$( [ ${#PREVIEW} -eq 140 ] && echo '…' )"
            else
              echo "- Custom Instruction: (none)"
            fi
            # If latest.json exists, print a compact summary
            if [ -f "$RESULT_DIR/latest.json" ]; then
              if command -v jq >/dev/null 2>&1; then
                TOTAL=$(jq -r '.summary.totalCount' "$RESULT_DIR/latest.json" 2>/dev/null || echo "")
                SUCCESS=$(jq -r '.summary.successCount' "$RESULT_DIR/latest.json" 2>/dev/null || echo "")
                RATE=$(jq -r '.summary.successRate' "$RESULT_DIR/latest.json" 2>/dev/null || echo "")
                TDUR_MS=$(jq -r '.summary.totalDuration' "$RESULT_DIR/latest.json" 2>/dev/null || echo "")
                ADUR_MS=$(jq -r '.summary.avgDuration' "$RESULT_DIR/latest.json" 2>/dev/null || echo "")
                # Pretty seconds (1 decimal if needed)
                if [ -n "$TDUR_MS" ] && [ "$TDUR_MS" != "null" ]; then
                  TDUR_S=$(jq -nr --argjson ms "$TDUR_MS" '((($ms/1000)*10)|round)/10')
                fi
                if [ -n "$ADUR_MS" ] && [ "$ADUR_MS" != "null" ]; then
                  ADUR_S=$(jq -nr --argjson ms "$ADUR_MS" '((($ms/1000)*10)|round)/10')
                fi
                echo
                echo "### Summary"
                echo "- Total: ${TOTAL:-n/a}"
                echo "- Success: ${SUCCESS:-n/a}"
                echo "- Success Rate: ${RATE:-n/a}%"
                if [ -n "${TDUR_S:-}" ]; then
                  echo "- Duration: ${TDUR_S}s"
                fi
                if [ -n "${ADUR_S:-}" ]; then
                  echo "- Avg: ${ADUR_S}s"
                fi
                echo
                echo "### Problem Results"
                # List each problem with link + mark + time
                jq -r '
                  .results[] |
                  "- [" + .exercise + "](https://exercism.org/tracks/typescript/exercises/" + .exercise + ") "
                  + (if .overallSuccess then "✅" else "❌" end)
                  + " "
                  + (((.totalDuration/1000*10)|round)/10|tostring) + "s"
                ' "$RESULT_DIR/latest.json" || true
              fi
            fi
            echo
            echo "### Docs"
            echo "- Methodology: [docs/METHODOLOGY.md](https://github.com/laiso/ts-bench/blob/main/docs/METHODOLOGY.md)"
            echo "- README: [README.md](https://github.com/laiso/ts-bench/)"
          } >> "$GITHUB_STEP_SUMMARY"
          if [ -f benchmark-summary.txt ]; then
            node scripts/append-gh-summary.mjs
          fi

      - name: Upload artifacts (results + console)
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: |
            ${{ env.RESULT_DIR }}/
            ${{ env.OUTPUT_DIR }}/${{ github.event.inputs.agent }}/logs/
            benchmark-summary.txt
          if-no-files-found: warn
          retention-days: 14