Benchmark for codex using gpt-5.1-codex #146

```yaml
name: Manual Benchmark Run

on:
  workflow_dispatch:
    inputs:
      agent:
        description: Agent to run
        required: true
        default: claude
        type: choice
        options:
          - claude
          - codex
          - gemini
          - opencode
          - goose
          - qwen
          - aider
          - cursor
          - copilot
      provider:
        description: Provider (openai, anthropic, google, openrouter, etc.)
        required: true
        default: anthropic
        type: choice
        options:
          - openai
          - groq
          - cerebras
          - anthropic
          - openrouter
          - dashscope
          - xai
          - deepseek
          - moonshot
          - zai
      model:
        description: Model to use (e.g., claude-sonnet-4-20250514)
        required: true
        default: claude-3-5-haiku-20241022
      exercise:
        description: "Exercise selection: number (first N), comma list, or name"
        required: false
        default: ""
      custom_instruction:
        description: "Additional instruction text appended to the prompt"
        required: false
        default: ""
      verbose:
        description: Verbose console output
        required: false
        default: false
        type: boolean

run-name: 'Benchmark for ${{ github.event.inputs.agent }} using ${{ github.event.inputs.model }}'

jobs:
  run-benchmark:
    name: Run Benchmark
    runs-on: ubuntu-latest
    permissions:
      contents: read
    timeout-minutes: 120
    env:
      RESULT_DIR: ./data/results
      OUTPUT_DIR: ./results
      USE_DOCKER: 'false'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Setup Node (for corepack/yarn)
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      - name: Enable corepack
        run: corepack enable

      - uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest

      - name: Install dependencies
        run: bun install --frozen-lockfile

      - name: Install agent CLI (local mode)
        if: ${{ env.USE_DOCKER == 'false' }}
        run: |
          set -euo pipefail
          agent="${{ github.event.inputs.agent }}"
          echo "Installing CLI for agent: $agent"
          case "$agent" in
            claude)
              npm install -g @anthropic-ai/claude-code
              mkdir -p "$HOME/.claude"
              ;;
            codex)
              npm install -g @openai/codex
              # Provide default AGENTS.md (optional)
              echo "Solve this TypeScript exercise. Read the test file to understand requirements and implement the solution." > AGENTS.md
              ;;
            goose)
              CONFIGURE=false curl -fsSL https://github.com/block/goose/releases/download/stable/download_cli.sh | bash
              echo "HOME=$HOME" >> "$GITHUB_ENV"
              echo "PATH=$HOME/.local/bin:$PATH" >> "$GITHUB_ENV"
              ;;
            aider)
              curl -LsSf https://aider.chat/install.sh | sh
              echo "PATH=$HOME/.local/bin:$PATH" >> "$GITHUB_ENV"
              ;;
            gemini)
              npm install -g @google/gemini-cli
              ;;
            qwen)
              npm install -g @qwen-code/qwen-code
              ;;
            opencode)
              npm install -g opencode-ai
              ;;
            cursor)
              curl -fsS https://cursor.com/install | bash
              echo "PATH=$HOME/.cursor/bin:$PATH" >> "$GITHUB_ENV"
              ;;
            copilot)
              npm install -g @github/copilot
              ;;
            *)
              echo "No installer defined for agent: $agent" >&2
              ;;
          esac
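      # Note: the HOME/PATH lines above are appended to "$GITHUB_ENV" so the
      # freshly installed CLIs remain on PATH for the later steps of this job.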

      - name: Setup Codex config.toml
        if: ${{ github.event.inputs.agent == 'codex' }}
        run: |
          set -euo pipefail
          mkdir -p "$HOME/.codex"
          CONFIG_PATH="$HOME/.codex/config.toml"
          echo "Writing Codex config to $CONFIG_PATH"
          cat > "$CONFIG_PATH" << 'EOF'
          # Auto-generated by CI (CLI provides model/provider)
          [model_providers.openai]
          name = "OpenAI"
          base_url = "https://api.openai.com/v1"
          env_key = "OPENAI_API_KEY"

          [model_providers.anthropic]
          name = "Anthropic"
          base_url = "https://api.anthropic.com/v1"
          env_key = "ANTHROPIC_API_KEY"

          [model_providers.cerebras]
          name = "Cerebras"
          base_url = "https://api.cerebras.ai/v1"
          env_key = "CEREBRAS_API_KEY"

          [model_providers.groq]
          name = "Groq"
          base_url = "https://api.groq.com/openai/v1"
          env_key = "GROQ_API_KEY"

          [model_providers.openrouter]
          name = "OpenRouter"
          base_url = "https://openrouter.ai/api/v1"
          env_key = "OPENROUTER_API_KEY"
          EOF

      - name: Check local agent availability
        if: ${{ env.USE_DOCKER == 'false' }}
        run: |
          set -euo pipefail
          agent="${{ github.event.inputs.agent }}"
          ok=0
          case "$agent" in
            claude) cmd=claude ;;
            goose) cmd=goose ;;
            aider) cmd=aider ;;
            codex) cmd="codex --version" ;;
            gemini) cmd=gemini ;;
            opencode) cmd=opencode ;;
            qwen) cmd=qwen ;;
            cursor) cmd=cursor-agent ;;
            copilot) cmd="copilot --version" ;;
            *) cmd="" ;;
          esac
          if [ -z "$cmd" ]; then
            echo "Unknown agent: $agent" >&2
            exit 1
          fi
          # Multi-word checks (e.g. "codex --version") are executed directly;
          # bare command names are resolved with command -v.
          case "$cmd" in
            *' '*) if $cmd >/dev/null 2>&1; then ok=1; fi ;;
            *) if command -v "$cmd" >/dev/null 2>&1; then ok=1; fi ;;
          esac
          if [ "$ok" -ne 1 ]; then
            echo "::warning::Agent CLI '$agent' not found for local mode. Install it in this workflow or set USE_DOCKER to true." >&2
          else
            echo "Agent CLI '$agent' found."
          fi

      - name: Run benchmark
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          GROQ_API_KEY: ${{ secrets.GROQ_API_KEY }}
          CEREBRAS_API_KEY: ${{ secrets.CEREBRAS_API_KEY }}
          GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
          GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          DASHSCOPE_API_KEY: ${{ secrets.DASHSCOPE_API_KEY }}
          XAI_API_KEY: ${{ secrets.XAI_API_KEY }}
          DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }}
          MOONSHOT_API_KEY: ${{ secrets.MOONSHOT_API_KEY }}
          ZAI_API_KEY: ${{ secrets.ZAI_API_KEY }}
          CURSOR_API_KEY: ${{ secrets.CURSOR_API_KEY }}
          ANTHROPIC_BASE_URL: ${{ secrets.ANTHROPIC_BASE_URL }}
          CUSTOM_INSTRUCTION: ${{ github.event.inputs.custom_instruction }}
        run: |
          set -euo pipefail
          mkdir -p "${{ env.RESULT_DIR }}"
          mkdir -p "${{ env.OUTPUT_DIR }}"
          CMD=(bun src/index.ts \
            --agent "${{ github.event.inputs.agent }}" \
            --model "${{ github.event.inputs.model }}" \
            --provider "${{ github.event.inputs.provider }}" \
            --output-dir "${{ env.OUTPUT_DIR }}")
          # Toggle Docker execution when USE_DOCKER is true
          if [ "${{ env.USE_DOCKER }}" = "true" ]; then
            CMD+=(--docker)
          fi
          # Persist results for GitHub Actions runs to power summaries
          CMD+=(--save-result --result-dir "${{ env.RESULT_DIR }}")
          # Exercise selection (number, comma list, or name)
          if [ -n "${{ github.event.inputs.exercise }}" ]; then
            CMD+=(--exercise "${{ github.event.inputs.exercise }}")
          fi
          # Verbose output toggle
          if [ "${{ github.event.inputs.verbose }}" = "true" ]; then
            CMD+=(--verbose)
          fi
          # Custom instruction (append to prompt)
          if [ -n "${CUSTOM_INSTRUCTION:-}" ]; then
            CMD+=(--custom-instruction "${CUSTOM_INSTRUCTION}")
          fi
          echo "Running: ${CMD[*]}"
          "${CMD[@]}" | tee benchmark-summary.txt
          # Fixed artifact name; safe-name step not needed
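      # For the run in the title (agent=codex, model=gpt-5.1-codex) the assembled
      # command looks roughly like the following (provider assumed to be openai):
      #   bun src/index.ts --agent codex --model gpt-5.1-codex --provider openai \
      #     --output-dir ./results --save-result --result-dir ./data/results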

      - name: Write Job Summary
        if: always()
        run: |
          {
            echo "## Benchmark Results"
            echo
            echo "- Agent: ${{ github.event.inputs.agent }}"
            echo "- Model: ${{ github.event.inputs.model }}"
            echo "- Provider: ${{ github.event.inputs.provider }}"
            if [ -n "${{ github.event.inputs.custom_instruction }}" ]; then
              PREVIEW=$(printf %s "${{ github.event.inputs.custom_instruction }}" | tr '\n' ' ' | sed -E 's/\s+/ /g' | cut -c1-140)
              echo "- Custom Instruction: ${PREVIEW}$( [ ${#PREVIEW} -eq 140 ] && echo '…' )"
            else
              echo "- Custom Instruction: (none)"
            fi
            # If latest.json exists, print a compact summary
            if [ -f "${{ env.RESULT_DIR }}/latest.json" ]; then
              if command -v jq >/dev/null 2>&1; then
                TOTAL=$(jq -r '.summary.totalCount' "${{ env.RESULT_DIR }}/latest.json" 2>/dev/null || echo "")
                SUCCESS=$(jq -r '.summary.successCount' "${{ env.RESULT_DIR }}/latest.json" 2>/dev/null || echo "")
                RATE=$(jq -r '.summary.successRate' "${{ env.RESULT_DIR }}/latest.json" 2>/dev/null || echo "")
                TDUR_MS=$(jq -r '.summary.totalDuration' "${{ env.RESULT_DIR }}/latest.json" 2>/dev/null || echo "")
                ADUR_MS=$(jq -r '.summary.avgDuration' "${{ env.RESULT_DIR }}/latest.json" 2>/dev/null || echo "")
                # Pretty seconds (1 decimal if needed)
                if [ -n "$TDUR_MS" ] && [ "$TDUR_MS" != "null" ]; then
                  TDUR_S=$(jq -nr --argjson ms "$TDUR_MS" '((($ms/1000)*10)|round)/10')
                fi
                if [ -n "$ADUR_MS" ] && [ "$ADUR_MS" != "null" ]; then
                  ADUR_S=$(jq -nr --argjson ms "$ADUR_MS" '((($ms/1000)*10)|round)/10')
                fi
                echo
                echo "### Summary"
                echo "- Total: ${TOTAL:-n/a}"
                echo "- Success: ${SUCCESS:-n/a}"
                echo "- Success Rate: ${RATE:-n/a}%"
                if [ -n "${TDUR_S:-}" ]; then
                  echo "- Duration: ${TDUR_S}s"
                fi
                if [ -n "${ADUR_S:-}" ]; then
                  echo "- Avg: ${ADUR_S}s"
                fi
                echo
                echo "### Problem Results"
                # List each problem with link + mark + time
                jq -r '
                  .results[] |
                  "- [" + .exercise + "](https://exercism.org/tracks/typescript/exercises/" + .exercise + ") "
                  + (if .overallSuccess then "✅" else "❌" end)
                  + " "
                  + (((.totalDuration/1000*10)|round)/10|tostring) + "s"
                ' "${{ env.RESULT_DIR }}/latest.json" || true
              fi
            fi
            echo
            echo "### Docs"
            echo "- Methodology: [docs/METHODOLOGY.md](https://github.com/laiso/ts-bench/blob/main/docs/METHODOLOGY.md)"
            echo "- README: [README.md](https://github.com/laiso/ts-bench/)"
          } >> "$GITHUB_STEP_SUMMARY"
          if [ -f benchmark-summary.txt ]; then
            node scripts/append-gh-summary.mjs
          fi

      - name: Upload artifacts (results + console)
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: |
            ${{ env.RESULT_DIR }}/
            ${{ env.OUTPUT_DIR }}/${{ github.event.inputs.agent }}/logs/
            benchmark-summary.txt
          if-no-files-found: warn
          retention-days: 14
```
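
For reference, a run like the one in the title can be dispatched from the GitHub CLI. This is a minimal sketch: the workflow file name `benchmark-manual.yml` and `provider=openai` are assumptions, since neither appears above.

```bash
# Dispatch the manual benchmark workflow with explicit inputs.
# The file name below is hypothetical; use the actual file under .github/workflows/.
gh workflow run benchmark-manual.yml \
  -f agent=codex \
  -f provider=openai \
  -f model=gpt-5.1-codex \
  -f verbose=false
```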