Merged
Changes from all commits
42 commits
7bd0aa9
feat: Add structured data evaluators proposal and invoice extraction …
christso Jan 1, 2026
4ba1e07
feat: Add file references and mock extractor to invoice example
christso Jan 1, 2026
3c1c897
fix: Make mock extractor produce realistic variations for testing
christso Jan 1, 2026
a2605ad
refactor: Remove redundant product_code field from invoice data
christso Jan 1, 2026
43a81a5
fix: Repair corrupted code and add jsdom dependencies
christso Jan 1, 2026
ec39c17
refactor: Restructure examples to match showcase pattern with local d…
christso Jan 1, 2026
05f7e47
refactor: Simplify fixtures from HTML to JSON, remove jsdom dependency
christso Jan 2, 2026
4cdfc77
Rename invoice-extraction to document-extraction and add targets.yaml…
christso Jan 2, 2026
ff89b2c
refactor: Simplify structured data evaluators proposal
christso Jan 2, 2026
89de1a1
feat: Add field_accuracy evaluator for structured data comparison
christso Jan 2, 2026
e2ce716
fix: Correct invoice-004 test case to include expected invoice_number
christso Jan 2, 2026
501f650
refactor: Remove fuzzy matching from core, provide as code_judge example
christso Jan 2, 2026
e1ac6e5
fix: Update dataset to remove fuzzy match type references
christso Jan 2, 2026
0bb68d8
feat: Add supplier_name_fuzzy.ts example for field-level fuzzy matching
christso Jan 2, 2026
97a6d10
feat: Add config pass-through for code_judge evaluators
christso Jan 2, 2026
a8dc46d
docs: Update references from invoice-extraction to document-extraction
christso Jan 2, 2026
f1616dd
feat: Add latency and cost evaluators for execution metrics
christso Jan 2, 2026
256c990
docs: Update multi-objective scoring status to implemented
christso Jan 2, 2026
55af03e
chore: Remove unnecessary $schema from eval file
christso Jan 2, 2026
eb4712c
chore: Remove $schema from eval-schema.json and examples
christso Jan 2, 2026
b4bd016
Fix composite evaluators trace context
christso Jan 2, 2026
1fb6f8c
Trim document-extraction README and update eval-builder skill
christso Jan 2, 2026
bd4d075
Move structured evaluator guidance into skill reference
christso Jan 2, 2026
e7f896f
Remove redundant supplier_name_fuzzy example
christso Jan 2, 2026
5c53fb2
Trim verbose dataset header comment
christso Jan 2, 2026
9b25eb6
Fix invoice-003 outcome text
christso Jan 2, 2026
3e6e142
Clarify invoice-003 outcome wording
christso Jan 2, 2026
9a34028
Clarify mock output wording in invoice-003 outcome
christso Jan 2, 2026
4c8820c
Avoid embedding file contents for cli providers
christso Jan 2, 2026
d1e7f33
Refactor CLI prompt formatting check
christso Jan 2, 2026
8fda553
Fix invoice-002 outcome wording
christso Jan 2, 2026
122803a
Add token_usage evaluator
christso Jan 2, 2026
543e26b
Update code_judge scripts to argv with legacy fallback
christso Jan 3, 2026
1e38371
Fix CLI healthcheck cwd fallback for local-cli
christso Jan 3, 2026
2313998
Fix feature example targets and trace handling
christso Jan 3, 2026
4808d79
Normalize example evaluators to snake_case
christso Jan 3, 2026
b757cff
Remove obsolete batch-cli eval targets
christso Jan 3, 2026
470f070
Rename SnakeTraceSummary to TraceSummary
christso Jan 3, 2026
4080135
Add example eval baselines and baseline checks
christso Jan 3, 2026
909cfbe
Require eval_id in compare results
christso Jan 3, 2026
986591d
Skip content-filtered eval in CI baselines
christso Jan 4, 2026
d1d9442
Run all eval baselines in CI
christso Jan 4, 2026
12 changes: 12 additions & 0 deletions .changeset/add-field-accuracy-evaluator.md
@@ -0,0 +1,12 @@
---
"@agentv/core": minor
"agentv": minor
---

Add `field_accuracy`, `latency`, and `cost` evaluators

- `field_accuracy`: Compare structured data fields with exact, numeric_tolerance, or date matching
- `latency`: Check execution duration against a threshold (uses `traceSummary.durationMs`)
- `cost`: Check execution cost against a budget (uses `traceSummary.costUsd`)

See `examples/features/document-extraction/README.md` for usage examples.
6 changes: 6 additions & 0 deletions .changeset/add-token-usage-evaluator.md
@@ -0,0 +1,6 @@
---
"@agentv/core": minor
"agentv": minor
---

Add `token_usage` evaluator to gate on provider-reported token budgets.
5 changes: 5 additions & 0 deletions .changeset/fix-composite-trace-context.md
@@ -0,0 +1,5 @@
---
"@agentv/core": patch
---

Fix composite evaluators to pass through trace and output message context so trace-dependent evaluators (e.g. latency/cost/tool_trajectory) work when nested.
1 change: 1 addition & 0 deletions .claude/skills/agentv-eval-builder/SKILL.md
@@ -14,6 +14,7 @@ description: Create and maintain AgentV YAML evaluation files for testing AI agents
- Rubrics: `references/rubric-evaluator.md` - Structured criteria-based evaluation
- Composite Evaluators: `references/composite-evaluator.md` - Combine multiple evaluators
- Tool Trajectory: `references/tool-trajectory-evaluator.md` - Validate agent tool usage
- Structured Data + Metrics: `references/structured-data-evaluators.md` - `field_accuracy`, `latency`, `cost`
- Custom Evaluators: `references/custom-evaluators.md` - Code and LLM judge templates
- Batch CLI: `references/batch-cli-evaluator.md` - Evaluate batch runner output (JSONL)
- Compare: `references/compare-command.md` - Compare evaluation results between runs
16 changes: 10 additions & 6 deletions .claude/skills/agentv-eval-builder/references/eval-schema.json
@@ -4,11 +4,6 @@
"description": "Schema for YAML evaluation files with conversation flows, multiple evaluators, and execution configuration",
"type": "object",
"properties": {
"$schema": {
"type": "string",
"description": "Schema identifier",
"enum": ["agentv-eval-v2"]
},
"description": {
"type": "string",
"description": "Description of what this eval suite covers"
@@ -37,7 +32,16 @@
},
"type": {
"type": "string",
"enum": ["code", "llm_judge"],
"enum": [
"code",
"llm_judge",
"composite",
"tool_trajectory",
"field_accuracy",
"latency",
"cost",
"token_usage"
],
"description": "Evaluator type: 'code' for scripts/regex/keywords, 'llm_judge' for LLM-based evaluation"
},
"script": {
121 changes: 121 additions & 0 deletions .claude/skills/agentv-eval-builder/references/structured-data-evaluators.md
@@ -0,0 +1,121 @@
# Structured Data + Metrics Evaluators

This reference covers the built-in evaluators used for grading structured outputs and gating on execution metrics:

- `field_accuracy`
- `latency`
- `cost`
- `token_usage`

## Ground Truth (`expected_messages`)

Put the expected structured output in the evalcase `expected_messages` (typically as the last `assistant` message with `content` as an object). Evaluators read expected values from there.

```yaml
evalcases:
- id: invoice-001
expected_messages:
- role: assistant
content:
invoice_number: "INV-2025-001234"
net_total: 1889
```

## `field_accuracy`

Use `field_accuracy` to compare fields in the candidate JSON against the ground-truth object in `expected_messages`.

```yaml
execution:
evaluators:
- name: invoice_fields
type: field_accuracy
aggregation: weighted_average
fields:
- path: invoice_number
match: exact
required: true
weight: 2.0
- path: invoice_date
match: date
formats: ["DD-MMM-YYYY", "YYYY-MM-DD"]
- path: net_total
match: numeric_tolerance
tolerance: 1.0
```

### Match types

- `exact`: strict equality
- `date`: compares dates after parsing; optionally provide `formats`
- `numeric_tolerance`: numeric comparison within `tolerance`; set `relative: true` to treat the tolerance as relative (see the sketch below)
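
As a concrete illustration of the presumed `numeric_tolerance` semantics (a sketch, not taken from the implementation):

```ts
// Sketch: absolute tolerance by default, relative when `relative: true` (assumed behaviour).
function numericMatches(
  actual: number,
  expected: number,
  tolerance: number,
  relative = false,
): boolean {
  const allowed = relative ? tolerance * Math.abs(expected) : tolerance;
  return Math.abs(actual - expected) <= allowed;
}

numericMatches(1888.5, 1889, 1.0);        // true  - within 1.0 absolute
numericMatches(1900, 1889, 0.005, true);  // false - outside 0.5% relative
```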

For fuzzy string matching, use a `code_judge` evaluator (e.g. Levenshtein) instead of adding a fuzzy mode to `field_accuracy`.
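
A minimal sketch of such a `code_judge` script, assuming the candidate and expected objects arrive as JSON arguments on `argv` (per the argv-based contract mentioned in the commit history) and the result is printed as JSON with a `score`; the exact I/O contract, and the `supplier_name` field used here, are illustrative assumptions to check against `references/custom-evaluators.md`:

```ts
// Hypothetical fuzzy-match code_judge: scores supplier_name by normalized Levenshtein distance.
// Assumption: argv[2] = candidate JSON, argv[3] = expected JSON; result is printed as JSON on stdout.
function levenshtein(a: string, b: string): number {
  const dp = Array.from({ length: a.length + 1 }, (_, i) => [i, ...Array<number>(b.length).fill(0)]);
  for (let j = 0; j <= b.length; j++) dp[0][j] = j;
  for (let i = 1; i <= a.length; i++) {
    for (let j = 1; j <= b.length; j++) {
      dp[i][j] = Math.min(
        dp[i - 1][j] + 1, // deletion
        dp[i][j - 1] + 1, // insertion
        dp[i - 1][j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1), // substitution
      );
    }
  }
  return dp[a.length][b.length];
}

const candidate = JSON.parse(process.argv[2] ?? "{}");
const expected = JSON.parse(process.argv[3] ?? "{}");
const actual = String(candidate.supplier_name ?? "").toLowerCase();
const target = String(expected.supplier_name ?? "").toLowerCase();
const score = target.length === 0 ? 0 : Math.max(0, 1 - levenshtein(actual, target) / target.length);
console.log(JSON.stringify({ score }));
```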

### Aggregation

- `weighted_average` (default): weighted mean of field scores
- `all_or_nothing`: score 1.0 only if all graded fields pass
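
For illustration, the `weighted_average` arithmetic presumably reduces to a weighted mean like the following sketch (treating a missing `weight` as 1.0 is an assumption):

```ts
// Sketch: weighted mean of per-field scores.
type FieldScore = { score: number; weight?: number };

function weightedAverage(fields: FieldScore[]): number {
  const totalWeight = fields.reduce((sum, f) => sum + (f.weight ?? 1), 0);
  if (totalWeight === 0) return 0;
  return fields.reduce((sum, f) => sum + f.score * (f.weight ?? 1), 0) / totalWeight;
}

// With the config above: invoice_number (weight 2.0) correct, invoice_date correct, net_total wrong:
weightedAverage([{ score: 1, weight: 2.0 }, { score: 1 }, { score: 0 }]); // (2 + 1 + 0) / 4 = 0.75
```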

## `latency` and `cost`

These evaluators gate on execution metrics reported by the provider (via `traceSummary`).

```yaml
execution:
evaluators:
- name: performance
type: latency
threshold: 2000
- name: budget
type: cost
budget: 0.10
```
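
The changeset notes that `latency` reads `traceSummary.durationMs` and `cost` reads `traceSummary.costUsd`. One plausible reading of the gating behaviour, sketched under the assumption that scoring is binary and a missing metric fails the gate:

```ts
// Sketch: metric gates against provider-reported trace data (binary scoring is an assumption).
interface TraceSummary {
  durationMs?: number;
  costUsd?: number;
}

function latencyScore(trace: TraceSummary, thresholdMs: number): number {
  return trace.durationMs !== undefined && trace.durationMs <= thresholdMs ? 1 : 0;
}

function costScore(trace: TraceSummary, budgetUsd: number): number {
  return trace.costUsd !== undefined && trace.costUsd <= budgetUsd ? 1 : 0;
}
```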

## `token_usage`

Gate on provider-reported token usage (useful when cost is unavailable or model pricing differs).

```yaml
execution:
evaluators:
- name: token-budget
type: token_usage
max_total: 10000
# or:
# max_input: 8000
# max_output: 2000
```

## Common pattern: combine correctness + gates

Use a `composite` evaluator if you want a single “release gate” score/verdict from multiple checks:

```yaml
execution:
evaluators:
- name: release_gate
type: composite
evaluators:
- name: correctness
type: field_accuracy
fields:
- path: invoice_number
match: exact
- name: latency
type: latency
threshold: 2000
- name: cost
type: cost
budget: 0.10
- name: tokens
type: token_usage
max_total: 10000
aggregator:
type: weighted_average
weights:
correctness: 0.8
latency: 0.1
cost: 0.05
tokens: 0.05
```
5 changes: 1 addition & 4 deletions apps/cli/src/commands/compare/index.ts
@@ -52,10 +52,7 @@ export function loadJsonlResults(filePath: string): EvalResult[] {
.filter((line) => line.trim());

return lines.map((line) => {
const record = JSON.parse(line) as {
eval_id?: string;
score?: number;
};
const record = JSON.parse(line) as { eval_id?: string; score?: number };
if (typeof record.eval_id !== 'string') {
throw new Error(`Missing eval_id in result: ${line}`);
}
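For reference, a `results.jsonl` line accepted by this parser needs a string `eval_id`; `score` is optional. The values below are illustrative:

```ts
// Building one accepted results.jsonl record.
const line = JSON.stringify({ eval_id: "invoice-001", score: 0.92 });
// -> {"eval_id":"invoice-001","score":0.92}
```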
2 changes: 1 addition & 1 deletion apps/cli/test/commands/compare/compare.test.ts
@@ -22,7 +22,7 @@ describe('compare command', () => {
});

describe('loadJsonlResults', () => {
it('should load valid JSONL file with snake_case eval results', () => {
it('should load valid JSONL file with eval_id results', () => {
const filePath = path.join(tempDir, 'results.jsonl');
writeFileSync(
filePath,
4 changes: 2 additions & 2 deletions bun.lock
@@ -21,7 +21,7 @@
},
"apps/cli": {
"name": "agentv",
"version": "1.2.0",
"version": "1.6.0",
"bin": {
"agentv": "./dist/cli.js",
},
@@ -39,7 +39,7 @@
},
"packages/core": {
"name": "@agentv/core",
"version": "1.2.0",
"version": "1.5.0",
"dependencies": {
"@ai-sdk/anthropic": "^2.0.53",
"@ai-sdk/azure": "^2.0.78",