diff --git a/docs/otel-integration.html b/docs/otel-integration.html
new file mode 100644
index 0000000..761621e
--- /dev/null
+++ b/docs/otel-integration.html
@@ -0,0 +1,1124 @@
+<!doctype html>
+<html lang="zh-Hant">
+<head>
+<meta charset="utf-8" />
+<title>OTel 整合探索 — ccxray</title>
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<style>
+  :root {
+    --bg: #0f1419;
+    --panel: #1a1f26;
+    --panel-2: #232932;
+    --border: #2a313c;
+    --text: #d8dee9;
+    --muted: #8b95a5;
+    --accent: #88c0d0;
+    --accent-2: #a3be8c;
+    --warn: #ebcb8b;
+    --danger: #bf616a;
+    --code-bg: #11161c;
+  }
+  * { box-sizing: border-box; }
+  html, body { margin: 0; padding: 0; background: var(--bg); color: var(--text); }
+  body {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "PingFang TC", "Microsoft JhengHei", sans-serif;
+    line-height: 1.7;
+    font-size: 15px;
+  }
+  .wrap { max-width: 920px; margin: 0 auto; padding: 40px 28px 80px; }
+  h1 {
+    font-size: 28px;
+    border-bottom: 1px solid var(--border);
+    padding-bottom: 12px;
+    margin-bottom: 8px;
+  }
+  h2 {
+    font-size: 20px;
+    margin-top: 48px;
+    color: var(--accent);
+    border-left: 3px solid var(--accent);
+    padding-left: 12px;
+  }
+  h3 {
+    font-size: 16px;
+    color: var(--accent-2);
+    margin-top: 28px;
+  }
+  p, li { color: var(--text); }
+  .muted { color: var(--muted); font-size: 13px; }
+  .lede {
+    color: var(--muted);
+    font-size: 14px;
+    margin-bottom: 32px;
+  }
+  code {
+    background: var(--code-bg);
+    border: 1px solid var(--border);
+    padding: 1px 6px;
+    border-radius: 3px;
+    font-family: "SF Mono", Menlo, Consolas, monospace;
+    font-size: 12.5px;
+    color: var(--warn);
+  }
+  pre {
+    background: var(--code-bg);
+    border: 1px solid var(--border);
+    padding: 14px 16px;
+    border-radius: 6px;
+    overflow-x: auto;
+    font-family: "SF Mono", Menlo, Consolas, monospace;
+    font-size: 12.5px;
+  }
+  pre code { background: none; border: none; padding: 0; color: var(--text); }
+  .diagram {
+    background: var(--panel);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 20px;
+    margin: 18px 0;
+    display: flex;
+    justify-content: center;
+  }
+  .diagram .mermaid { background: transparent; }
+  .card {
+    background: var(--panel);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 18px 22px;
+    margin: 14px 0;
+  }
+  .grid-3 {
+    display: grid;
+    grid-template-columns: 1fr 1fr 1fr;
+    gap: 14px;
+    margin: 16px 0;
+  }
+  .grid-3 .card { margin: 0; }
+  .grid-3 h4 { margin: 0 0 6px; color: var(--accent); font-size: 14px; }
+  table {
+    width: 100%;
+    border-collapse: collapse;
+    margin: 16px 0;
+    font-size: 13px;
+  }
+  th, td {
+    border: 1px solid var(--border);
+    padding: 8px 12px;
+    text-align: left;
+    vertical-align: top;
+  }
+  th {
+    background: var(--panel-2);
+    color: var(--accent);
+    font-weight: 600;
+  }
+  td.good { color: var(--accent-2); }
+  td.warn { color: var(--warn); }
+  td.bad  { color: var(--danger); }
+  .pill {
+    display: inline-block;
+    padding: 1px 8px;
+    border-radius: 10px;
+    font-size: 11px;
+    background: var(--panel-2);
+    border: 1px solid var(--border);
+    color: var(--muted);
+    margin-right: 6px;
+  }
+  .pill.A { color: var(--accent-2); border-color: var(--accent-2); }
+  .pill.B { color: var(--warn); border-color: var(--warn); }
+  .pill.C { color: var(--danger); border-color: var(--danger); }
+  .pill.D { color: #b48ead; border-color: #b48ead; }
+  .pill.recommended {
+    background: rgba(163, 190, 140, 0.15);
+    color: var(--accent-2);
+    border-color: var(--accent-2);
+    font-weight: 700;
+    padding: 2px 10px;
+  }
+  ul { padding-left: 22px; }
+  li { margin: 4px 0; }
+  .toc {
+    background: var(--panel);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 14px 22px;
+    margin: 18px 0 32px;
+    font-size: 14px;
+  }
+  .toc a { color: var(--accent); text-decoration: none; }
+  .toc a:hover { text-decoration: underline; }
+  .key {
+    border-left: 3px solid var(--accent-2);
+    background: rgba(163, 190, 140, 0.08);
+    padding: 10px 16px;
+    margin: 14px 0;
+    border-radius: 0 4px 4px 0;
+  }
+  .warn-box {
+    border-left: 3px solid var(--warn);
+    background: rgba(235, 203, 139, 0.08);
+    padding: 10px 16px;
+    margin: 14px 0;
+    border-radius: 0 4px 4px 0;
+  }
+</style>
+</head>
+<body>
+<div class="wrap">
+
+<h1>OTel 整合探索</h1>
+<div class="lede">理解 OpenTelemetry 是什麼,以及 ccxray 要如何接上 OTel 生態的四種方案比較</div>
+
+<div class="toc">
+  <strong>目錄</strong><br>
+  <a href="#what">1. OTel 是什麼?</a><br>
+  <a href="#signals">2. 三種訊號:Traces / Metrics / Logs</a><br>
+  <a href="#claude-now">3. Claude Code 內建的 OTel 已經做了什麼?</a><br>
+  <a href="#ccxray-pos">4. ccxray 站在 HTTP 層,看得到/看不到什麼?</a><br>
+  <a href="#plans">5. 四個整合方案(A / B / C / <span style="color: var(--accent-2);">D ★</span>)</a><br>
+  <a href="#compare">6. 四案比較表</a><br>
+  <a href="#mgmt">7. 管理者視角:MCP / tool / skill 使用統計</a><br>
+  <a href="#reco">8. 建議路線</a><br>
+  <a href="#premortem">9. 事前驗屍與解方(10 題,9 解,全 ≥ 9 分)</a>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="what">1. OTel 是什麼?</h2>
+
+<p>OpenTelemetry(以下簡稱 <strong>OTel</strong>)<strong>不是一個產品</strong>,而是一套<strong>觀測資料的標準</strong>。它定義了:</p>
+<ul>
+  <li>資料的<strong>結構</strong>(span 長什麼樣、metric 長什麼樣)</li>
+  <li>資料的<strong>傳輸協議</strong>(叫 OTLP,通常透過 HTTP/gRPC)</li>
+  <li>各種語言的 <strong>SDK</strong>(Node、Python、Go...)讓你產生這些資料</li>
+</ul>
+<p>它解決的問題是:<em>「以前每個觀測後端(Datadog、New Relic、Honeycomb)都有自己的 SDK,換後端就要改程式碼。現在大家都講 OTel,你只要 emit 一次,送去哪都可以。」</em></p>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart LR
+    A[你的應用程式<br/>例如 ccxray] -->|OTel SDK<br/>產生標準資料| B[OTel Collector<br/>選配,中繼站]
+    B -->|OTLP 協議| C[Honeycomb]
+    B -->|OTLP 協議| D[Datadog]
+    B -->|OTLP 協議| E[Grafana / Jaeger]
+    B -->|OTLP 協議| F[Langfuse]
+    A -.直接送.-> C
+
+    style A fill:#88c0d0,stroke:#5e81ac,color:#0f1419
+    style B fill:#a3be8c,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<div class="key">
+<strong>記住一件事:</strong>你的程式 → OTel SDK → 後端。不論後端是哪一家,中間傳遞的資料格式都一樣。
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="signals">2. 三種訊號</h2>
+
+<p>OTel 把觀測資料分成三類,各自獨立,各自可開關:</p>
+
+<div class="grid-3">
+  <div class="card">
+    <h4>Traces 追蹤</h4>
+    <p class="muted">一次「操作」的時間軸</p>
+    <p>由多個 <code>span</code> 組成樹狀結構。每個 span 有開始/結束時間、parent。</p>
+    <p><em>ccxray 例子:</em> 一次 Claude turn = 一個 trace,內含 1 個 HTTP request span + N 個 tool span</p>
+  </div>
+  <div class="card">
+    <h4>Metrics 指標</h4>
+    <p class="muted">數字、計數、分布</p>
+    <p>Counter(累加)、Gauge(瞬時值)、Histogram(分布)。便宜、聚合好。</p>
+    <p><em>ccxray 例子:</em> input_tokens 累計、cost 累計、cache hit rate</p>
+  </div>
+  <div class="card">
+    <h4>Logs 事件</h4>
+    <p class="muted">結構化 log 紀錄</p>
+    <p>類似傳統 log,但是結構化(JSON),可以關聯到 trace 和 span。</p>
+    <p><em>ccxray 例子:</em> 完整 request body、tool 執行結果</p>
+  </div>
+</div>
+
+<h3>三者怎麼串在一起?</h3>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart TB
+    subgraph Trace [Trace: 一次 Claude turn]
+        S1["Span: HTTP POST /v1/messages<br/>200ms"]
+        S2["Span: tool_use Read<br/>50ms"]
+        S3["Span: tool_use Bash<br/>1200ms"]
+        S1 --> S2
+        S1 --> S3
+    end
+
+    subgraph Metrics [Metrics 同時被記錄]
+        M1["counter tokens.input += 2500"]
+        M2["counter cost.usd += 0.0125"]
+    end
+
+    subgraph Logs [Logs 關聯到 span]
+        L1["event user_prompt<br/>linked to S1"]
+        L2["event tool_result<br/>linked to S3"]
+    end
+
+    style S1 fill:#88c0d0,stroke:#5e81ac,color:#0f1419
+    style S2 fill:#a3be8c,stroke:#5e81ac,color:#0f1419
+    style S3 fill:#a3be8c,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="claude-now">3. Claude Code 內建的 OTel 已經做了什麼?</h2>
+
+<p>當你設定 <code>CLAUDE_CODE_ENABLE_TELEMETRY=1</code>,Claude Code CLI 會<strong>自己</strong>送 OTel 出去,完全不用 ccxray:</p>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart LR
+    A[Claude Code CLI<br/>內建 OTel] -->|OTLP| B[你的 Collector]
+    B --> C[Honeycomb / Datadog]
+
+    A2[Claude Code CLI<br/>無 OTel 設定] -->|純 HTTP| X[Anthropic API]
+
+    style A fill:#a3be8c,stroke:#5e81ac,color:#0f1419
+    style A2 fill:#8b95a5,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<p>CLI 自己會 emit 的 span:</p>
+<ul>
+  <li><code>claude_code.interaction</code> — 一個 agent loop turn</li>
+  <li><code>claude_code.llm_request</code> — 每次呼叫 Anthropic API</li>
+  <li><code>claude_code.tool</code> — 每次工具呼叫(含 permission 等待和執行)</li>
+  <li><code>claude_code.hook</code> — 每次 hook 執行(beta)</li>
+</ul>
+
+<div class="warn-box">
+<strong>注意:</strong>這只在 <strong>Anthropic 官方 Claude Code</strong>。<strong>Codex、Gemini 等其他 provider 完全沒有</strong> OTel。
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="ccxray-pos">4. ccxray 站在 HTTP 層,看得到/看不到什麼?</h2>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart LR
+    CLI[Claude Code / Codex] -->|HTTP request| CCX[ccxray proxy]
+    CCX -->|forward| API[Anthropic / OpenAI API]
+    API -->|response| CCX
+    CCX -->|response| CLI
+
+    CCX -.寫入.-> LOG[(~/.ccxray/logs)]
+    CCX -.SSE.-> UI[Dashboard]
+
+    style CCX fill:#88c0d0,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<table>
+  <tr><th>ccxray 看得到 ✅</th><th>ccxray 看不到 ❌</th></tr>
+  <tr>
+    <td>
+      <ul>
+        <li>每次 HTTP request / response 的完整 payload</li>
+        <li>model、input/output/cache tokens</li>
+        <li>cost(用 LiteLLM pricing 算)</li>
+        <li>latency(從 request 進來到 response 結束)</li>
+        <li>從 response 解析 tool_use block → 知道 LLM <em>要求</em> 執行什麼工具</li>
+        <li>下一個 request 帶 tool_result 回來 → 知道工具<em>結果</em></li>
+        <li><strong>跨 provider</strong>:Codex / Gemini 也都看得到</li>
+      </ul>
+    </td>
+    <td>
+      <ul>
+        <li>工具<strong>實際執行</strong>的時間(只能推斷)</li>
+        <li>Permission prompt 等待時間</li>
+        <li>Hook 執行</li>
+        <li>本地檔案 I/O 細節</li>
+        <li>使用者的 prompt 輸入動作</li>
+      </ul>
+    </td>
+  </tr>
+</table>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="plans">5. 四個整合方案</h2>
+
+<h3><span class="pill A">方案 A</span> Metrics Only — 輕量起手式</h3>
+
+<p>只 emit 數字型指標:token、cost、request count、cache hit rate。<strong>不碰 trace</strong>。</p>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart LR
+    REQ[每次 HTTP 完成] --> M["counter tokens.input ++<br/>counter tokens.output ++<br/>counter cost.usd ++<br/>histogram latency ms"]
+    M -->|OTLP| COL[Collector]
+    COL --> GRA[Grafana / Datadog<br/>畫圖表]
+
+    style M fill:#a3be8c,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<div class="card">
+  <strong>動到哪些檔案</strong>
+  <ul>
+    <li>新增 <code>server/otel.js</code> — 只在 env var 存在時啟動 <code>@opentelemetry/sdk-node</code></li>
+    <li>修改 <code>server/forward.js</code> — request 結束時 <code>counter.add(tokens)</code></li>
+  </ul>
+  <strong>優點</strong>
+  <ul>
+    <li>實作最簡單(估 1-2 天)</li>
+    <li>跟 CLI 內建的 OTel <strong>不重複</strong>(CLI 也有 metrics,但只涵蓋 Claude Code 本身;ccxray 額外補上 Codex 等其他 provider 的數據)</li>
+    <li>對使用者最有用:可以在 Grafana 上畫每日 token / cost 趨勢</li>
+  </ul>
+  <strong>缺點</strong>
+  <ul>
+    <li>看不到「為什麼貴」(沒有 trace,不知道是哪個 turn 燒最多)</li>
+  </ul>
+</div>
+
+<!-- ─────── -->
+
+<h3><span class="pill B">方案 B</span> Metrics + Synthetic Traces — 中度整合</h3>
+
+<p>加上 trace,但 trace 是「合成」的(因為看不到真實 tool 執行時間,只能從 HTTP 推斷)。</p>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart TB
+    subgraph Trace [合成的 Trace]
+        I["claude_code.interaction<br/>由 session_id 群組"]
+        L["claude_code.llm_request<br/>真實 HTTP 時間"]
+        T1["ccxray.tool.synthetic<br/>從 tool_use 推斷"]
+        T2["ccxray.tool.synthetic<br/>從 tool_use 推斷"]
+        I --> L
+        L --> T1
+        L --> T2
+    end
+
+    style I fill:#88c0d0,stroke:#5e81ac,color:#0f1419
+    style L fill:#a3be8c,stroke:#5e81ac,color:#0f1419
+    style T1 fill:#ebcb8b,stroke:#5e81ac,color:#0f1419
+    style T2 fill:#ebcb8b,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<div class="card">
+  <strong>動到哪些檔案</strong>
+  <ul>
+    <li>方案 A 的全部</li>
+    <li>修改 <code>server/store.js</code> — session/turn 推斷時開 <code>interaction</code> span</li>
+    <li>修改 <code>server/forward.js</code> — 讀 incoming <code>traceparent</code> header 當父 context</li>
+    <li>解析 response 的 tool_use block,合成 tool span</li>
+  </ul>
+  <strong>優點</strong>
+  <ul>
+    <li>可以看到「一個 turn 內各 tool 的耗時分布」(雖然是估的)</li>
+    <li>如果使用者開了 CLI OTel,ccxray 的 span 可以自動掛在他們的 trace 底下</li>
+    <li>對 Codex 來說這是<strong>唯一</strong>的 trace 來源</li>
+  </ul>
+  <strong>缺點</strong>
+  <ul>
+    <li>同時開 CLI OTel + ccxray 會出現<strong>重複 span</strong>(同一個 llm_request 兩邊都送)</li>
+    <li>Tool 時間是「下一個 request 來的時候 - 上一個 response 結束」,不是真實執行時間</li>
+    <li>實作複雜度估 3-5 天</li>
+  </ul>
+</div>
+
+<!-- ─────── -->
+
+<h3><span class="pill C">方案 C</span> 完整(Metrics + Traces + Log Events)— 重度整合</h3>
+
+<p>把 ccxray 看到的<strong>完整 payload</strong> 也 emit 成 log event,讓使用者可以在 OTel 後端做全文搜尋。</p>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart LR
+    REQ[HTTP request] --> M[Metrics]
+    REQ --> T[Traces]
+    REQ --> L["Log Events<br/>完整 request / response JSON"]
+
+    M --> COL[Collector]
+    T --> COL
+    L --> COL
+
+    COL --> BE[Honeycomb / Langfuse<br/>可全文搜尋 payload]
+
+    style L fill:#bf616a,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<div class="card">
+  <strong>動到哪些檔案</strong>
+  <ul>
+    <li>方案 B 的全部</li>
+    <li>新增 log event emit:每次 request/response 結束,送一個完整 body 的 log event</li>
+    <li>需要處理大 payload 的 chunking、PII 遮蔽選項</li>
+  </ul>
+  <strong>優點</strong>
+  <ul>
+    <li>使用者可以在 Langfuse / Honeycomb 看完整對話歷史和工具結果</li>
+    <li>比 CLI 內建的 <code>OTEL_LOG_RAW_API_BODIES</code> 還完整(CLI 那個是 60KB truncated)</li>
+    <li>可以做進階分析:prompt 模式、tool 失敗率、超長 conversation</li>
+  </ul>
+  <strong>缺點</strong>
+  <ul>
+    <li>資料量爆增,後端費用顯著上升</li>
+    <li>隱私 / 合規問題(完整對話內容被外送)</li>
+    <li>跟 ccxray 自己的 local log 功能<strong>有點重疊</strong>(使用者已經可以在 dashboard 看)</li>
+    <li>實作複雜度估 1-2 週</li>
+  </ul>
+</div>
+
+<!-- ─────── -->
+
+<h3><span class="pill D">方案 D</span> <span class="pill recommended">★ 推薦</span> 雲端追蹤 + 本地反查(Hybrid)</h3>
+
+<p>把 ccxray 看到的 <strong>metadata</strong>(model、token、cost、tool 名稱、timing)送雲端,完整 payload <strong>留在本地</strong>。span 上掛一個 <code>ccxray.entry_id</code> attribute,在 Grafana 發現問題後可以回 ccxray dashboard 反查完整對話。</p>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart LR
+    REQ[HTTP request 進來] --> CCX[ccxray proxy]
+    CCX -->|完整 payload<br/>~50KB/turn| LOG[(~/.ccxray/logs<br/>本地)]
+    CCX -->|metadata + entry_id<br/>~1KB/turn| OTLP[OTLP Collector]
+    OTLP --> GRA[Grafana / Honeycomb<br/>聚合查詢]
+
+    GRA -.點 entry_id<br/>跳回本地.-> UI[ccxray Dashboard<br/>看完整 payload]
+    LOG --> UI
+
+    style CCX fill:#88c0d0,stroke:#5e81ac,color:#0f1419
+    style LOG fill:#a3be8c,stroke:#5e81ac,color:#0f1419
+    style UI fill:#b48ead,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<h4>反查的工作流</h4>
+
+<div class="diagram">
+<pre class="mermaid">
+sequenceDiagram
+    autonumber
+    actor U as 工程師
+    participant G as Grafana
+    participant D as ccxray Dashboard
+    participant F as 本地 log 檔
+
+    Note over G: 看到異常 spike<br/>cost 突然爆增
+    U->>G: 點開最貴的那個 span
+    G-->>U: trace 顯示 ccxray.entry_id=<br/>"2026-05-12T09-31-04-227"
+    U->>D: 開啟 http://localhost:5577/entry/2026-...
+    D->>F: 讀取本地 _req.json / _res.json
+    F-->>D: 完整 payload
+    D-->>U: 顯示完整對話、tool 呼叫、cache 結構
+    Note over U: 找到原因:<br/>某個 tool result<br/>把 200KB 文字塞進 context
+</pre>
+</div>
+
+<div class="card">
+  <strong>實際 emit 的 span 長這樣(metadata-only)</strong>
+<pre><code>{
+  "name": "ccxray.llm_request",
+  "attributes": {
+    "ccxray.entry_id":        "2026-05-12T09-31-04-227",
+    "ccxray.dashboard_url":   "http://localhost:5577/entry/2026-05-12T09-31-04-227",
+    "ccxray.provider":        "anthropic",
+    "model":                  "claude-opus-4-7",
+    "tokens.input":            45230,
+    "tokens.output":            1820,
+    "tokens.cache_read":       38500,
+    "tokens.cache_creation":    6730,
+    "cost.usd":              0.0825,
+    "latency_ms":              4210,
+    "tools.count":                 3,
+    "tools.names":  ["Read","Bash","Edit"]
+  }
+}</code></pre>
+  <strong>注意:沒有任何 prompt 文字、tool input、tool output。</strong>
+</div>
+
+<div class="card">
+  <strong>動到哪些檔案</strong>
+  <ul>
+    <li>方案 A 的全部(metrics)</li>
+    <li>新增 <code>server/otel.js</code> 多一段「開 span 並掛 entry_id」</li>
+    <li>修改 <code>server/forward.js</code> — 在 entry 寫入後 emit span,attribute 直接從現有 store 的 metadata 取</li>
+    <li>修改 <code>server/routes/api.js</code> — 加一個 <code>/entry/:id</code> 路由,直接 deep-link 到該筆</li>
+    <li><strong>不需要</strong>處理大 payload chunking、PII 遮蔽、log event(因為沒送)</li>
+  </ul>
+
+  <strong>優點</strong>
+  <ul>
+    <li class="good">✅ <strong>隱私零風險</strong>:沒有任何對話內容外送</li>
+    <li class="good">✅ <strong>資料量極小</strong>:每個 turn ~1KB,Grafana free tier 都吃得下</li>
+    <li class="good">✅ <strong>反查路徑清楚</strong>:Grafana 看到怪 → 點連結 → ccxray dashboard 看細節</li>
+    <li class="good">✅ <strong>跟 dashboard 不重疊,而是強化</strong>:Grafana 負責「橫向聚合」,dashboard 負責「縱向細節」</li>
+    <li class="good">✅ <strong>實作比 B 還簡單</strong>:不用合成 tool span 的時間(那個本來就不準),只送一個 llm_request span 加 entry_id 即可。估 2–3 天</li>
+  </ul>
+
+  <strong>缺點 / 限制</strong>
+  <ul>
+    <li class="warn">⚠️ 反查只能在「同一台機器」上做(本地 log 的本質)。如果使用者是遠端 / 多人共用,需要先決定 log 放哪</li>
+    <li class="warn">⚠️ 本地 log 被 rotate / 清掉後,trace 上的 entry_id 會變死連結。可在 dashboard 顯示「此 entry 已 expire」提示</li>
+    <li class="warn">⚠️ 如果使用者只開 Grafana 不開 ccxray dashboard,點連結會打不開(可考慮 fallback:span 上加一個 <code>ccxray.summary</code> attribute 提供 50 字摘要。注意:摘要只能由 metadata 組成,例如 model、tool 名稱、token 數,不可截取對話原文,否則會破壞「沒有任何對話內容外送」的保證)</li>
+  </ul>
+</div>
+
+<div class="key">
+<strong>為什麼這個比 B 和 C 都好?</strong>
+<ul>
+  <li>比 <span class="pill B">B</span> 好:不用煩惱 synthetic tool span 的時間不準、不用煩惱跟 CLI 重複(metadata-level 重複沒關係),又多了反查能力</li>
+  <li>比 <span class="pill C">C</span> 好:價值幾乎一樣(都能看完整 payload),但隱私 / 成本 / 實作複雜度全面贏</li>
+  <li>本質上是「用 trace_id / entry_id 當索引,把儲存外包給本地」的設計</li>
+</ul>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="compare">6. 四案比較表</h2>
+
+<table>
+  <tr>
+    <th>面向</th>
+    <th><span class="pill A">方案 A</span><br>Metrics Only</th>
+    <th><span class="pill B">方案 B</span><br>+ Synthetic Traces</th>
+    <th><span class="pill C">方案 C</span><br>完整 payload</th>
+    <th><span class="pill D">方案 D</span> <span class="pill recommended">★</span><br>Hybrid 反查</th>
+  </tr>
+  <tr>
+    <td>實作工時</td>
+    <td class="good">1–2 天</td>
+    <td class="warn">3–5 天</td>
+    <td class="bad">1–2 週</td>
+    <td class="good">2–3 天</td>
+  </tr>
+  <tr>
+    <td>使用者價值</td>
+    <td class="good">中:cost / token 趨勢</td>
+    <td class="good">高:turn timing 分析</td>
+    <td class="warn">看情境:power user</td>
+    <td class="good">最高:聚合 + 細節都有</td>
+  </tr>
+  <tr>
+    <td>跟 CLI 內建 OTel 衝突</td>
+    <td class="good">不衝突</td>
+    <td class="warn">span 重複</td>
+    <td class="bad">重複更嚴重</td>
+    <td class="good">不衝突<br>(用 ccxray.* namespace)</td>
+  </tr>
+  <tr>
+    <td>Codex / Gemini 支援</td>
+    <td class="good">是</td>
+    <td class="good">是(唯一)</td>
+    <td class="good">是(唯一)</td>
+    <td class="good">是(唯一)</td>
+  </tr>
+  <tr>
+    <td>資料量 / 後端費用</td>
+    <td class="good">很低</td>
+    <td class="warn">中</td>
+    <td class="bad">高,需取樣</td>
+    <td class="good">低(~1KB/turn)</td>
+  </tr>
+  <tr>
+    <td>隱私風險</td>
+    <td class="good">無</td>
+    <td class="good">低</td>
+    <td class="bad">高</td>
+    <td class="good">無(payload 不出機器)</td>
+  </tr>
+  <tr>
+    <td>取代 dashboard 的程度</td>
+    <td class="good">完全不衝突</td>
+    <td class="warn">部分重疊</td>
+    <td class="bad">高度重疊</td>
+    <td class="good">互補強化</td>
+  </tr>
+  <tr>
+    <td>需要使用者持續開 ccxray dashboard</td>
+    <td class="good">不需要</td>
+    <td class="good">不需要</td>
+    <td class="good">不需要</td>
+    <td class="warn">反查時需要本地 log 還在</td>
+  </tr>
+</table>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="mgmt">7. 管理者視角:還能看到什麼?</h2>
+
+<p>除了 cost / token,管理者(team lead、平台 owner)通常也想知道:</p>
+<ul>
+  <li><strong>誰在用什麼 MCP server?</strong>各用幾次?哪個 MCP 失敗率最高?</li>
+  <li><strong>哪些 tool 被叫最多?</strong>Bash、Read、Edit、WebSearch... 各佔多少?</li>
+  <li><strong>Skill 採用率?</strong>哪些 skill 被觸發?哪些建了沒人用?</li>
+  <li><strong>每個團隊 / 專案的差異?</strong>同樣是用 Claude Code,A 團隊跟 B 團隊行為差在哪?</li>
+</ul>
+
+<p>這些<strong>全部都是 metrics 加上 attribute(label)</strong>,屬於方案 A 和方案 D 的能力範圍,不需要 trace 或完整 payload。</p>
+
+<h3>能 emit 的 counter 範例</h3>
+
+<div class="card">
+<pre><code># 每個 MCP server 被叫的次數
+ccxray.mcp.invocations_total {server="filesystem", tool="read_file"} = 1248
+ccxray.mcp.invocations_total {server="github",     tool="create_pr"} =   42
+ccxray.mcp.invocations_total {server="slack",      tool="post_message"} = 89
+
+# MCP 失敗次數
+ccxray.mcp.errors_total {server="github", error_type="timeout"} = 7
+
+# 內建 tool 使用次數
+ccxray.tool.invocations_total {tool="Bash",   provider="anthropic"} = 5230
+ccxray.tool.invocations_total {tool="Read",   provider="anthropic"} = 8120
+ccxray.tool.invocations_total {tool="Edit",   provider="anthropic"} = 1840
+ccxray.tool.invocations_total {tool="WebSearch", provider="anthropic"} = 92
+
+# Skill 觸發次數(從 system prompt 解析)
+ccxray.skill.activations_total {skill="release",   provider="anthropic"} = 12
+ccxray.skill.activations_total {skill="git-commit", provider="anthropic"} = 87
+
+# 每個 provider 的 session 數
+ccxray.sessions_total {provider="anthropic"} = 234
+ccxray.sessions_total {provider="codex"}     =  41
+
+# 維度可組合:依 model 拆 token 消耗
+ccxray.tokens.input_total {model="claude-opus-4-7", provider="anthropic"} = 12_500_000
+ccxray.tokens.input_total {model="claude-sonnet-4-6", provider="anthropic"} = 38_200_000</code></pre>
+</div>
+
+<h3>ccxray 已經有的資料來源</h3>
+
+<table>
+  <tr><th>想 emit 的 metric</th><th>ccxray 現有來源</th><th>難度</th></tr>
+  <tr>
+    <td><code>ccxray.tool.invocations_total</code></td>
+    <td>response 內的 <code>tool_use</code> block(已經在解析)</td>
+    <td class="good">低</td>
+  </tr>
+  <tr>
+    <td><code>ccxray.mcp.invocations_total</code></td>
+    <td>tool name 以 <code>mcp__&lt;server&gt;__&lt;tool&gt;</code> 為前綴(已有命名規則)</td>
+    <td class="good">低</td>
+  </tr>
+  <tr>
+    <td><code>ccxray.skill.activations_total</code></td>
+    <td>system prompt 內的 skill 觸發 marker(<code>system-prompt.js</code> 已在解析)</td>
+    <td class="warn">中(需要確認 marker)</td>
+  </tr>
+  <tr>
+    <td><code>ccxray.sessions_total</code></td>
+    <td><code>store.js</code> 的 session 推斷</td>
+    <td class="good">低</td>
+  </tr>
+  <tr>
+    <td><code>ccxray.tokens.* / cost.*</code></td>
+    <td><code>pricing.js</code> + response usage 欄位</td>
+    <td class="good">低</td>
+  </tr>
+  <tr>
+    <td>依「使用者 / 團隊」拆分</td>
+    <td>需新增 <code>OTEL_RESOURCE_ATTRIBUTES=enduser.id=...</code> 設定指引</td>
+    <td class="warn">中(靠使用者設定環境變數)</td>
+  </tr>
+</table>
+
+<h3>更多管理者會在意的指標(全部都在方案 A 能力範圍)</h3>
+
+<p>以下指標 ccxray 都能從 HTTP 看到的資料推導出來,不需要 trace 或完整 payload:</p>
+
+<div class="grid-3">
+
+  <div class="card">
+    <h4>📈 生產力 / 採用</h4>
+    <ul>
+      <li><code>ccxray.users.active_daily</code><br><span class="muted">DAU / WAU,看推廣成效</span></li>
+      <li><code>ccxray.sessions.duration_seconds</code><br><span class="muted">histogram,session 平均時長</span></li>
+      <li><code>ccxray.turns_per_session</code><br><span class="muted">每 session 對話幾輪</span></li>
+      <li><code>ccxray.first_token_latency_ms</code><br><span class="muted">UX 體感速度</span></li>
+      <li><code>ccxray.agent_type.invocations</code><br><span class="muted">general / explore / plan / 自訂 subagent 各用幾次</span></li>
+    </ul>
+  </div>
+
+  <div class="card">
+    <h4>💰 成本效率</h4>
+    <ul>
+      <li><code>ccxray.cache.hit_ratio</code><br><span class="muted">cache_read / total_input,&lt; 70% 表示 prompt 設計有問題</span></li>
+      <li><code>ccxray.cost_per_session_usd</code><br><span class="muted">histogram,找出燒錢 outlier</span></li>
+      <li><code>ccxray.tokens.output_per_input_ratio</code><br><span class="muted">產出 / 輸入比,過低代表 context 浪費</span></li>
+      <li><code>ccxray.quota.burn_rate_pct</code><br><span class="muted">5h / 週 quota 燒到幾 %</span></li>
+      <li><code>ccxray.retries_total</code><br><span class="muted">retry 次數,間接成本</span></li>
+    </ul>
+  </div>
+
+  <div class="card">
+    <h4>🚨 品質 / 可靠度</h4>
+    <ul>
+      <li><code>ccxray.errors_total{type}</code><br><span class="muted">rate_limit / overloaded / timeout / 500</span></li>
+      <li><code>ccxray.stop_reason{reason}</code><br><span class="muted">end_turn / tool_use / max_tokens / stop_sequence</span></li>
+      <li><code>ccxray.max_tokens_hit_rate</code><br><span class="muted">被截斷率,高代表 UX 差</span></li>
+      <li><code>ccxray.latency_ms{model,p95}</code><br><span class="muted">各 model SLA</span></li>
+      <li><code>ccxray.aborted_total</code><br><span class="muted">使用者 ctrl-c / Esc 比例</span></li>
+    </ul>
+  </div>
+
+  <div class="card">
+    <h4>🧠 使用模式</h4>
+    <ul>
+      <li><code>ccxray.context.utilization_pct</code><br><span class="muted">histogram,context window 平均吃多滿</span></li>
+      <li><code>ccxray.auto_compact.triggered_total</code><br><span class="muted">壓縮觸發次數,代表「需要更大 context」</span></li>
+      <li><code>ccxray.subagent.invocations</code><br><span class="muted">主 agent vs Task 子 agent 比例</span></li>
+      <li><code>ccxray.tools_per_turn</code><br><span class="muted">每輪平均叫幾個 tool</span></li>
+      <li><code>ccxray.thinking.token_ratio</code><br><span class="muted">extended thinking 佔輸出比例</span></li>
+    </ul>
+  </div>
+
+  <div class="card">
+    <h4>🛠️ Tool / MCP 細節</h4>
+    <ul>
+      <li><code>ccxray.tool.latency_ms{tool}</code><br><span class="muted">推估 tool 執行時間(下一個 request 進來 − 上一個 response 結束)</span></li>
+      <li><code>ccxray.tool.result_size_bytes{tool}</code><br><span class="muted">tool 回傳大小,過大會吃 context</span></li>
+      <li><code>ccxray.tool.failures_total{tool,reason}</code><br><span class="muted">從 tool_result 的 is_error 解析</span></li>
+      <li><code>ccxray.mcp.unique_servers</code><br><span class="muted">使用者連了幾個 MCP server</span></li>
+      <li><code>ccxray.bash.command_pattern{cmd}</code><br><span class="muted">最常 bash 跑什麼(取第一個 token,有 cardinality 風險,需設白名單)</span></li>
+    </ul>
+  </div>
+
+  <div class="card">
+    <h4>🔒 治理 / 安全</h4>
+    <ul>
+      <li><code>ccxray.permission_mode.usage{mode}</code><br><span class="muted">default / acceptEdits / bypassPermissions(yolo) 比例</span></li>
+      <li><code>ccxray.dangerous_tool.invocations</code><br><span class="muted">rm -rf / force-push / drop table 偵測</span></li>
+      <li><code>ccxray.file_writes_total</code><br><span class="muted">Edit + Write 加總</span></li>
+      <li><code>ccxray.provider.distribution</code><br><span class="muted">Anthropic vs Codex vs Gemini 比例</span></li>
+      <li><code>ccxray.system_prompt.version_changes</code><br><span class="muted">agent system prompt 改了幾次(知道誰在自訂)</span></li>
+    </ul>
+  </div>
+
+</div>
+
+<div class="warn-box">
+<strong>Cardinality 警告:</strong>帶 <code>{user}</code>、<code>{cmd}</code>、<code>{file_path}</code> 等高基數 attribute 的 metric 會把後端 explode。設計時:
+<ul>
+  <li>低基數 labels(tool name, model, provider, error type)— 直接用</li>
+  <li>中基數(user, team)— 用 <code>OTEL_RESOURCE_ATTRIBUTES</code>,且設使用者上限</li>
+  <li>高基數(file path, full command, prompt text)— <strong>不要當 metric label</strong>,只能放 trace attribute 或 log event</li>
+</ul>
+</div>
+
+<h3>管理者可以做的 Grafana / Datadog 報表</h3>
+
+<div class="diagram">
+<pre class="mermaid">
+flowchart TB
+    subgraph Reports [典型管理報表]
+        R1["📊 每週各團隊 token 消耗 / 成本<br/>(by enduser.id)"]
+        R2["🔧 Top 10 最常用 tool<br/>(by tool name)"]
+        R3["🔌 各 MCP server 使用熱度<br/>(by server name)"]
+        R4["⚙️ Skill 採用率排行<br/>(用了 vs 沒用)"]
+        R5["💸 哪個 model 燒最多錢<br/>(by model + provider)"]
+        R6["🚨 MCP 失敗率告警<br/>(error rate > X%)"]
+    end
+
+    M["ccxray 送出的 metrics<br/>含 attributes:<br/>tool / mcp / skill / model / user"] --> Reports
+
+    style M fill:#88c0d0,stroke:#5e81ac,color:#0f1419
+</pre>
+</div>
+
+<div class="key">
+<strong>關鍵洞察:</strong>「用了什麼 / 用幾次」這類問題 <strong>只需要 metrics(方案 A 的核心),不需要 trace 或 payload</strong>。Cardinality 控制好(tool 名稱、MCP server 名稱是有限集合),即使免費 tier 的 Grafana / Prometheus 都吃得下。
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="reco">8. 建議路線</h2>
+
+<div class="key">
+<strong>建議分兩階段:第一階段做方案 A(含管理面 metrics),第二階段升級到方案 D。</strong>
+</div>
+
+<h3>第一階段:方案 A — 多面向 Metrics(1–2 週)</h3>
+<ul>
+  <li>cost / token counter</li>
+  <li>tool / MCP / skill 使用次數 counter(管理者視角)</li>
+  <li>session / provider 維度</li>
+  <li>支援 <code>OTEL_RESOURCE_ATTRIBUTES</code> 讓使用者標記 team / project / user</li>
+  <li>價值:Grafana 一接,管理報表立刻有</li>
+</ul>
+
+<h3>第二階段:升級到方案 D — 加 trace + 反查(再 2–3 天)</h3>
+<ul>
+  <li>每個 llm_request 開一個 metadata-only span</li>
+  <li>span 帶 <code>ccxray.entry_id</code> + <code>dashboard_url</code></li>
+  <li>新增 <code>/entry/:id</code> deep-link 路由</li>
+  <li>價值:Grafana 看到異常 → 一鍵跳回本地看完整 payload</li>
+</ul>
+
+<h3>不建議做的</h3>
+<ul>
+  <li class="muted"><strong>方案 B 的 synthetic tool span 時間</strong>:時間不準,容易誤導,且跟 CLI 內建 OTel 直接打架</li>
+  <li class="muted"><strong>方案 C 的完整 payload 外送</strong>:隱私風險高,跟 ccxray 自身定位重疊。若使用者真要這個,應該獨立做「ccxray log 上傳到 S3 / 自建後端」的功能,而不是塞進 OTel pipeline</li>
+</ul>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="premortem">9. 事前驗屍與解方</h2>
+
+<p>建構前先想像「半年後失敗了會是因為什麼?」每題用 10 分制加權評估,只接受 ≥ 9 分方案。共 10 題(1 題跳過為可接受風險,2 題後續掃描補充),9 題解方全數通過。</p>
+
+<style>
+  .premortem-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 14px; margin: 16px 0; }
+  .pm-card { background: var(--panel); border: 1px solid var(--border); border-radius: 8px; padding: 16px 18px; }
+  .pm-card h4 { margin: 0 0 8px; font-size: 14px; color: var(--text); display: flex; align-items: center; gap: 8px; }
+  .pm-card .sev { font-size: 11px; padding: 1px 7px; border-radius: 10px; }
+  .pm-card .sev.high { background: rgba(191, 97, 106, 0.2); color: var(--danger); border: 1px solid var(--danger); }
+  .pm-card .sev.mid  { background: rgba(235, 203, 139, 0.18); color: var(--warn); border: 1px solid var(--warn); }
+  .pm-card .sev.low  { background: rgba(163, 190, 140, 0.18); color: var(--accent-2); border: 1px solid var(--accent-2); }
+  .pm-card .sev.skip { background: var(--panel-2); color: var(--muted); border: 1px solid var(--border); }
+  .pm-card .score { margin-left: auto; font-family: "SF Mono", Menlo, monospace; font-size: 12px; color: var(--accent-2); font-weight: 600; }
+  .pm-card .score.skip { color: var(--muted); font-weight: 400; }
+  .pm-card .problem { font-size: 12.5px; color: var(--muted); font-style: italic; margin: 0 0 8px; }
+  .pm-card .sol { font-size: 13px; margin: 6px 0 4px; color: var(--accent); font-weight: 600; }
+  .pm-card .ver { font-size: 13px; margin: 8px 0 4px; color: var(--accent-2); font-weight: 600; }
+  .pm-card ul { padding-left: 18px; margin: 4px 0; font-size: 12.5px; }
+  .pm-card li { margin: 2px 0; }
+</style>
+
+<div class="premortem-grid">
+
+  <div class="pm-card">
+    <h4>#1 Cardinality 爆炸 <span class="sev high">高傷害</span><span class="score">9.4 / 10</span></h4>
+    <p class="problem">使用者把 enduser.id 設成 email、bash command 當 label,Grafana 帳號被限流</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li>Attribute key allow-list(View API)</li>
+      <li>Per-(metric, attribute) cardinality budget,超過改 <code>_overflow_</code></li>
+      <li><code>ccxray.metrics.overflow_total</code> sentinel + <code>ccxray status --metrics</code> 顯示用量</li>
+      <li>新 metric 必須註冊 schema,缺漏 CI fail</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:餵 51 unique values,assert 第 51 為 overflow</li>
+      <li>上線:overflow counter > 0 → 自動冒泡警示</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#2 沒人用,功能死掉 <span class="sev high">高傷害</span><span class="score">9.0 / 10</span></h4>
+    <p class="problem">半年後 &lt; 5% 使用者啟用 OTel,維護成本變沉沒成本</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li><code>ccxray --otel-demo</code> 本地一鍵起 Grafana,30 秒看到資料</li>
+      <li>README 90 秒接 Grafana 截圖教學</li>
+      <li>本地 heartbeat 統計使用率(不外送)</li>
+      <li>三個月 sunset clock:&lt; 10 個 GitHub 提及則停損</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:3 個新使用者走流程,中位數 &lt; 5 分鐘看到資料</li>
+      <li>上線:三個月 KPI 閘門明確</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#3 多機反查全壞 <span class="sev high">高傷害</span><span class="score">9.4 / 10</span></h4>
+    <p class="problem">Manager 在 Grafana 看到 trace,點 localhost 連結打不開(那是工程師的機器)</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li>Span 帶 entry_id + host + 50字 summary + local_url + 可選 public_url</li>
+      <li>Dashboard <code>/entry/:id</code> 找不到時優雅降級,顯示「在 host=X」提示</li>
+      <li>文件明說:個人 / 小團隊 / 大團隊各自的反查路徑</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:CI 雙 ccxray instance 模擬跨機反查</li>
+      <li>上線:<code>deeplink_resolved_total{outcome}</code> 追蹤 wrong_host 比例</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#4 CLI OTel 衝突 → 雙重計費 <span class="sev mid">中傷害</span><span class="score">9.5 / 10</span></h4>
+    <p class="problem">使用者同時開 CLI 和 ccxray OTel,token 算兩倍,budget 警報全錯</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li>強制 <code>ccxray.*</code> namespace,不模仿 <code>claude_code.*</code> 欄位</li>
+      <li>偵測 <code>CLAUDE_CODE_ENABLE_TELEMETRY</code> 進入 complement mode,印警告</li>
+      <li>每筆 emit 帶 <code>ccxray.source=ccxray-proxy</code> resource attribute</li>
+      <li><code>ccxray.reconciliation.token_diff_pct</code>:跟 CLI 對帳的差異</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:雙開模式 fixture 測試,assert source attribute 分得開</li>
+      <li>上線:reconciliation diff > 5% 警報</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#5 管理者誤用 metric 監控個人 <span class="sev high">高傷害</span><span class="score">9.7 / 10</span></h4>
+    <p class="problem">Team lead 拿使用次數開檢討會,工程師集體棄用 ccxray</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li>三層 tier:<strong>預設 OFF</strong> / 專案匿名 / 個人具名</li>
+      <li>專案是上限,個人是下限,個人可隨時降級退出</li>
+      <li>個人具名走 <code>.ccxray.user.json</code>(gitignore),不入 repo</li>
+      <li>啟動 banner + <code>ccxray status --otel</code> + <code>ccxray otel preview</code> dry-run</li>
+      <li>文件明寫:不要用這些 metric 評估個人績效</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:tier 升降 4 種組合矩陣全測</li>
+      <li>上線:<code>tier_distribution</code> 追蹤採用率,tier 2 &lt; 5% 強化文件</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#6 Parser drift(skill / MCP / tool)<span class="sev mid">中傷害</span><span class="score">9.4 / 10</span></h4>
+    <p class="problem">Claude Code 改 prompt 格式,skill detector 全 0,半年沒人發現</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li>Schema 化 parser(<code>parsers/*.schema.json</code> 帶版本)</li>
+      <li>Snapshot fixtures:每 provider 一組固定 request/response</li>
+      <li>Sentinel metrics:<code>ccxray.parser.unknown_*_total</code> — 未識別不是 0,是「看到了但分類不了」</li>
+      <li>對帳 invariants:tool_use block count 必對得起 extracted count</li>
+      <li>Parser 包 try/catch,壞掉不影響 ccxray 核心</li>
+      <li><code>ccxray parser report</code> 命令一鍵看 unknown top 10</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:餵未知 tool → assert sentinel ++,assert 不 throw</li>
+      <li>上線:reconciliation_mismatch > 0 = bug,unknown_* 持續 7 天自動建議檢查</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#7 Bundle size 膨脹 <span class="sev skip">可接受</span><span class="score skip">— 跳過</span></h4>
+    <p class="problem">@opentelemetry/sdk-node + auto-instrumentations 把 ccxray 從 3MB 變 18MB</p>
+    <div class="sol">處置</div>
+    <ul>
+      <li>使用者評估為可接受風險,跳過正式評估</li>
+      <li>實作時自我約束:只 import 必要模組(api、sdk-metrics、exporter-otlp-http),不引 auto-instrumentations</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#8 Hub mode env 傳遞 <span class="sev low">低傷害</span><span class="score">9.5 / 10</span></h4>
+    <p class="problem">使用者改 env 重新跑,但 hub 還在背景跑舊設定,以為改好實際沒送對地方</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li>業務 OTel 走 <strong>client 端</strong>,不走 hub(hub 只負責 proxy + SSE broadcast)</li>
+      <li>每個 client 自己讀 <code>.ccxray.json</code> + 個人 config + env</li>
+      <li>不同 tier / endpoint 在同一個 hub 下自然共存</li>
+      <li>Hub 自己另開 <code>ccxray.hub.*</code> 運維 metric(uptime / requests / clients)</li>
+      <li><code>ccxray status</code> 顯示每個 client 的 tier 和 env 一致性</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:兩個 client 不同 config,同 hub,assert 各送各的</li>
+      <li>上線:<code>env_inconsistency_total</code> 追蹤「改了沒重啟」累積</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#11 Collector down 時記憶體 / 行為 <span class="sev mid">中傷害</span><span class="score">9.4 / 10</span></h4>
+    <p class="problem">Collector 掛掉,OTel SDK 無限重試,buffer 堆爆把 ccxray OOM</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li>Bounded queue(2048),滿了 drop oldest</li>
+      <li>Circuit breaker:連續 5 次失敗 → open 60s → half-open 試探 → 失敗則 backoff(60→120→240→600s)</li>
+      <li>State + dropped 計數寫本地 log,不送網路(因為網路本來就斷)</li>
+      <li>設計選擇:丟資料 > 拖垮 ccxray,文件明說</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:mock collector 回 500,assert memory 不增長、drop counter ++</li>
+      <li>上線:<code>circuit_breaker_open_seconds</code> 累積長 = 持續問題</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#12 Config secret 風險 <span class="sev mid">中傷害</span><span class="score">9.5 / 10</span></h4>
+    <p class="problem">使用者把 Authorization token 寫進 .ccxray.json,commit 進 git</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li><code>${ENV_VAR}</code> 插值,token 只能在 env</li>
+      <li>Schema 拒絕看起來像 secret 的字面值(Bearer、JWT、ghp_ 等 pattern)</li>
+      <li>第一次生成 <code>.ccxray.json</code> 時自動加 <code>.gitignore</code> 提醒</li>
+      <li><code>ccxray status</code> 掃 git tracked config 是否含明文 secret</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:餵 Bearer 字面值 → schema 拒絕並給修正建議</li>
+      <li>實作:餵 <code>${TOKEN}</code> 但 env 未設 → 啟動失敗</li>
+    </ul>
+  </div>
+
+  <div class="pm-card">
+    <h4>#13 OTel 失敗 fallback 策略 <span class="sev mid">中傷害</span><span class="score">9.7 / 10</span></h4>
+    <p class="problem">OTel config 寫錯或 collector 掛,ccxray 整個跑不起來</p>
+    <div class="sol">解方</div>
+    <ul>
+      <li>三層失敗:config error(啟動失敗)/ init error(降級,ccxray 仍跑)/ runtime error(由 #11 處理)</li>
+      <li>狀態機:<code>disabled / active / degraded / circuit_open</code></li>
+      <li><code>~/.ccxray/otel.log</code> 紀錄最近 100 條失敗,自動 rotate</li>
+      <li>核心原則:OTel 是增強,不是必需。網路問題不擋,config 錯擋</li>
+    </ul>
+    <div class="ver">驗證</div>
+    <ul>
+      <li>實作:餵壞 endpoint URL → assert ccxray 仍啟動、proxy 仍轉發、status 標 degraded</li>
+      <li>上線:<code>otel.state{state}</code> 看 degraded 比例</li>
+    </ul>
+  </div>
+
+</div>
+
+<h3>共用基礎設施</h3>
+
+<p>#11–#13 共用同一組失敗處理框架,可降低總工時:</p>
+
+<div class="card">
+<pre><code>server/otel-health.js        # 失敗處理框架(共用)
+  ├─ State machine (active / degraded / circuit_open / disabled)
+  ├─ Bounded queue + drop counter
+  ├─ Circuit breaker
+  ├─ Local log writer (~/.ccxray/otel.log)
+  └─ Status reporter (餵給 ccxray status 命令)
+
+server/config-loader.js      # 配置載入(共用)
+  ├─ JSON Schema 驗證
+  ├─ ${ENV_VAR} 插值
+  ├─ Secret pattern 偵測
+  └─ .gitignore 檢查</code></pre>
+</div>
+
+<h3>結論</h3>
+
+<div class="key">
+<strong>事前驗屍 10 解全部 ≥ 9 分,可進入實作階段。</strong>每題的「上線後監測 metric」本身也是 ccxray 的 OTel emit 內容 — 設計上自我驗證:這套系統<strong>能持續偵測自己有沒有壞掉</strong>。
+</div>
+
+<p class="muted" style="margin-top: 48px; border-top: 1px solid var(--border); padding-top: 16px;">
+本文件位於 <code>docs/otel-integration.html</code>。內容為決策前的探索筆記,實作時請以最終 PR 為準。
+</p>
+
+</div>
+
+<script src="https://cdn.jsdelivr.net/npm/mermaid@10/dist/mermaid.min.js"></script>
+<script>
+  mermaid.initialize({
+    startOnLoad: true,
+    theme: 'dark',
+    themeVariables: {
+      darkMode: true,
+      background: '#1a1f26',
+      primaryColor: '#88c0d0',
+      primaryTextColor: '#d8dee9',
+      primaryBorderColor: '#5e81ac',
+      lineColor: '#8b95a5',
+      secondaryColor: '#a3be8c',
+      tertiaryColor: '#232932',
+      fontFamily: '-apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif',
+    },
+    flowchart: { htmlLabels: true, curve: 'basis' },
+  });
+</script>
+
+</body>
+</html>
diff --git a/docs/otel-phase1-overview.html b/docs/otel-phase1-overview.html
new file mode 100644
index 0000000..f0a8b26
--- /dev/null
+++ b/docs/otel-phase1-overview.html
@@ -0,0 +1,1138 @@
+<!doctype html>
+<html lang="zh-Hant">
+<head>
+<meta charset="utf-8" />
+<title>OTel Phase 1 Change — Visual Overview</title>
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<style>
+  :root {
+    --bg: #0f1419;
+    --panel: #1a1f26;
+    --panel-2: #232932;
+    --border: #2a313c;
+    --text: #d8dee9;
+    --muted: #8b95a5;
+    --accent: #88c0d0;
+    --accent-2: #a3be8c;
+    --warn: #ebcb8b;
+    --danger: #bf616a;
+    --code-bg: #11161c;
+    --purple: #b48ead;
+  }
+  * { box-sizing: border-box; }
+  html, body { margin: 0; padding: 0; background: var(--bg); color: var(--text); }
+  body {
+    font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", "PingFang TC", "Microsoft JhengHei", sans-serif;
+    line-height: 1.7;
+    font-size: 15px;
+  }
+  .wrap { max-width: 1000px; margin: 0 auto; padding: 40px 28px 80px; }
+  h1 { font-size: 28px; border-bottom: 1px solid var(--border); padding-bottom: 12px; margin-bottom: 8px; }
+  h2 {
+    font-size: 20px;
+    margin-top: 56px;
+    color: var(--accent);
+    border-left: 3px solid var(--accent);
+    padding-left: 12px;
+  }
+  h3 { font-size: 16px; color: var(--accent-2); margin-top: 28px; }
+  p, li { color: var(--text); }
+  .muted { color: var(--muted); font-size: 13px; }
+  .lede { color: var(--muted); font-size: 14px; margin-bottom: 32px; }
+  code {
+    background: var(--code-bg);
+    border: 1px solid var(--border);
+    padding: 1px 6px;
+    border-radius: 3px;
+    font-family: "SF Mono", Menlo, Consolas, monospace;
+    font-size: 12.5px;
+    color: var(--warn);
+  }
+  .diagram {
+    background: var(--panel);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 20px;
+    margin: 18px 0;
+    overflow-x: auto;
+  }
+  .card {
+    background: var(--panel);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 18px 22px;
+    margin: 14px 0;
+  }
+  .toc {
+    background: var(--panel);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 14px 22px;
+    margin: 18px 0 32px;
+    font-size: 14px;
+  }
+  .toc a { color: var(--accent); text-decoration: none; }
+  .toc a:hover { text-decoration: underline; }
+  .src {
+    margin-top: 8px;
+    font-size: 12px;
+    color: var(--muted);
+    border-top: 1px dashed var(--border);
+    padding-top: 8px;
+  }
+  .src a {
+    color: var(--accent);
+    text-decoration: none;
+    font-family: "SF Mono", Menlo, Consolas, monospace;
+    font-size: 11.5px;
+  }
+  .src a:hover { text-decoration: underline; }
+  .src .src-row { display: block; margin: 2px 0; }
+  table { width: 100%; border-collapse: collapse; margin: 16px 0; font-size: 13px; }
+  th, td { border: 1px solid var(--border); padding: 8px 12px; text-align: left; vertical-align: top; }
+  th { background: var(--panel-2); color: var(--accent); font-weight: 600; }
+  .pill {
+    display: inline-block;
+    padding: 1px 8px;
+    border-radius: 10px;
+    font-size: 11px;
+    background: var(--panel-2);
+    border: 1px solid var(--border);
+    color: var(--muted);
+    margin-right: 6px;
+    font-family: "SF Mono", Menlo, Consolas, monospace;
+  }
+  .pill.tier0 { color: var(--muted); border-color: var(--muted); }
+  .pill.tier1 { color: var(--accent); border-color: var(--accent); }
+  .pill.tier2 { color: var(--purple); border-color: var(--purple); }
+
+  /* SVG flow animations */
+  .flow-svg { display: block; width: 100%; height: auto; }
+  .flow-svg text { font-family: -apple-system, "SF Pro Text", sans-serif; font-size: 12px; }
+  .node-box { fill: var(--panel-2); stroke: var(--border); stroke-width: 1.5; rx: 6; ry: 6; }
+  .node-box.accent { stroke: var(--accent); }
+  .node-box.accent2 { stroke: var(--accent-2); }
+  .node-box.purple { stroke: var(--purple); }
+  .node-box.warn { stroke: var(--warn); }
+  .node-box.danger { stroke: var(--danger); }
+  .node-label { fill: var(--text); }
+  .node-sub { fill: var(--muted); font-size: 10.5px; }
+  /* Connector lines between diagram nodes; modifier classes recolour or dash the stroke. */
+  .edge { stroke: var(--muted); stroke-width: 1.5; fill: none; }
+  .edge.accent { stroke: var(--accent); }
+  .edge.accent2 { stroke: var(--accent-2); }
+  /* The diagrams also use "edge purple", "edge warn" and "edge danger";
+     without these rules those connectors fall back to the muted stroke. */
+  .edge.purple { stroke: var(--purple); }
+  .edge.warn { stroke: var(--warn); }
+  .edge.danger { stroke: var(--danger); }
+  .edge.dashed { stroke-dasharray: 4 4; }
+
+  .dot-req { fill: var(--accent); }
+  .dot-res { fill: var(--accent-2); }
+  .dot-otel { fill: var(--purple); }
+  .dot-fail { fill: var(--danger); }
+
+  /* Pulse effect: grow slightly and fade at the midpoint, then return to rest. */
+  @keyframes pulse-anim {
+    from, to {
+      transform: scale(1);
+      opacity: 1;
+    }
+    50% {
+      transform: scale(1.18);
+      opacity: 0.7;
+    }
+  }
+  /* Scale around the element's own box centre rather than the SVG origin. */
+  .pulse {
+    animation: pulse-anim 1.6s ease-in-out infinite;
+    transform-origin: center;
+    transform-box: fill-box;
+  }
+
+  .legend { display: flex; gap: 18px; flex-wrap: wrap; font-size: 12px; color: var(--muted); margin-top: 8px; }
+  .legend-dot { display: inline-block; width: 10px; height: 10px; border-radius: 50%; margin-right: 6px; vertical-align: middle; }
+
+  .key {
+    border-left: 3px solid var(--accent-2);
+    background: rgba(163, 190, 140, 0.08);
+    padding: 10px 16px;
+    margin: 14px 0;
+    border-radius: 0 4px 4px 0;
+  }
+  .warn-box {
+    border-left: 3px solid var(--warn);
+    background: rgba(235, 203, 139, 0.08);
+    padding: 10px 16px;
+    margin: 14px 0;
+    border-radius: 0 4px 4px 0;
+  }
+  .scope-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 14px; margin: 16px 0; }
+  .scope-grid .card { margin: 0; }
+  .scope-grid h4 { margin: 0 0 8px; font-size: 14px; }
+  .scope-in h4 { color: var(--accent-2); }
+  .scope-out h4 { color: var(--muted); }
+</style>
+</head>
+<body>
+<div class="wrap">
+
+<h1>OTel Phase 1 — 視覺總覽</h1>
+<div class="lede">
+  本頁逐節呈現 <code>add-otel-metrics-phase1</code> 這個 OpenSpec change 的全貌。每個圖示下方標註 <span class="muted">「Source:」</span> 是該宣稱的依據出處,點擊可開啟對應 spec 檔案。內容嚴格依據 proposal / design / specs / tasks,無推測成分。
+</div>
+
+<div class="toc">
+  <strong>目錄</strong><br>
+  <a href="#bigpicture">1. 大圖:資料流向(動畫)</a><br>
+  <a href="#tiers">2. 三層 Tier Opt-in 模型</a><br>
+  <a href="#config">3. 配置檔案與 env 插值</a><br>
+  <a href="#health">4. OTel 健康狀態機</a><br>
+  <a href="#parser">5. Parser pipeline 與 sentinel</a><br>
+  <a href="#cli">6. 跟 CLI 內建 OTel 共存</a><br>
+  <a href="#cardinality">7. Cardinality budget</a><br>
+  <a href="#modules">8. 新檔案與模組關係</a><br>
+  <a href="#scope">9. Phase 1 範圍 vs Phase 2</a><br>
+  <a href="#metrics">10. 完整 metric 清單</a>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="bigpicture">1. 大圖:資料流向</h2>
+
+<p>ccxray 是 client / hub 雙進程架構。<strong>OTel 的初始化和 emit 都在 client 端</strong>,hub 純粹是 HTTP proxy + SSE broadcaster,不負責業務 metric。每個 client 自己讀自己的 <code>.ccxray.json</code> + <code>.ccxray.user.json</code>,所以同一個 hub 下不同 project 可以有不同 tier 和 endpoint。</p>
+
+<div class="diagram">
+<svg viewBox="0 0 900 360" class="flow-svg" aria-label="ccxray data flow with OTel">
+  <defs>
+    <marker id="arrow" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="6" markerHeight="6" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#8b95a5"></path>
+    </marker>
+    <marker id="arrow-accent" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="6" markerHeight="6" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#88c0d0"></path>
+    </marker>
+    <marker id="arrow-purple" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="6" markerHeight="6" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#b48ead"></path>
+    </marker>
+
+    <!-- Motion paths for the animated dots. Each one retraces the static edge
+         drawn for the same hop, so the dot rides on its line instead of
+         sliding across node boxes and their labels. -->
+    <path id="path-req" d="M 140,180 L 240,180" />
+    <path id="path-fwd" d="M 360,180 L 520,180" />
+    <path id="path-res" d="M 520,200 L 360,200" />
+    <path id="path-back" d="M 240,200 L 140,200" />
+    <path id="path-otel" d="M 320,240 L 320,290 L 720,290" />
+  </defs>
+
+  <!-- Claude Code -->
+  <rect class="node-box accent" x="20" y="150" width="120" height="60"></rect>
+  <text class="node-label" x="80" y="178" text-anchor="middle">Claude Code</text>
+  <text class="node-sub" x="80" y="195" text-anchor="middle">(or Codex)</text>
+
+  <!-- ccxray client -->
+  <rect class="node-box accent" x="240" y="120" width="120" height="120"></rect>
+  <text class="node-label" x="300" y="148" text-anchor="middle" font-weight="600">ccxray client</text>
+  <text class="node-sub" x="300" y="166" text-anchor="middle">forward.js</text>
+  <text class="node-sub" x="300" y="182" text-anchor="middle">store.js</text>
+  <text class="node-sub" x="300" y="198" text-anchor="middle" fill="#b48ead">otel.js</text>
+  <text class="node-sub" x="300" y="214" text-anchor="middle" fill="#b48ead">otel-health.js</text>
+  <text class="node-sub" x="300" y="230" text-anchor="middle" fill="#b48ead">config-loader.js</text>
+
+  <!-- hub (annotation) -->
+  <rect class="node-box" x="240" y="60" width="120" height="40" stroke-dasharray="4 3"></rect>
+  <text class="node-sub" x="300" y="78" text-anchor="middle">ccxray hub</text>
+  <text class="node-sub" x="300" y="92" text-anchor="middle" font-size="9.5px">(no business metrics; proxy + SSE only)</text>
+
+  <!-- Anthropic -->
+  <rect class="node-box" x="520" y="150" width="120" height="60"></rect>
+  <text class="node-label" x="580" y="178" text-anchor="middle">Anthropic API</text>
+  <text class="node-sub" x="580" y="195" text-anchor="middle">/v1/messages</text>
+
+  <!-- OTLP Collector -->
+  <rect class="node-box purple" x="720" y="270" width="160" height="40"></rect>
+  <text class="node-label" x="800" y="295" text-anchor="middle">OTLP Collector</text>
+
+  <!-- Grafana / etc -->
+  <rect class="node-box purple" x="720" y="320" width="160" height="30" stroke-dasharray="4 3"></rect>
+  <text class="node-sub" x="800" y="340" text-anchor="middle">Grafana / Datadog / Honeycomb</text>
+
+  <!-- Local logs -->
+  <rect class="node-box accent2" x="200" y="280" width="160" height="40"></rect>
+  <text class="node-label" x="280" y="305" text-anchor="middle">~/.ccxray/logs</text>
+
+  <!-- Static edges -->
+  <path class="edge accent" d="M 140,180 L 240,180" marker-end="url(#arrow-accent)" />
+  <path class="edge accent" d="M 360,180 L 520,180" marker-end="url(#arrow-accent)" />
+  <path class="edge accent2" d="M 520,200 L 360,200" marker-end="url(#arrow)" />
+  <path class="edge accent2" d="M 240,200 L 140,200" marker-end="url(#arrow)" />
+  <path class="edge" d="M 280,240 L 280,280" marker-end="url(#arrow)" />
+  <path class="edge purple dashed" d="M 320,240 L 320,290 L 720,290" marker-end="url(#arrow-purple)" />
+
+  <text class="node-sub" x="190" y="172">request</text>
+  <text class="node-sub" x="430" y="172">forward</text>
+  <text class="node-sub" x="430" y="221">response</text>
+  <text class="node-sub" x="190" y="221">SSE</text>
+  <text class="node-sub" x="500" y="282">metrics (OTLP)</text>
+  <text class="node-sub" x="220" y="266">local log</text>
+
+  <!-- Animated dots -->
+  <circle r="5" class="dot-req">
+    <animateMotion dur="3s" repeatCount="indefinite">
+      <mpath href="#path-req" />
+    </animateMotion>
+  </circle>
+  <circle r="5" class="dot-req">
+    <animateMotion dur="3s" begin="0.3s" repeatCount="indefinite">
+      <mpath href="#path-fwd" />
+    </animateMotion>
+  </circle>
+  <circle r="5" class="dot-res">
+    <animateMotion dur="3s" begin="1.5s" repeatCount="indefinite">
+      <mpath href="#path-res" />
+    </animateMotion>
+  </circle>
+  <circle r="5" class="dot-res">
+    <animateMotion dur="3s" begin="1.8s" repeatCount="indefinite">
+      <mpath href="#path-back" />
+    </animateMotion>
+  </circle>
+  <circle r="5" class="dot-otel">
+    <animateMotion dur="4s" begin="2.1s" repeatCount="indefinite">
+      <mpath href="#path-otel" />
+    </animateMotion>
+  </circle>
+
+</svg>
+
+<div class="legend">
+  <span><span class="legend-dot" style="background:#88c0d0"></span>request</span>
+  <span><span class="legend-dot" style="background:#a3be8c"></span>response (SSE)</span>
+  <span><span class="legend-dot" style="background:#b48ead"></span>OTel metric export</span>
+</div>
+</div>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § Client-side OTel SDK initialization
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/design.md">design.md</a> § D2. Client-side emit, not hub-side
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/proposal.md">proposal.md</a> § Impact (server/hub.js note)
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="tiers">2. 三層 Tier Opt-in 模型</h2>
+
+<p>tier 是「會送出多少資訊」的開關。<strong>預設 tier 0(完全不送)</strong>。專案 config 是上限,個人 config 是下限,工程師永遠可以單方面降級退出。</p>
+
+<div class="diagram">
+<svg viewBox="0 0 900 320" class="flow-svg" aria-label="three tiers">
+
+  <!-- Tier stack -->
+  <rect class="node-box" x="20" y="60" width="220" height="50" stroke="#8b95a5"></rect>
+  <text class="node-label" x="130" y="82" text-anchor="middle" font-weight="600">Tier 0 — disabled</text>
+  <text class="node-sub" x="130" y="100" text-anchor="middle">no SDK init, no network egress</text>
+
+  <rect class="node-box accent" x="20" y="130" width="220" height="50"></rect>
+  <text class="node-label" x="130" y="152" text-anchor="middle" font-weight="600">Tier 1 — project anonymous</text>
+  <text class="node-sub" x="130" y="170" text-anchor="middle">project.name + optional team</text>
+
+  <rect class="node-box purple" x="20" y="200" width="220" height="50"></rect>
+  <text class="node-label" x="130" y="222" text-anchor="middle" font-weight="600">Tier 2 — personal named</text>
+  <text class="node-sub" x="130" y="240" text-anchor="middle">+ enduser.id (engineer-chosen)</text>
+
+  <!-- Resolution box -->
+  <rect class="node-box accent2" x="320" y="90" width="280" height="170"></rect>
+  <text class="node-label" x="460" y="118" text-anchor="middle" font-weight="600">effective_tier =</text>
+  <text class="node-label" x="460" y="140" text-anchor="middle" font-weight="600" font-size="14px" fill="#a3be8c">min(project_tier, personal_tier)</text>
+
+  <text class="node-sub" x="340" y="170">project = upper bound</text>
+  <text class="node-sub" x="340" y="190">personal = lower bound (can only equal-or-downgrade)</text>
+  <text class="node-sub" x="340" y="215" fill="#ebcb8b">project=1, personal=2 → clamps to 1 + warning</text>
+  <text class="node-sub" x="340" y="235" fill="#a3be8c">project=2, personal=0 → effective 0 (unilateral opt-out)</text>
+
+  <!-- Side: examples -->
+  <rect class="node-box" x="640" y="60" width="240" height="240"></rect>
+  <text class="node-label" x="760" y="82" text-anchor="middle" font-weight="600">Resolution Matrix</text>
+
+  <line x1="660" y1="100" x2="860" y2="100" stroke="#2a313c" />
+  <text class="node-sub" x="660" y="120">project</text>
+  <text class="node-sub" x="730" y="120">personal</text>
+  <text class="node-sub" x="810" y="120">effective</text>
+
+  <text class="node-sub" x="660" y="142">—</text>
+  <text class="node-sub" x="730" y="142">—</text>
+  <text class="node-sub" x="810" y="142" fill="#8b95a5">0</text>
+
+  <text class="node-sub" x="660" y="160">1</text>
+  <text class="node-sub" x="730" y="160">—</text>
+  <text class="node-sub" x="810" y="160" fill="#88c0d0">1</text>
+
+  <text class="node-sub" x="660" y="178">1</text>
+  <text class="node-sub" x="730" y="178">0</text>
+  <text class="node-sub" x="810" y="178" fill="#8b95a5">0 (opt-out)</text>
+
+  <text class="node-sub" x="660" y="196">1</text>
+  <text class="node-sub" x="730" y="196">2</text>
+  <text class="node-sub" x="810" y="196" fill="#ebcb8b">1 (clamped)</text>
+
+  <text class="node-sub" x="660" y="214">2</text>
+  <text class="node-sub" x="730" y="214">2</text>
+  <text class="node-sub" x="810" y="214" fill="#b48ead">2 (with enduser.id)</text>
+
+  <text class="node-sub" x="660" y="232">2</text>
+  <text class="node-sub" x="730" y="232">0</text>
+  <text class="node-sub" x="810" y="232" fill="#8b95a5">0 (opt-out)</text>
+
+  <text class="node-sub" x="660" y="265" font-size="10px" font-style="italic">「—」 表示該層 config 不存在</text>
+  <text class="node-sub" x="660" y="280" font-size="10px" font-style="italic">missing = treat as that side absent</text>
+
+  <!-- arrows -->
+  <path class="edge dashed" d="M 240,155 L 320,155" marker-end="url(#arrow)" />
+
+</svg>
+</div>
+
+<div class="key">
+<strong>關鍵限制(spec § Personal config gitignore enforcement):</strong>如果 <code>.ccxray.user.json</code> 被 git tracked,ccxray <strong>拒絕載入個人 identity</strong>,並建議 <code>git rm --cached</code>。
+</div>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-tiers/spec.md">specs/otel-tiers/spec.md</a> § Three discrete tier values
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-tiers/spec.md">specs/otel-tiers/spec.md</a> § Tier resolution rule
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-tiers/spec.md">specs/otel-tiers/spec.md</a> § Engineer unilateral opt-out
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-tiers/spec.md">specs/otel-tiers/spec.md</a> § Personal config gitignore enforcement
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="config">3. 配置檔案與 env 插值</h2>
+
+<p>兩個檔案:<code>.ccxray.json</code>(專案層,checked into git)+ <code>.ccxray.user.json</code>(個人層,gitignored)。所有 string value 支援 <code>${VAR}</code> 從 process.env 替換。Schema 拒絕看起來像 secret 的字面值。</p>
+
+<div class="diagram">
+<svg viewBox="0 0 900 320" class="flow-svg" aria-label="config files and interpolation">
+  <defs>
+    <path id="interp-flow" d="M 470,100 L 600,100 L 600,165 L 720,165" />
+  </defs>
+
+  <!-- .ccxray.json -->
+  <rect class="node-box accent" x="20" y="40" width="260" height="170"></rect>
+  <text class="node-label" x="150" y="62" text-anchor="middle" font-weight="600">.ccxray.json</text>
+  <text class="node-sub" x="150" y="78" text-anchor="middle">(repo, in git)</text>
+  <text class="node-sub" x="40" y="105" font-family="monospace" font-size="11px">{</text>
+  <text class="node-sub" x="55" y="120" font-family="monospace" font-size="11px">"otel": {</text>
+  <text class="node-sub" x="70" y="135" font-family="monospace" font-size="11px">"tier": 2,</text>
+  <text class="node-sub" x="70" y="150" font-family="monospace" font-size="11px">"endpoint": "https://...",</text>
+  <text class="node-sub" x="70" y="165" font-family="monospace" font-size="11px">"headers": {</text>
+  <text class="node-sub" x="85" y="180" font-family="monospace" font-size="11px" fill="#ebcb8b">"Authorization": "Bearer ${TOKEN}"</text>
+  <text class="node-sub" x="70" y="195" font-family="monospace" font-size="11px">}</text>
+  <text class="node-sub" x="55" y="210" font-family="monospace" font-size="11px">}</text>
+
+  <!-- .ccxray.user.json -->
+  <rect class="node-box purple" x="20" y="230" width="260" height="80"></rect>
+  <text class="node-label" x="150" y="252" text-anchor="middle" font-weight="600">.ccxray.user.json</text>
+  <text class="node-sub" x="150" y="268" text-anchor="middle">(personal, gitignored)</text>
+  <text class="node-sub" x="40" y="290" font-family="monospace" font-size="11px">{ "otel": { "tier": 2, "identity": "alice",</text>
+  <text class="node-sub" x="40" y="304" font-family="monospace" font-size="11px">           "opt_in_acknowledged_at": "..." } }</text>
+
+  <!-- Loader -->
+  <rect class="node-box accent2" x="330" y="100" width="160" height="100"></rect>
+  <text class="node-label" x="410" y="125" text-anchor="middle" font-weight="600">config-loader.js</text>
+  <text class="node-sub" x="410" y="145" text-anchor="middle">1. Parse JSON</text>
+  <text class="node-sub" x="410" y="160" text-anchor="middle">2. Schema validate</text>
+  <text class="node-sub" x="410" y="175" text-anchor="middle">3. Interpolate ${VAR}</text>
+  <text class="node-sub" x="410" y="190" text-anchor="middle">4. Detect literal secrets</text>
+
+  <!-- env -->
+  <rect class="node-box warn" x="540" y="60" width="140" height="80" stroke-dasharray="4 3"></rect>
+  <text class="node-label" x="610" y="85" text-anchor="middle" font-weight="600">process.env</text>
+  <text class="node-sub" x="610" y="105" text-anchor="middle" font-family="monospace" font-size="11px">TOKEN=abc...</text>
+  <text class="node-sub" x="610" y="123" text-anchor="middle">(secret stays in env)</text>
+
+  <!-- Output -->
+  <rect class="node-box accent" x="720" y="130" width="160" height="80"></rect>
+  <text class="node-label" x="800" y="155" text-anchor="middle" font-weight="600">Loaded config</text>
+  <text class="node-sub" x="800" y="175" text-anchor="middle">effective_tier = 2</text>
+  <text class="node-sub" x="800" y="190" text-anchor="middle">Authorization: Bearer abc***</text>
+
+  <!-- Failure path -->
+  <rect class="node-box danger" x="720" y="240" width="160" height="60"></rect>
+  <text class="node-label" x="800" y="265" text-anchor="middle" font-weight="600" fill="#bf616a">Startup FAIL</text>
+  <text class="node-sub" x="800" y="282" text-anchor="middle">if literal Bearer, missing ${VAR},</text>
+  <text class="node-sub" x="800" y="295" text-anchor="middle">or schema error</text>
+
+  <!-- arrows -->
+  <path class="edge accent" d="M 280,125 L 330,125" marker-end="url(#arrow-accent)" />
+  <path class="edge purple" d="M 280,270 L 320,270 L 320,180 L 330,180" marker-end="url(#arrow-purple)" />
+  <path class="edge warn dashed" d="M 540,120 L 500,130" marker-end="url(#arrow)" />
+  <path class="edge accent" d="M 490,150 L 720,150" marker-end="url(#arrow-accent)" />
+  <path class="edge danger dashed" d="M 410,200 L 410,270 L 720,270" marker-end="url(#arrow)" />
+
+  <!-- Animated dot showing interpolation -->
+  <circle r="4" fill="#ebcb8b">
+    <animateMotion dur="2.4s" repeatCount="indefinite">
+      <mpath href="#interp-flow" />
+    </animateMotion>
+  </circle>
+
+</svg>
+</div>
+
+<!-- Config input vs. loader outcome. Header cells carry scope="col" so assistive
+     technology can associate each data cell with its column heading. -->
+<table>
+  <thead>
+    <tr><th scope="col">輸入</th><th scope="col">結果</th></tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td><code>"Authorization": "Bearer ${TOKEN}"</code> + <code>TOKEN=abc123</code></td>
+      <td style="color: var(--accent-2)">✓ 載入,實際值 <code>Bearer abc123</code></td>
+    </tr>
+    <tr>
+      <td><code>"Authorization": "Bearer ${MISSING}"</code> + 未設 env</td>
+      <td style="color: var(--danger)">✗ Startup 失敗,訊息含 file path / line / 變數名 <code>MISSING</code></td>
+    </tr>
+    <tr>
+      <td><code>"Authorization": "Bearer abc123longtokenvalue..."</code></td>
+      <td style="color: var(--danger)">✗ Schema 拒絕,建議改用 <code>${ENV_VAR}</code></td>
+    </tr>
+    <tr>
+      <td>JSON syntax error</td>
+      <td style="color: var(--danger)">✗ Startup 失敗,訊息含 line / column</td>
+    </tr>
+  </tbody>
+</table>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-config/spec.md">specs/otel-config/spec.md</a> § Project and personal config files
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-config/spec.md">specs/otel-config/spec.md</a> § Environment variable interpolation
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-config/spec.md">specs/otel-config/spec.md</a> § Literal-secret rejection
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-config/spec.md">specs/otel-config/spec.md</a> § Config error fails fast at startup
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="health">4. OTel 健康狀態機</h2>
+
+<p>四個主要狀態(下圖實線框),外加 circuit breaker 試探用的暫態 <code>half_open</code>(虛線框);狀態之間只能透過記錄在 spec 的條件轉換。<strong>核心承諾:OTel 失敗永遠不會擋 ccxray proxy</strong>。</p>
+
+<div class="diagram">
+<svg viewBox="0 0 940 420" class="flow-svg" aria-label="OTel health state machine">
+  <defs>
+    <marker id="sm-arrow" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="7" markerHeight="7" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#8b95a5"></path>
+    </marker>
+    <marker id="sm-arrow-green" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="7" markerHeight="7" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#a3be8c"></path>
+    </marker>
+    <marker id="sm-arrow-red" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="7" markerHeight="7" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#bf616a"></path>
+    </marker>
+  </defs>
+
+  <!-- start point -->
+  <circle cx="470" cy="28" r="7" fill="#d8dee9"></circle>
+  <text class="node-sub" x="488" y="32" font-size="11px">[start]</text>
+
+  <!-- top row: disabled, active, degraded -->
+  <rect class="node-box" x="40" y="80" width="200" height="60" rx="30" ry="30"></rect>
+  <text class="node-label" x="140" y="106" text-anchor="middle" font-weight="700">disabled</text>
+  <text class="node-sub" x="140" y="124" text-anchor="middle">no SDK, no egress</text>
+
+  <rect class="node-box accent" x="370" y="80" width="200" height="60" rx="30" ry="30"></rect>
+  <text class="node-label" x="470" y="106" text-anchor="middle" font-weight="700">active</text>
+  <text class="node-sub" x="470" y="124" text-anchor="middle">SDK init OK, exporting</text>
+
+  <rect class="node-box warn" x="700" y="80" width="200" height="60" rx="30" ry="30"></rect>
+  <text class="node-label" x="800" y="106" text-anchor="middle" font-weight="700">degraded</text>
+  <text class="node-sub" x="800" y="124" text-anchor="middle">init failed; proxy still OK</text>
+
+  <!-- bottom row: circuit_open, half_open -->
+  <rect class="node-box danger" x="290" y="250" width="200" height="60" rx="30" ry="30"></rect>
+  <text class="node-label" x="390" y="276" text-anchor="middle" font-weight="700">circuit_open</text>
+  <text class="node-sub" x="390" y="294" text-anchor="middle">exports paused, cooldown</text>
+
+  <rect class="node-box" x="570" y="250" width="200" height="60" rx="30" ry="30" stroke-dasharray="4 3"></rect>
+  <text class="node-label" x="670" y="276" text-anchor="middle" font-weight="700">half_open</text>
+  <text class="node-sub" x="670" y="294" text-anchor="middle">single trial export</text>
+
+  <!-- start → disabled -->
+  <path class="edge" d="M 462,34 C 380,55 240,72 232,80" marker-end="url(#sm-arrow)" />
+  <text class="node-sub" x="290" y="58" font-size="11px">tier=0 or no OTel pkg</text>
+
+  <!-- start → active -->
+  <path class="edge" stroke="#a3be8c" d="M 470,36 L 470,80" marker-end="url(#sm-arrow-green)" />
+  <text class="node-sub" x="478" y="60" fill="#a3be8c" font-size="11px">tier≥1, init OK</text>
+
+  <!-- start → degraded -->
+  <path class="edge" d="M 478,34 C 560,55 700,72 708,80" marker-end="url(#sm-arrow)" />
+  <text class="node-sub" x="600" y="58" font-size="11px">tier≥1, init fails</text>
+
+  <!-- active → circuit_open -->
+  <path class="edge" stroke="#bf616a" d="M 440,140 L 405,250" marker-end="url(#sm-arrow-red)" />
+  <text class="node-sub" x="290" y="200" fill="#bf616a" font-size="11px">5 consecutive failures</text>
+
+  <!-- half_open → active (trial OK) -->
+  <path class="edge" stroke="#a3be8c" d="M 620,250 C 560,200 520,160 500,142" marker-end="url(#sm-arrow-green)" />
+  <text class="node-sub" x="540" y="200" fill="#a3be8c" font-size="11px">trial OK</text>
+
+  <!-- circuit_open → half_open -->
+  <path class="edge" d="M 490,275 L 570,275" marker-end="url(#sm-arrow)" />
+  <text class="node-sub" x="498" y="268" font-size="11px">cooldown elapsed</text>
+
+  <!-- half_open → circuit_open (fail) -->
+  <path class="edge" stroke="#bf616a" d="M 570,290 C 540,310 510,310 490,300" marker-end="url(#sm-arrow-red)" />
+  <text class="node-sub" x="500" y="328" fill="#bf616a" font-size="11px">trial fails → backoff</text>
+
+  <!-- note: queue overflow while active -->
+  <rect x="610" y="160" width="280" height="60" rx="6" fill="rgba(136,192,208,0.06)" stroke="#88c0d0" stroke-dasharray="3 3"></rect>
+  <text class="node-sub" x="624" y="180" font-size="11px" font-weight="600" fill="#88c0d0">while active, queue full:</text>
+  <text class="node-sub" x="624" y="197" font-size="11px">drop oldest + exports_dropped_total++</text>
+  <text class="node-sub" x="624" y="212" font-size="11px" font-style="italic">(no state change)</text>
+
+  <!-- note: cooldown formula -->
+  <rect x="50" y="340" width="320" height="60" rx="6" fill="rgba(191,97,106,0.06)" stroke="#bf616a" stroke-dasharray="3 3"></rect>
+  <text class="node-sub" x="64" y="360" font-size="11px" font-weight="600" fill="#bf616a">cooldown formula:</text>
+  <text class="node-sub" x="64" y="377" font-size="11px" font-family="monospace">next = min(previous * 2, 600s)</text>
+  <text class="node-sub" x="64" y="392" font-size="11px" font-style="italic">starting from 60s after first trip</text>
+
+  <!-- note: degraded -->
+  <rect x="540" y="340" width="360" height="60" rx="6" fill="rgba(235,203,139,0.06)" stroke="#ebcb8b" stroke-dasharray="3 3"></rect>
+  <text class="node-sub" x="554" y="360" font-size="11px" font-weight="600" fill="#ebcb8b">when degraded:</text>
+  <text class="node-sub" x="554" y="377" font-size="11px">ccxray proxy keeps working; no further OTel attempts;</text>
+  <text class="node-sub" x="554" y="392" font-size="11px">visible in ccxray status --otel until process restart.</text>
+
+</svg>
+</div>
+
+<h3>失敗分層</h3>
+
+<table>
+  <tr><th>失敗類型</th><th>例子</th><th>處理</th></tr>
+  <tr>
+    <td>Config error</td>
+    <td>JSON syntax 錯、schema 違規、<code>${VAR}</code> 未解</td>
+    <td style="color: var(--danger)">啟動<strong>失敗</strong>(exit code != 0)</td>
+  </tr>
+  <tr>
+    <td>Init error</td>
+    <td>Endpoint URL 格式不合法</td>
+    <td style="color: var(--warn)">轉 <strong>degraded</strong>,ccxray 正常,status 顯示錯誤</td>
+  </tr>
+  <tr>
+    <td>Runtime error</td>
+    <td>Collector unreachable、auth fail、timeout</td>
+    <td style="color: var(--accent)">由 <strong>circuit breaker</strong> 處理,exponential backoff</td>
+  </tr>
+</table>
+
+<h3>持久化與容量限制</h3>
+
+<ul>
+  <li><code>~/.ccxray/otel.log</code> append,1 MB rotation,5 file retention(預設)</li>
+  <li>Export queue 預設 2048,滿 → drop oldest + <code>ccxray.otel.exports_dropped_total{signal}</code> ++</li>
+  <li>SDK shutdown 硬上限 2 秒,逾時強制 exit</li>
+</ul>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md">specs/otel-health/spec.md</a> § Four-state OTel health machine
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md">specs/otel-health/spec.md</a> § Bounded export queue with drop-oldest semantics
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md">specs/otel-health/spec.md</a> § Circuit breaker with exponential backoff
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md">specs/otel-health/spec.md</a> § Failure log on local disk
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md">specs/otel-health/spec.md</a> § Never-block guarantee for the proxy
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md">specs/otel-health/spec.md</a> § Config errors fail fast, init/runtime errors degrade
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="parser">5. Parser pipeline 與 sentinel</h2>
+
+<p>tool / MCP / skill / agent-type 的解析規則,從散落各處的 inline 字串改成 <strong>versioned JSON schemas</strong>。每筆 entry 都跑 reconciliation invariants;未識別的事件不會被默默算成 0,而是 increment 對應的 sentinel counter,並寫進 <code>~/.ccxray/parser-drift.log</code>。</p>
+
+<div class="diagram">
+<svg viewBox="0 0 900 400" class="flow-svg" aria-label="parser pipeline">
+  <defs>
+    <path id="parse-known" d="M 320,100 L 460,100 L 460,170 L 600,170" />
+    <path id="parse-unknown" d="M 320,140 L 460,140 L 460,260 L 600,260" />
+    <path id="parse-mismatch" d="M 460,170 L 460,330 L 600,330" />
+  </defs>
+
+  <!-- Input -->
+  <rect class="node-box accent" x="20" y="80" width="200" height="80"></rect>
+  <text class="node-label" x="120" y="105" text-anchor="middle" font-weight="600">Response from upstream</text>
+  <text class="node-sub" x="120" y="125" text-anchor="middle">tool_use blocks,</text>
+  <text class="node-sub" x="120" y="142" text-anchor="middle">usage tokens, etc.</text>
+
+  <!-- Parser dispatch -->
+  <rect class="node-box accent2" x="260" y="60" width="180" height="120"></rect>
+  <text class="node-label" x="350" y="85" text-anchor="middle" font-weight="600">Parser dispatch</text>
+  <text class="node-sub" x="350" y="105" text-anchor="middle">server/parsers/index.js</text>
+  <text class="node-sub" x="350" y="125" text-anchor="middle" font-size="11px">anthropic-tools.schema.json</text>
+  <text class="node-sub" x="350" y="140" text-anchor="middle" font-size="11px">anthropic-skills.schema.json</text>
+  <text class="node-sub" x="350" y="155" text-anchor="middle" font-size="11px">mcp-tools.schema.json</text>
+  <text class="node-sub" x="350" y="170" text-anchor="middle" font-size="11px">codex-tools.schema.json</text>
+
+  <!-- Known path -->
+  <rect class="node-box accent" x="600" y="140" width="260" height="60"></rect>
+  <text class="node-label" x="730" y="165" text-anchor="middle" font-weight="600">Recognized → metrics</text>
+  <text class="node-sub" x="730" y="184" text-anchor="middle">ccxray.tool.invocations_total{tool}, etc.</text>
+
+  <!-- Unknown path -->
+  <rect class="node-box warn" x="600" y="230" width="260" height="60"></rect>
+  <text class="node-label" x="730" y="252" text-anchor="middle" font-weight="600">Unknown → sentinel</text>
+  <text class="node-sub" x="730" y="270" text-anchor="middle">ccxray.parser.unknown_*_total ++</text>
+  <text class="node-sub" x="730" y="285" text-anchor="middle">+ append to parser-drift.log</text>
+
+  <!-- Reconciliation mismatch -->
+  <rect class="node-box danger" x="600" y="305" width="260" height="60"></rect>
+  <text class="node-label" x="730" y="328" text-anchor="middle" font-weight="600">Invariant fail → mismatch</text>
+  <text class="node-sub" x="730" y="345" text-anchor="middle">tool_use count ≠ extracted count?</text>
+  <text class="node-sub" x="730" y="360" text-anchor="middle">ccxray.parser.reconciliation_mismatch_total ++</text>
+
+  <!-- arrows -->
+  <path class="edge accent" d="M 220,120 L 260,120" marker-end="url(#arrow-accent)" />
+  <path class="edge accent" d="M 440,110 L 600,170" marker-end="url(#arrow-accent)" />
+  <path class="edge warn" d="M 440,140 L 600,260" marker-end="url(#arrow)" />
+  <!-- reconciliation runs AFTER extraction; arrow from Recognized box to Invariant fail, routed along the right edge so it does not cross the Unknown box -->
+  <path class="edge dashed" d="M 860,170 L 885,170 L 885,335 L 860,335" marker-end="url(#arrow)" />
+  <text class="node-sub" x="878" y="219" text-anchor="end" font-size="10px" font-style="italic">post-extract check</text>
+
+  <!-- Animated dots -->
+  <circle r="5" class="dot-res">
+    <animateMotion dur="2.4s" repeatCount="indefinite">
+      <mpath href="#parse-known" />
+    </animateMotion>
+  </circle>
+  <circle r="5" fill="#ebcb8b">
+    <animateMotion dur="3s" begin="0.8s" repeatCount="indefinite">
+      <mpath href="#parse-unknown" />
+    </animateMotion>
+  </circle>
+</svg>
+</div>
+
+<h3>每個 schema 帶的元資料</h3>
+
+<div class="card">
+<pre><code>{
+  "version": "1.0.0",
+  "last_verified_against": "2026-05-10",
+  "patterns": [ ... ],
+  "examples": [ ... ]
+}</code></pre>
+</div>
+
+<h3>Error isolation</h3>
+
+<p>所有 parser 包在 try/catch。若拋例外 → <code>ccxray.parser.error_total{parser,error_type}</code> ++,該 entry 仍寫進本地 log,該 entry 對應 metric/span 帶 <code>ccxray.parser.degraded=true</code>。<strong>Parser 失敗不會影響 proxy 路徑</strong>。</p>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/parser-schemas/spec.md">specs/parser-schemas/spec.md</a> § Versioned parser schemas per concern and provider
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/parser-schemas/spec.md">specs/parser-schemas/spec.md</a> § Sentinel counters for unknown tokens
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/parser-schemas/spec.md">specs/parser-schemas/spec.md</a> § Reconciliation invariants
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/parser-schemas/spec.md">specs/parser-schemas/spec.md</a> § Parser error isolation
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="cli">6. 跟 CLI 內建 OTel 共存</h2>
+
+<p>Claude Code CLI 也內建 OTel。ccxray 偵測到 <code>CLAUDE_CODE_ENABLE_TELEMETRY=1</code> 時進入 complement mode,所有 emit 加 <code>ccxray.cli_otel_active=true</code> attribute。<strong>ccxray 永遠不關自己的 emit</strong>(因為 CLI 沒 Codex 支援、ccxray 看的是 HTTP truth、且兩邊 diff 本身是價值訊號)。</p>
+
+<div class="diagram">
+<svg viewBox="0 0 900 280" class="flow-svg" aria-label="CLI coexistence">
+
+  <!-- Standalone mode -->
+  <rect class="node-box accent" x="20" y="40" width="400" height="200"></rect>
+  <text class="node-label" x="220" y="65" text-anchor="middle" font-weight="600">Standalone mode</text>
+  <text class="node-sub" x="220" y="82" text-anchor="middle">CLAUDE_CODE_ENABLE_TELEMETRY 未設</text>
+
+  <rect class="node-box" x="40" y="105" width="160" height="50"></rect>
+  <text class="node-sub" x="120" y="128" text-anchor="middle">Claude Code CLI</text>
+  <text class="node-sub" x="120" y="144" text-anchor="middle" fill="#bf616a">(no OTel)</text>
+
+  <rect class="node-box accent" x="240" y="105" width="160" height="50"></rect>
+  <text class="node-sub" x="320" y="128" text-anchor="middle">ccxray emits</text>
+  <text class="node-sub" x="320" y="144" text-anchor="middle" font-family="monospace" font-size="11px">ccxray.*</text>
+
+  <text class="node-sub" x="220" y="190" text-anchor="middle">→ Single source of truth</text>
+  <text class="node-sub" x="220" y="208" text-anchor="middle">→ Banner: "ccxray OTel tier: 1 (anonymous)"</text>
+
+  <!-- Complement mode -->
+  <rect class="node-box purple" x="460" y="40" width="420" height="200"></rect>
+  <text class="node-label" x="670" y="65" text-anchor="middle" font-weight="600">Complement mode</text>
+  <text class="node-sub" x="670" y="82" text-anchor="middle">CLAUDE_CODE_ENABLE_TELEMETRY=1</text>
+
+  <rect class="node-box" x="480" y="105" width="160" height="50"></rect>
+  <text class="node-sub" x="560" y="125" text-anchor="middle">CLI emits</text>
+  <text class="node-sub" x="560" y="142" text-anchor="middle" font-family="monospace" font-size="11px">claude_code.*</text>
+
+  <rect class="node-box accent" x="680" y="105" width="180" height="50"></rect>
+  <text class="node-sub" x="770" y="125" text-anchor="middle">ccxray emits</text>
+  <text class="node-sub" x="770" y="142" text-anchor="middle" font-family="monospace" font-size="11px">ccxray.* + cli_otel_active=true</text>
+
+  <text class="node-sub" x="670" y="180" text-anchor="middle">→ Reconciliation: ccxray.reconciliation.token_diff_pct{model}</text>
+  <text class="node-sub" x="670" y="198" text-anchor="middle">→ Both flow to user's collector, distinguishable via</text>
+  <text class="node-sub" x="670" y="215" text-anchor="middle">  resource attribute ccxray.source="ccxray-proxy"</text>
+
+</svg>
+</div>
+
+<div class="key">
+<strong>為什麼不關 ccxray emit:</strong>(1) CLI 內建 OTel 只有 Anthropic,Codex / Gemini 沒有;(2) ccxray 看到 HTTP truth,跟 CLI 不同視角;(3) 兩邊 diff 是高價值警報(代表某一邊 pricing 算錯)。
+</div>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § CLI OTel coexistence and complement mode
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § Source resource attribute on every emit
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § Reconciliation diff metric
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § <code>ccxray.*</code> namespace for all emitted metrics
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="cardinality">7. Cardinality budget</h2>
+
+<p>每個 metric 宣告「允許哪些 attribute key」+「每個 key 最多幾個 unique value」。Key 不在 allow-list → 直接 drop(OTel View API)。Value 超 budget → 改記成 <code>_overflow_</code>,sentinel counter ++。</p>
+
+<div class="diagram">
+<svg viewBox="0 0 900 260" class="flow-svg" aria-label="cardinality budget">
+
+  <!-- Budget bar -->
+  <rect class="node-box accent" x="20" y="80" width="500" height="60"></rect>
+  <text class="node-label" x="270" y="105" text-anchor="middle" font-weight="600">ccxray.tool.invocations_total — tool budget: 50</text>
+
+  <!-- Filled portion -->
+  <rect x="30" y="115" width="220" height="18" fill="#a3be8c" rx="2" />
+  <rect x="250" y="115" width="260" height="18" fill="#232932" stroke="#2a313c" rx="2" />
+  <text class="node-sub" x="140" y="129" text-anchor="middle" font-size="11px" fill="#0f1419">23 used</text>
+  <text class="node-sub" x="380" y="129" text-anchor="middle" font-size="11px">27 remaining</text>
+
+  <!-- Incoming -->
+  <rect class="node-box" x="20" y="180" width="180" height="50"></rect>
+  <text class="node-sub" x="110" y="200" text-anchor="middle">incoming attribute</text>
+  <text class="node-sub" x="110" y="218" text-anchor="middle" font-family="monospace">tool="Bash"</text>
+
+  <text class="node-sub" x="240" y="208" font-size="20">→</text>
+
+  <rect class="node-box accent2" x="280" y="180" width="240" height="50"></rect>
+  <text class="node-sub" x="400" y="200" text-anchor="middle" fill="#a3be8c" font-weight="600">accepted as-is</text>
+  <text class="node-sub" x="400" y="218" text-anchor="middle">overflow_total: 0</text>
+
+  <!-- Overflow case -->
+  <rect class="node-box" x="560" y="80" width="320" height="60"></rect>
+  <text class="node-sub" x="720" y="100" text-anchor="middle">When 51st unique value arrives</text>
+  <text class="node-sub" x="720" y="120" text-anchor="middle" font-family="monospace" font-size="11px">tool="FancyNewToolThatNobodyKnows"</text>
+
+  <text class="node-sub" x="720" y="158" text-anchor="middle">↓</text>
+
+  <rect class="node-box warn" x="560" y="170" width="320" height="60"></rect>
+  <text class="node-sub" x="720" y="190" text-anchor="middle" fill="#ebcb8b" font-weight="600">recorded as tool="_overflow_"</text>
+  <text class="node-sub" x="720" y="208" text-anchor="middle">+ ccxray.metrics.overflow_total{</text>
+  <text class="node-sub" x="720" y="222" text-anchor="middle">  metric=..., attribute="tool" } ++</text>
+
+</svg>
+</div>
+
+<h3>不可當 metric label 的高基數欄位(per design.md § D4)</h3>
+
+<div class="warn-box">
+spec 沒有明確列出黑名單,但 design.md § D4 指出 <code>bash.command_pattern</code> 和 <code>file_path</code> <strong>明確 NOT 當 metric label</strong> 使用,避免基數爆炸。
+</div>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § Cardinality budget enforcement
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/design.md">design.md</a> § D4. Cardinality budget with overflow fallback
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="modules">8. 新檔案與模組關係</h2>
+
+<div class="diagram">
+<svg viewBox="0 0 940 500" class="flow-svg" aria-label="modules added and modified">
+  <defs>
+    <marker id="mod-arrow" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="7" markerHeight="7" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#88c0d0"></path>
+    </marker>
+    <marker id="mod-arrow-mute" viewBox="0 0 10 10" refX="9" refY="5" markerWidth="7" markerHeight="7" orient="auto">
+      <path d="M 0 0 L 10 5 L 0 10 z" fill="#8b95a5"></path>
+    </marker>
+  </defs>
+
+  <!-- ── group: New modules ── -->
+  <rect x="20" y="40" width="300" height="440" rx="8"
+        fill="rgba(163,190,140,0.05)" stroke="#a3be8c" stroke-width="1.5"></rect>
+  <text x="170" y="64" text-anchor="middle" font-weight="700" fill="#a3be8c" font-size="13px">新增模組</text>
+
+  <rect class="node-box accent2" x="40" y="82" width="260" height="50"></rect>
+  <text class="node-label" x="170" y="102" text-anchor="middle" font-weight="600" font-size="12.5px">server/config-loader.js</text>
+  <text class="node-sub" x="170" y="118" text-anchor="middle" font-size="10.5px">schema · ${VAR} · secrets · gitignore</text>
+
+  <rect class="node-box accent2" x="40" y="142" width="260" height="50"></rect>
+  <text class="node-label" x="170" y="162" text-anchor="middle" font-weight="600" font-size="12.5px">server/otel-health.js</text>
+  <text class="node-sub" x="170" y="178" text-anchor="middle" font-size="10.5px">state machine · queue · breaker · log</text>
+
+  <rect class="node-box accent2" x="40" y="202" width="260" height="50"></rect>
+  <text class="node-label" x="170" y="222" text-anchor="middle" font-weight="600" font-size="12.5px">server/otel.js</text>
+  <text class="node-sub" x="170" y="238" text-anchor="middle" font-size="10.5px">SDK init · registry · cardinality · source</text>
+
+  <rect class="node-box accent2" x="40" y="262" width="260" height="50"></rect>
+  <text class="node-label" x="170" y="282" text-anchor="middle" font-weight="600" font-size="12.5px">server/parsers/</text>
+  <text class="node-sub" x="170" y="298" text-anchor="middle" font-size="10.5px">*.schema.json + index.js</text>
+
+  <rect class="node-box accent2" x="40" y="322" width="260" height="50"></rect>
+  <text class="node-label" x="170" y="342" text-anchor="middle" font-weight="600" font-size="12.5px">test/fixtures/parser/</text>
+  <text class="node-sub" x="170" y="358" text-anchor="middle" font-size="10.5px">snapshot fixtures</text>
+
+  <rect class="node-box accent2" x="40" y="382" width="260" height="50"></rect>
+  <text class="node-label" x="170" y="402" text-anchor="middle" font-weight="600" font-size="12.5px">package.json</text>
+  <text class="node-sub" x="170" y="418" text-anchor="middle" font-size="10.5px">minimal OTel deps · lazy require</text>
+
+  <!-- ── group: Modified existing ── -->
+  <rect x="360" y="40" width="300" height="440" rx="8"
+        fill="rgba(136,192,208,0.05)" stroke="#88c0d0" stroke-width="1.5"></rect>
+  <text x="510" y="64" text-anchor="middle" font-weight="700" fill="#88c0d0" font-size="13px">被修改的既有檔案</text>
+
+  <rect class="node-box accent" x="380" y="82" width="260" height="50"></rect>
+  <text class="node-label" x="510" y="102" text-anchor="middle" font-weight="600" font-size="12.5px">server/forward.js</text>
+  <text class="node-sub" x="510" y="118" text-anchor="middle" font-size="10.5px">emit metrics on request complete</text>
+
+  <rect class="node-box accent" x="380" y="142" width="260" height="50"></rect>
+  <text class="node-label" x="510" y="162" text-anchor="middle" font-weight="600" font-size="12.5px">server/store.js</text>
+  <text class="node-sub" x="510" y="178" text-anchor="middle" font-size="10.5px">thin shim over parsers</text>
+
+  <rect class="node-box accent" x="380" y="202" width="260" height="50"></rect>
+  <text class="node-label" x="510" y="222" text-anchor="middle" font-weight="600" font-size="12.5px">server/system-prompt.js</text>
+  <text class="node-sub" x="510" y="238" text-anchor="middle" font-size="10.5px">skill marker via schema</text>
+
+  <rect class="node-box accent" x="380" y="262" width="260" height="50"></rect>
+  <text class="node-label" x="510" y="282" text-anchor="middle" font-weight="600" font-size="12.5px">server/hub.js</text>
+  <text class="node-sub" x="510" y="298" text-anchor="middle" font-size="10.5px">no business metrics (doc comment)</text>
+
+  <rect class="node-box accent" x="380" y="322" width="260" height="50"></rect>
+  <text class="node-label" x="510" y="342" text-anchor="middle" font-weight="600" font-size="12.5px">bin/ccxray.js</text>
+  <text class="node-sub" x="510" y="358" text-anchor="middle" font-size="10.5px">status --otel · otel preview · parser report</text>
+
+  <!-- ── group: Phase 2 (out of scope) ── -->
+  <rect x="700" y="40" width="220" height="210" rx="8"
+        fill="rgba(139,149,165,0.05)" stroke="#8b95a5" stroke-width="1.5" stroke-dasharray="6 4"></rect>
+  <text x="810" y="64" text-anchor="middle" font-weight="700" fill="#8b95a5" font-size="13px">Phase 2 follow-up</text>
+  <text x="810" y="80" text-anchor="middle" font-size="10.5px" fill="#8b95a5">(NOT in this change)</text>
+
+  <rect class="node-box" x="720" y="98" width="180" height="60" stroke="#8b95a5"></rect>
+  <text class="node-label" x="810" y="120" text-anchor="middle" font-weight="600" font-size="12.5px" fill="#8b95a5">span emit (traces)</text>
+  <text class="node-sub" x="810" y="138" text-anchor="middle" font-size="10.5px">ccxray.entry_id,</text>
+  <text class="node-sub" x="810" y="151" text-anchor="middle" font-size="10.5px">dashboard_url</text>
+
+  <rect class="node-box" x="720" y="170" width="180" height="60" stroke="#8b95a5"></rect>
+  <text class="node-label" x="810" y="192" text-anchor="middle" font-weight="600" font-size="12.5px" fill="#8b95a5">/entry/:id route</text>
+  <text class="node-sub" x="810" y="210" text-anchor="middle" font-size="10.5px">deep-link drill-back UI</text>
+
+  <!-- ── arrows: new → existing ── -->
+  <!-- otel.js (y=227) → forward.js (y=107) -->
+  <path class="edge accent" d="M 300,220 C 340,220 340,107 380,107" marker-end="url(#mod-arrow)" />
+  <!-- otel.js (y=227) → store.js (y=167) -->
+  <path class="edge accent" d="M 300,227 C 340,227 340,167 380,167" marker-end="url(#mod-arrow)" />
+  <!-- otel.js → ccxray.js (y=347) -->
+  <path class="edge accent dashed" d="M 300,234 C 340,234 340,347 380,347" marker-end="url(#mod-arrow)" />
+  <!-- config-loader (y=107) → ccxray.js (y=347) -->
+  <path class="edge accent dashed" d="M 300,107 C 340,107 340,347 380,347" marker-end="url(#mod-arrow)" />
+  <!-- otel-health (y=167) → ccxray.js (y=347) -->
+  <path class="edge accent dashed" d="M 300,167 C 340,167 340,347 380,347" marker-end="url(#mod-arrow)" />
+  <!-- parsers (y=287) → store.js (y=167) -->
+  <path class="edge accent" d="M 300,280 C 340,280 340,167 380,167" marker-end="url(#mod-arrow)" />
+  <!-- parsers → system-prompt.js (y=227) -->
+  <path class="edge accent" d="M 300,287 C 340,287 340,227 380,227" marker-end="url(#mod-arrow)" />
+  <!-- package.json → otel.js (within same column, drawn as a thin loop) -->
+  <path class="edge" d="M 170,382 C 110,360 110,260 130,228" marker-end="url(#mod-arrow-mute)" stroke-dasharray="3 3" />
+  <text class="node-sub" x="65" y="305" font-size="10px" fill="#8b95a5">deps</text>
+
+  <!-- annotation: parsers also feed fixtures (same column) -->
+  <text class="node-sub" x="170" y="374" font-size="10px" fill="#a3be8c" text-anchor="middle">↑ snapshot tests run against parsers/</text>
+
+</svg>
+</div>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/proposal.md">proposal.md</a> § Impact
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/tasks.md">tasks.md</a> § Tasks 2 / 3 / 4 / 5 / 6 / 7 / 8 / 9
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="scope">9. Phase 1 範圍 vs Phase 2</h2>
+
+<div class="scope-grid">
+
+  <div class="card scope-in">
+    <h4>✓ Phase 1 — 本 change 範圍</h4>
+    <ul>
+      <li>Metrics emit(<code>ccxray.*</code> namespace)</li>
+      <li>三層 tier opt-in(預設 OFF)</li>
+      <li><code>.ccxray.json</code> / <code>.ccxray.user.json</code> + <code>${VAR}</code> 插值</li>
+      <li>OTel health(state machine、queue、breaker)</li>
+      <li>Parser schemas + sentinels + reconciliation</li>
+      <li>CLI 共存偵測 + reconciliation diff metric</li>
+      <li><code>ccxray status --otel</code> / <code>otel preview</code> / <code>parser report</code></li>
+      <li>啟動 banner、secrets masking</li>
+    </ul>
+  </div>
+
+  <div class="card scope-out">
+    <h4>✗ Phase 2 follow-up</h4>
+    <ul>
+      <li>Span emit(traces)</li>
+      <li><code>ccxray.entry_id</code> / <code>dashboard_url</code> attributes</li>
+      <li><code>/entry/:id</code> deep-link route</li>
+      <li><code>ccxray.hub.*</code> 運維 metrics(open question)</li>
+      <li><code>--otel-demo</code> Docker Compose helper(open question)</li>
+    </ul>
+  </div>
+
+</div>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/proposal.md">proposal.md</a> § What Changes (last bullet: Out of scope)
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/design.md">design.md</a> § Non-Goals
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/design.md">design.md</a> § Open Questions
+  </span>
+</div>
+
+<!-- ───────────────────────────────────────────── -->
+<h2 id="metrics">10. 完整 metric 清單</h2>
+
+<p>所有 metric 都在 <code>ccxray.*</code> namespace,每筆 emit 帶 resource attribute <code>ccxray.source="ccxray-proxy"</code>。Complement mode 時額外帶 <code>ccxray.cli_otel_active=true</code>。</p>
+
+<table>
+  <tr><th>家族</th><th>Metric</th><th>Attributes</th></tr>
+  <tr>
+    <td rowspan="6" style="color: var(--accent)">Cost</td>
+    <td><code>ccxray.tokens.input_total</code></td><td>model, provider <sup>*</sup></td>
+  </tr>
+  <tr><td><code>ccxray.tokens.output_total</code></td><td>model, provider <sup>*</sup></td></tr>
+  <tr><td><code>ccxray.tokens.cache_read_total</code></td><td>model, provider <sup>*</sup></td></tr>
+  <tr><td><code>ccxray.tokens.cache_creation_total</code></td><td>model, provider <sup>*</sup></td></tr>
+  <tr><td><code>ccxray.cost.usd_total</code></td><td>model, provider <sup>*</sup></td></tr>
+  <tr><td><code>ccxray.cache.hit_ratio</code>(gauge)</td><td>model, provider <sup>*</sup></td></tr>
+
+  <tr>
+    <td rowspan="5" style="color: var(--accent-2)">Usage</td>
+    <td><code>ccxray.tool.invocations_total</code></td><td>tool, provider</td>
+  </tr>
+  <tr><td><code>ccxray.mcp.invocations_total</code></td><td>server, tool</td></tr>
+  <tr><td><code>ccxray.skill.activations_total</code></td><td>skill, provider</td></tr>
+  <tr><td><code>ccxray.sessions_total</code></td><td>provider</td></tr>
+  <tr><td><code>ccxray.agent_type.invocations_total</code></td><td>type</td></tr>
+
+  <tr>
+    <td rowspan="4" style="color: var(--warn)">Quality</td>
+    <td><code>ccxray.errors_total</code></td><td>type, provider</td>
+  </tr>
+  <tr><td><code>ccxray.stop_reason_total</code></td><td>reason</td></tr>
+  <tr><td><code>ccxray.latency_ms</code>(histogram)</td><td>model, provider</td></tr>
+  <tr><td><code>ccxray.max_tokens_hit_total</code></td><td>model</td></tr>
+
+  <tr>
+    <td rowspan="4" style="color: var(--purple, #b48ead)">Patterns</td>
+    <td><code>ccxray.context.utilization_pct</code>(histogram)</td><td></td>
+  </tr>
+  <tr><td><code>ccxray.auto_compact.triggered_total</code></td><td></td></tr>
+  <tr><td><code>ccxray.subagent.invocations_total</code></td><td></td></tr>
+  <tr><td><code>ccxray.tools_per_turn</code>(histogram)</td><td></td></tr>
+
+  <tr>
+    <td rowspan="4" style="color: var(--muted)">Governance</td>
+    <td><code>ccxray.permission_mode.usage_total</code></td><td>mode</td>
+  </tr>
+  <tr><td><code>ccxray.dangerous_tool.invocations_total</code></td><td>pattern</td></tr>
+  <tr><td><code>ccxray.file_writes_total</code></td><td></td></tr>
+  <tr><td><code>ccxray.provider.distribution_total</code></td><td>provider</td></tr>
+
+  <tr>
+    <td rowspan="9" style="color: var(--danger)">Sentinels</td>
+    <td><code>ccxray.metrics.overflow_total</code></td><td>metric, attribute</td>
+  </tr>
+  <tr><td><code>ccxray.parser.unknown_tool_total</code></td><td>provider</td></tr>
+  <tr><td><code>ccxray.parser.unknown_skill_marker_total</code></td><td>provider</td></tr>
+  <tr><td><code>ccxray.parser.unknown_mcp_format_total</code></td><td></td></tr>
+  <tr><td><code>ccxray.parser.fallback_used_total</code></td><td>parser, reason</td></tr>
+  <tr><td><code>ccxray.parser.reconciliation_mismatch_total</code></td><td>type</td></tr>
+  <tr><td><code>ccxray.parser.error_total</code></td><td>parser, error_type</td></tr>
+  <tr><td><code>ccxray.otel.exports_dropped_total</code></td><td>signal</td></tr>
+  <tr><td><code>ccxray.otel.state</code>(gauge)</td><td>state</td></tr>
+
+  <tr>
+    <td style="color: var(--accent)">CLI 對帳</td>
+    <td><code>ccxray.reconciliation.token_diff_pct</code>(gauge)</td><td>model</td>
+  </tr>
+
+  <tr>
+    <td style="color: var(--accent)">Tier 觀測</td>
+    <td><code>ccxray.otel.tier_distribution</code></td><td>tier</td>
+  </tr>
+</table>
+
+<p class="muted" style="font-size: 12px;">
+<sup>*</sup> Cost 系列的 attribute 並未在 <code>specs/otel-export/spec.md</code> § Required metric families 內逐一列出(spec 只在 Usage / Quality 部分內嵌標明)。本表依照實作時的常見維度(<code>model</code>、<code>provider</code>)預先填入,實際 attribute 註冊清單以 <code>server/otel.js</code> 的 metric registry 為準(Tasks § 4.5)。
+</p>
+
+<div class="src">
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § Required metric families
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § Cardinality budget enforcement
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md">specs/otel-export/spec.md</a> § Reconciliation diff metric
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/parser-schemas/spec.md">specs/parser-schemas/spec.md</a> § Sentinel counters / Reconciliation invariants / Parser error isolation
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md">specs/otel-health/spec.md</a> § Bounded export queue / Health state observable
+  </span>
+  <span class="src-row">Source:
+    <a href="../openspec/changes/add-otel-metrics-phase1/specs/otel-tiers/spec.md">specs/otel-tiers/spec.md</a> § Tier distribution sentinel
+  </span>
+</div>
+
+<p class="muted" style="margin-top: 56px; border-top: 1px solid var(--border); padding-top: 16px;">
+本檔案位於 <code>docs/otel-phase1-overview.html</code>,內容嚴格依據 <code>openspec/changes/add-otel-metrics-phase1/</code> 的 proposal / design / specs / tasks 文件。所有宣稱皆有出處連結;若你發現任何視覺與 spec 不一致,請以 spec 為準並回報。
+</p>
+
+</div>
+
+</body>
+</html>
diff --git a/openspec/changes/add-otel-metrics-phase1/.openspec.yaml b/openspec/changes/add-otel-metrics-phase1/.openspec.yaml
new file mode 100644
index 0000000..40cc12f
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/.openspec.yaml
@@ -0,0 +1,2 @@
+schema: spec-driven
+created: 2026-05-12
diff --git a/openspec/changes/add-otel-metrics-phase1/design.md b/openspec/changes/add-otel-metrics-phase1/design.md
new file mode 100644
index 0000000..0641cdc
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/design.md
@@ -0,0 +1,166 @@
+## Context
+
+ccxray currently emits no telemetry to external systems. All observation happens via the local dashboard reading from `~/.ccxray/logs/`. Adding OpenTelemetry export changes ccxray's blast radius — data starts leaving the user's machine — and intersects with three sensitive design surfaces:
+
+1. **Privacy.** Engineers run ccxray in their own dev environment. Any telemetry that identifies them by default would break that contract.
+2. **Trust with managers.** Aggregated metrics are genuinely useful for engineering leaders, but a feature that lets a manager track individual tool usage will trigger a backlash that kills adoption.
+3. **Provider neutrality.** Claude Code's CLI has built-in OTel for Anthropic; Codex/Gemini have none. ccxray must coexist with the CLI without double-counting, and must remain the only telemetry source for non-Anthropic providers.
+
+Before drafting this design, an 11-risk pre-mortem was completed and recorded in `docs/otel-integration.html`. Every accepted solution scored ≥ 9/10 on weighted criteria including verification mechanisms. The design below is the synthesis of those solutions.
+
+## Goals / Non-Goals
+
+**Goals:**
+
+- Provide ccxray-emitted OTel metrics covering cost, usage (tool/MCP/skill), quality (errors/latency/cache), patterns (context/subagent), and governance.
+- Default OFF. Zero telemetry until the user explicitly opts in per-project.
+- Three-tier opt-in (disabled / project-anonymous / personal-named) where the project sets an upper bound and personal config can only equal-or-downgrade.
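+- Coexist with Claude Code CLI's built-in OTel without overlap, emitting faithful per-request signals and ccxray-internal invariant metrics (`ccxray.invariants.*`) so that accounting bugs on either side can be surfaced by downstream reconciliation (see D3).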
+- Never let OTel failure break the proxy. Config errors fail at startup, init errors degrade silently, runtime errors are absorbed by a bounded queue + circuit breaker.
+- Make parser drift visible. Unknown tools / skills / MCP markers must increment a sentinel counter rather than silently turn into zero.
+- Provide introspection: `ccxray status --otel`, `ccxray otel preview` (dry-run), `ccxray parser report`.
+
+**Non-Goals:**
+
+- **Traces / spans.** Phase 1 emits metrics only. Spans, `entry_id` deep-link attributes, and `/entry/:id` drill-back UI are Phase 2.
+- **Full payload export.** Request/response bodies never leave the machine. If a future user wants this, it belongs in a separate "ccxray log → S3 / self-hosted backend" product, not in the OTel pipeline.
+- **Synthetic tool span timing.** Tool execution durations inferred from HTTP cadence would be misleading; the CLI emits accurate timing for Anthropic, and we will not compete with inaccurate data.
+- **Central ccxray hub for team-wide aggregation.** Each engineer's ccxray remains local. Cross-machine correlation, if needed, is a Phase 2+ discussion.
+- **Auto-instrumentation.** We will not pull in `@opentelemetry/auto-instrumentations-node`. ccxray controls every emit point explicitly to keep the dependency footprint and behavior predictable.
+
+## Decisions
+
+### D1. Default OFF with three-tier opt-in
+
+Three tier values:
+
+- **tier 0 (disabled)** — no OTel SDK initialization, no network egress. Default behavior when no config file or env override exists.
+- **tier 1 (project anonymous)** — metrics emit with project-level attributes (`project.name`, optional `team`) but no individual identity. Activated by `.ccxray.json` checked into the repo.
+- **tier 2 (personal named)** — adds `enduser.id` (a self-chosen string, not necessarily real name) to allow individual ccxray usage analytics. Activated by `.ccxray.user.json` in the working directory, which is gitignored.
+
+Resolution rule: `effective_tier = min(project_tier, personal_tier)`. Project config is the upper bound; personal config can only equal or downgrade. An engineer can always set tier 0 in personal config to opt out of project-level emit on their own machine.
+
+**Alternatives considered:**
+
+- *Always-on anonymous* — rejected. "Anonymous" telemetry has well-documented re-identification risks; defaulting to ON breaks the implicit trust contract.
+- *Cookie-style consent prompt at startup* — rejected. Prompt fatigue leads to blanket yes; one-time `opt_in_acknowledged_at` timestamp in personal config achieves the same intent without nagging.
+- *k-anonymity at the backend* — rejected. ccxray does not control the backend; small teams (k < 5) cannot rely on this guarantee.
+
+### D2. Client-side emit, not hub-side
+
+OTel SDK initialization and metric emission happen in the client process (the one that ran `ccxray claude`). The hub remains a pure HTTP proxy plus SSE broadcaster. The hub MAY emit its own operational metrics under `ccxray.hub.*` namespace using a separate config (`~/.ccxray/hub-config.json`), but it does NOT emit business metrics on behalf of clients.
+
+This means different projects connecting to the same hub can configure different tiers, endpoints, and `OTEL_RESOURCE_ATTRIBUTES` without interfering with each other.
+
+**Alternatives considered:**
+
+- *Hub-side emit with per-client config fanout* — rejected. Adds a routing/fan-out concern to the hub with no clear value; the hub would need to track which metrics belong to which client config.
+- *Hub-only emit, ignore per-project differences* — rejected. Conflicts with D1 and forces every project on a host to share one OTel destination.
+
+### D3. `ccxray.*` namespace, never mirror `claude_code.*`
+
+Every metric uses `ccxray.<system>.<aspect>` (`ccxray.tokens.input_total`, `ccxray.tool.invocations_total`, etc.). Every emit carries the resource attribute `ccxray.source="ccxray-proxy"`. When the CLI's `CLAUDE_CODE_ENABLE_TELEMETRY=1` is detected, ccxray enters "complement mode" and adds `ccxray.cli_otel_active=true` to its emits, plus a startup notice explaining how to choose between the two metric families.
+
+**Cross-source reconciliation: pivoted to downstream.** An earlier version of this design proposed emitting `ccxray.reconciliation.token_diff_pct{model}` as a gauge. After expert review (Sigelman / Majors / Sridharan), Phase 1 drops the in-proxy diff gauge for these reasons: (1) the gauge is pre-aggregated and cannot answer "which request diverged"; (2) the diff is rarely zero for legitimate reasons (SSE chunking, retries, prompt-cache edge cases) → alert fatigue; (3) acquiring the CLI's counts in-process requires either querying the user's storage backend (couples ccxray to Prom/OTLP dialects) or embedding an OTLP receiver (turns ccxray from proxy into telemetry product, violating instrumentation neutrality and expanding blast radius). Instead, ccxray emits faithful per-request signals and ccxray-internal invariant metrics (`ccxray.invariants.*`); cross-source reconciliation against the CLI is a downstream concern — see `docs/otel-recon.md` for recording-rule / sidecar / wide-event join recipes. A `--debug-reconcile` ad-hoc flag may be reconsidered in a later phase.
+
+**Alternatives considered:**
+
+- *Auto-disable ccxray emit when CLI is active* — rejected. Loses the reconciliation signal and forfeits ccxray's Codex/Gemini advantage.
+- *Same metric names, different resource* — rejected. Backends commonly aggregate by metric name first; using the same names would force users to filter by resource attribute on every panel.
+
+### D4. Cardinality budget with overflow fallback
+
+Every metric declares an allow-list of attribute keys and a per-key cardinality budget (e.g. `tool=50`, `model=10`, `mcp_server=30`). Attribute values are tracked in a `Set` per (metric, attribute); when the Set reaches budget size, subsequent unique values are recorded as the literal string `_overflow_` and a sentinel counter `ccxray.metrics.overflow_total{metric,attribute}` increments.
+
+Attribute keys not in the allow-list are dropped at the View API layer (OTel SDK native enforcement). High-cardinality candidates that look attractive (`bash.command_pattern`, `file_path`) are explicitly NOT emitted as metric labels.
+
+**Alternatives considered:**
+
+- *Trust the backend to handle cardinality* — rejected. Free-tier Grafana Cloud, open-source Prometheus, and many enterprise backends impose hard limits that result in dropped series or account-level throttling.
+- *Silent drop on overflow* — rejected. Violates the "no silent failure" principle.
+
+### D5. Failure isolation via state machine + bounded queue + circuit breaker
+
+`server/otel-health.js` owns a state machine with four states:
+
+- `disabled` — OTel never initialized (tier 0 or no config).
+- `active` — SDK initialized, exports succeeding.
+- `degraded` — SDK init failed; ccxray continues without OTel; status command shows the error.
+- `circuit_open` — runtime export failures triggered the circuit breaker; periodic half-open retries.
+
+The export queue is bounded (default 2048 entries, configurable). On overflow, oldest entries are dropped and `ccxray.otel.exports_dropped_total{signal}` increments locally (network is presumed unreachable when the queue overflows).
+
+Circuit breaker: 5 consecutive failures → `circuit_open` for 60s → half-open probe (a phase within `circuit_open`, not a fifth state) → success returns to `active`, failure backs off (60 → 120 → 240 → 600s max).
+
+**Alternatives considered:**
+
+- *Unbounded queue with retries* — rejected. OOMs ccxray when the collector is down.
+- *Fail-fast on first error* — rejected. Transient errors are common; one timeout should not disable telemetry for the rest of the session.
+
+### D6. Config: `.ccxray.json` + `.ccxray.user.json` with `${ENV_VAR}` interpolation
+
+Two-file config:
+
+- `.ccxray.json` — project root, checked into git, sets tier upper bound and shared settings (endpoint, headers, resource attributes).
+- `.ccxray.user.json` — project root or `$HOME`, gitignored, sets personal identity and overrides (only ever equal-or-downgrade vs project config).
+
+Both files support `${ENV_VAR}` interpolation in string values. The schema validator rejects any string that looks like a literal secret (`Bearer [A-Za-z0-9]{20,}`, `sk_live_*`, `ghp_*`, JWT structure) when not wrapped in `${...}`. First-time generation auto-amends `.gitignore` to include `.ccxray.user.json`.
+
+Config errors (syntax, schema, unresolved `${VAR}`) fail at startup with a clear error pointing to the offending line. Init errors (bad endpoint format) transition to `degraded`. Runtime errors (collector down) transition to `circuit_open`.
+
+**Alternatives considered:**
+
+- *Single file with comments marking secrets* — rejected. JSON has no comments and the convention is too fragile.
+- *Pure env-var configuration* — rejected. Loses per-project granularity; same shell environment cannot easily switch contexts when working across multiple repos.
+
+### D7. Parser schema-ization with sentinel counters
+
+Tool / MCP / skill / agent-type detection moves from inline strings in `system-prompt.js` / `store.js` / `helpers.js` to versioned JSON schemas under `server/parsers/`. Each schema declares the patterns it recognizes and carries a `last_verified_against` date.
+
+For every entry processed, parsers emit:
+
+- The recognized metrics (tool invocations, skill activations, etc.).
+- `ccxray.parser.unknown_*_total{provider}` counters when a token/marker is seen but not recognized.
+- `ccxray.parser.reconciliation_mismatch_total{type}` when invariants fail (e.g. count of `tool_use` blocks in response ≠ count of tools extracted by parser).
+
+Parsers are wrapped in try/catch; on exception, `ccxray.parser.error_total{parser}` increments and the entry continues to be written to local logs (degraded OTel, never blocked proxy).
+
+Snapshot fixtures under `test/fixtures/parser/` lock current behavior; changes require committing new snapshots and passing review.
+
+**Alternatives considered:**
+
+- *Keep inline parsing* — rejected. Already fragile (silent dependence on Claude Code's evolving prompt format) and cannot detect drift.
+- *Server-side parser updates via remote schema fetch* — rejected. Adds a new failure surface and security concern.
+
+### D8. CLI surface: `status --otel`, `otel preview`, `parser report`
+
+- `ccxray status --otel` — current tier, endpoint, OTel state, cardinality usage (e.g. `tool: 23/50`), dropped event counters, circuit breaker state.
+- `ccxray otel preview` — dry-run printing the next export's content without sending. Lets users see exactly what would be exported before enabling.
+- `ccxray parser report` — last 7 days of unknown tool / skill / MCP markers grouped by frequency; generates a GitHub issue template body for drift reports.
+
+Startup banner declares the active tier and (if applicable) complement-mode coexistence with CLI OTel.
+
+## Risks / Trade-offs
+
+- **Risk: Adoption stalls because individual devs do not have an OTel backend.** → Ship `ccxray --otel-demo` that spins up a local Grafana + Prometheus via Docker Compose so a developer can see their own metrics in 30 seconds without joining any external service. Set a 3-month KPI gate: < 10 GitHub references → pause Phase 2 investment.
+- **Risk: Manager misuse for individual surveillance.** → Default OFF + tier 2 requires personal opt-in by the engineer + explicit `docs/otel-ethics.md` distributed as part of the change ("these metrics are not for individual performance evaluation; the reasons follow…"). Track `ccxray.otel.tier_distribution`: if tier 2 share is < 5%, strengthen the docs.
+- **Risk: Cardinality explosion despite budgets.** → Budgets enforced at SDK View API layer with sentinel counter for overflow visibility. CI lint blocks new metrics that lack a schema entry. `ccxray.metrics.overflow_total > 0` for sustained periods triggers an in-status warning.
+- **Risk: Bundle bloat from OTel SDK.** → Import only `@opentelemetry/api`, `@opentelemetry/sdk-metrics`, `@opentelemetry/exporter-metrics-otlp-http`, `@opentelemetry/resources`. No auto-instrumentations. Optional dependency pattern so the package still resolves when OTel deps are absent (lazy require).
+- **Risk: Hub-mode env changes don't propagate.** → Business OTel is client-side (D2); hub env only affects `ccxray.hub.*` operational metrics. `ccxray status` displays per-client tier/endpoint so users can see whether each client has picked up the env they expected.
+- **Risk: Parser drift when Anthropic changes the prompt format.** → Sentinel counters (`ccxray.parser.unknown_*_total`) make drift visible within hours instead of months; `last_verified_against` dates trigger quarterly re-verification; `ccxray parser report` makes drift reports easy to file.
+- **Risk: OTel semconv conventions evolve and our attribute names become out of date.** → All metric names live in the schema registry under `server/otel.js`; a future migration is a search-and-replace plus a deprecation period.
+- **Trade-off: We do not compete with the CLI on Anthropic tool span timing.** → Acceptable. Our value is the HTTP-layer truth, Codex/Gemini coverage, the per-request signals and invariant metrics that enable downstream reconciliation (D3), and the future Phase 2 drill-back.
+
+## Migration Plan
+
+- **Forward.** Phase 1 ships behind opt-in defaults; existing ccxray users see no behavior change. Adopters add a `.ccxray.json`, set an endpoint, and confirm with `ccxray otel preview` before traffic flows. The `ccxray --otel-demo` flag provides a zero-config local Grafana for evaluation.
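+- **Rollback.** Because export is opt-in, an adopter rolls back by setting tier 0 or removing `.ccxray.json`; without an enabling config the OTel SDK is never initialized (D1). Metric names are harder to undo: each `ccxray.*` metric is a contract, and once shipped a metric cannot be renamed without a deprecation cycle. The schema registry tracks every metric with its introduction version.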
+- **Phase 2 prerequisites.** Shared modules introduced here (`otel-health.js`, `config-loader.js`, parser schemas, sentinel framework, status surface) are designed to host Phase 2's span emit and `/entry/:id` route without rework.
+
+## Open Questions
+
+- Should `.ccxray.json` lookup walk up from cwd to the nearest enclosing dir (monorepo-friendly), or only check cwd? Recommendation: walk up to nearest git root, take the first match.
+- Should we ship `--otel-demo` Docker Compose files in this PR or as a follow-up doc? Recommendation: follow-up, to keep Phase 1 scope tight.
+- Should `ccxray.hub.*` operational metrics ship in Phase 1 or be deferred? Recommendation: defer to keep this change focused on the client side.
+- For the auto-update of `.gitignore`, should the user be prompted or should it be automatic? Recommendation: prompt the first time, with a `--yes` flag for automation.
+- Should `ccxray --otel-demo` be a documented dev tool only, or a supported feature? Recommendation: dev tool only (clearly labeled experimental).
diff --git a/openspec/changes/add-otel-metrics-phase1/proposal.md b/openspec/changes/add-otel-metrics-phase1/proposal.md
new file mode 100644
index 0000000..c490321
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/proposal.md
@@ -0,0 +1,47 @@
+## Why
+
+ccxray captures everything an agent does at the HTTP layer — full request/response, token counts, cost, tool calls, MCP server activity, skill activations — but the data lives only in the local dashboard. Teams that already operate Grafana / Datadog / Honeycomb cannot aggregate ccxray's signals into their existing observability pipeline. Claude Code's CLI has built-in OTel for Anthropic only and does not expose the HTTP-layer truth ccxray sees; Codex, Gemini, and future providers have no OTel at all. The full design rationale, pre-mortem (11 risks, with every accepted solution scored ≥ 9/10) and alternative options live at `docs/otel-integration.html`.
+
+This change adds Phase 1: emit ccxray's metrics over OTLP, gated behind a default-off tiered opt-in, with a failure model that never degrades the proxy. Phase 2 (metadata-only traces with `entry_id` drill-back) is a follow-up.
+
+## What Changes
+
+- New optional metric export under `ccxray.*` namespace covering cost, usage (tool / MCP / skill / agent_type / provider), quality (errors, stop_reason, latency, max_tokens_hit_rate), patterns (context_utilization, auto_compact_triggered, subagent_ratio, tools_per_turn) and governance (permission_mode, dangerous_tool, file_writes).
+- New configuration files: `.ccxray.json` (repo, project-level) and `.ccxray.user.json` (gitignored, personal). `${ENV_VAR}` interpolation. Schema rejects literal-looking secrets. Auto-add `.ccxray.user.json` to `.gitignore` if missing.
+- Three-tier opt-in model: **tier 0 disabled (default)** / tier 1 anonymous project-level / tier 2 personal named. Project config is the upper bound; personal config can only equal or downgrade. Engineers can opt out unilaterally.
+- Detect `CLAUDE_CODE_ENABLE_TELEMETRY=1` and enter "complement mode" with `ccxray.cli_otel_active=true` attribute; every metric carries `ccxray.source="ccxray-proxy"` resource attribute. ccxray emits ccxray-internal invariant metrics (`ccxray.invariants.*`); cross-source reconciliation against the CLI is documented as a downstream pattern (recording rules / sidecar / wide-event join on `request_id`) in `docs/otel-recon.md`, not as an in-proxy gauge — keeps ccxray as a transparent proxy with bounded blast radius.
+- Cardinality budget per (metric, attribute) with `_overflow_` fallback and `ccxray.metrics.overflow_total` sentinel; attribute key allow-list enforced via OTel View API.
+- Parser schema-ization: extract tool / MCP / skill detection into `server/parsers/*.schema.json` with snapshot fixtures, sentinel metrics (`ccxray.parser.unknown_*_total`), and reconciliation invariants (tool_use block count must equal extracted count).
+- Failure fallback: config errors fail fast at startup; init errors degrade silently (ccxray keeps proxying); runtime errors handled by bounded queue (drop oldest) + circuit breaker (5 failures → open 60s → exponential backoff). OTel failures **never** break the proxy.
+- New shared modules: `server/otel-health.js` (state machine, circuit breaker, bounded queue, local log writer) and `server/config-loader.js` (JSON schema validation, env interpolation, secret detection, gitignore check).
+- OTel emit lives in the **client** process, not the hub. Each project's tier/endpoint coexists on the same hub. Hub gains its own operational metrics under `ccxray.hub.*` namespace.
+- New CLI commands: `ccxray status --otel` (current tier, endpoint, health, cardinality usage), `ccxray otel preview` (dry-run printing the next export's content), `ccxray parser report` (recent unknown events for drift detection).
+- Out of scope (Phase 2 follow-up): span emit (traces), `/entry/:id` deep-link route, `ccxray.entry_id` / `dashboard_url` attributes.
+
+## Capabilities
+
+### New Capabilities
+
+- `otel-config`: `.ccxray.json` and `.ccxray.user.json` schema, `${ENV_VAR}` interpolation, literal-secret rejection, `.gitignore` auto-amend, project-upper-bound + personal-lower-bound merging rules.
+- `otel-export`: OTel SDK initialization (client-side, not hub), metric definitions under `ccxray.*` namespace, `ccxray.source` resource attribute, cardinality budget enforcement with `_overflow_` fallback, CLI coexistence detection and complement-mode signaling, ccxray-internal invariant metrics, explicit non-emit of cross-source diff gauge (deferred to downstream).
+- `otel-tiers`: three-tier opt-in (disabled / project-anonymous / personal-named), tier resolution with project as upper bound and personal as lower bound, `enduser.id` attribute only in tier 2, opt-in acknowledgment timestamp persisted in personal config.
+- `otel-health`: failure state machine (`disabled / active / degraded / circuit_open`), bounded export queue with drop-oldest semantics, circuit breaker with exponential backoff, local failure log at `~/.ccxray/otel.log` with rotation, never-block guarantee for the proxy path.
+- `parser-schemas`: extract skill / MCP / tool / agent-type detection into versioned JSON schemas, snapshot fixtures per provider (Anthropic + Codex), sentinel metrics for unknown events, reconciliation invariants run per entry, try/catch isolation so parser failure does not affect ccxray core.
+- `otel-introspection`: `ccxray status --otel` view (tier, endpoint, health, cardinality, dropped counts), `ccxray otel preview` dry-run, `ccxray parser report` for drift inspection, startup banner declaring active tier and CLI coexistence mode.
+
+### Modified Capabilities
+
+(None — Phase 1 is additive. Existing capabilities are not changed.)
+
+## Impact
+
+- New `server/otel.js`, `server/otel-health.js`, `server/config-loader.js`, `server/parsers/` directory tree (schemas + fixtures + unknown-handler).
+- `server/forward.js` — emit metric on request completion (counters + histograms) via the otel-health-guarded queue; no behavior change when OTel is disabled.
+- `server/store.js` — session / tool / skill / MCP / agent_type detection becomes a thin shim over `server/parsers/*`; reconciliation invariants run per entry; sentinel counters incremented on unknown.
+- `server/system-prompt.js` — agent-type and skill marker detection moves into `parsers/anthropic-skills.schema.json`; existing parsing behavior preserved.
+- `server/hub.js` — hub gains optional `ccxray.hub.*` operational metrics (uptime, request rate, connected clients) under its own config in `~/.ccxray/hub-config.json`. Hub does NOT emit business metrics; those stay client-side.
+- `server/routes/api.js` — no new HTTP routes in Phase 1 (deep-link route is Phase 2).
+- `bin/ccxray.js` or equivalent CLI entry — new subcommands: `status --otel`, `otel preview`, `parser report`. Existing commands unaffected when OTel is disabled.
+- `package.json` — add minimal OTel dependencies (`@opentelemetry/api`, `@opentelemetry/sdk-metrics`, `@opentelemetry/exporter-metrics-otlp-http`, `@opentelemetry/resources`). No auto-instrumentations. Optional dependency pattern so the package still works if OTel is not installed.
+- New docs: `docs/otel-integration.html` (already exists, decision record), `docs/otel-ethics.md` (why these metrics are not for individual performance evaluation), `docs/otel-quickstart.md` (90-second Grafana onboarding).
+- Tests: parser snapshot fixtures, cardinality budget enforcement tests, tier resolution matrix tests, failure-mode tests (collector down, bad endpoint, bad auth, malformed config).
diff --git a/openspec/changes/add-otel-metrics-phase1/specs/otel-config/spec.md b/openspec/changes/add-otel-metrics-phase1/specs/otel-config/spec.md
new file mode 100644
index 0000000..b30283a
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/specs/otel-config/spec.md
@@ -0,0 +1,90 @@
+## ADDED Requirements
+
+### Requirement: Project and personal config files
+
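+ccxray SHALL read two optional configuration files at startup: `.ccxray.json` (project-level, repo-checked-in) and `.ccxray.user.json` (personal-level, gitignored). Both files use JSON. A missing `.ccxray.json` SHALL be treated as project tier 0 (disabled). A missing `.ccxray.user.json` SHALL NOT disable export on its own: the project config applies with project-level attributes only.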
+
+#### Scenario: No config present
+
+- **WHEN** ccxray starts in a directory with neither `.ccxray.json` nor `.ccxray.user.json`
+- **THEN** OTel SDK SHALL NOT initialize and no network egress SHALL occur
+
+#### Scenario: Project config present, no personal config
+
+- **WHEN** ccxray starts in a directory with `.ccxray.json` that enables tier 1
+- **THEN** OTel SDK SHALL initialize at tier 1 with project-level attributes only
+
+#### Scenario: Both project and personal config present
+
+- **WHEN** project config sets tier 2 and personal config sets tier 2 with `enduser.id`
+- **THEN** the effective tier SHALL be tier 2 and `enduser.id` SHALL be attached to emitted metrics
+
+### Requirement: Tier resolution as upper bound and lower bound
+
+The effective tier SHALL be `min(project_tier, personal_tier)` so that the project config is an upper bound and personal config can only equal-or-downgrade. An engineer SHALL be able to unilaterally opt out by setting tier 0 in personal config.
+
+#### Scenario: Personal config downgrades from project
+
+- **WHEN** project config enables tier 1 and personal config explicitly sets tier 0
+- **THEN** no OTel emission SHALL occur for this engineer
+
+#### Scenario: Personal config cannot exceed project
+
+- **WHEN** project config enables tier 1 and personal config sets tier 2
+- **THEN** the effective tier SHALL be clamped to tier 1 (the project upper bound), `enduser.id` SHALL NOT be attached, and ccxray SHALL emit a warning
+
+### Requirement: Environment variable interpolation
+
+All string values in config files SHALL support `${VAR}` interpolation, resolved at load time from `process.env`. Unresolved variables SHALL cause startup failure with a clear error message naming the missing variable.
+
+#### Scenario: Header value uses env var
+
+- **WHEN** config contains `"Authorization": "Bearer ${OTLP_TOKEN}"` and `OTLP_TOKEN=abc123` is set in the environment
+- **THEN** the loaded header value SHALL be `"Bearer abc123"` and the resolved token value SHALL NOT appear in any debug log line
+
+#### Scenario: Missing env var
+
+- **WHEN** config contains `"Authorization": "Bearer ${MISSING_VAR}"` and `MISSING_VAR` is not set
+- **THEN** ccxray SHALL exit non-zero with an error message that includes the file path, line, and the variable name `MISSING_VAR`
+
+### Requirement: Literal-secret rejection
+
+The schema validator SHALL reject any string value that matches a literal-secret pattern (`Bearer [A-Za-z0-9]{20,}`, `sk_live_*`, `sk_test_*`, `ghp_*`, JWT three-segment structure) unless the value is wrapped in `${...}`. Pure URLs and hostnames SHALL be allowed.
+
+#### Scenario: Literal bearer token rejected
+
+- **WHEN** config contains `"Authorization": "Bearer abc123longtokenvalue..."`
+- **THEN** ccxray SHALL exit at startup with an error suggesting the user switch to `${ENV_VAR}` interpolation
+
+#### Scenario: Interpolated bearer token accepted
+
+- **WHEN** config contains `"Authorization": "Bearer ${TOKEN}"` and `TOKEN` is set
+- **THEN** ccxray SHALL load successfully and use the resolved value
+
+### Requirement: Gitignore auto-amend on first generation
+
+When ccxray writes a new `.ccxray.user.json` for the first time, it SHALL check whether the file is covered by the project's `.gitignore`. If not, ccxray SHALL prompt the user (or apply automatically when `--yes` is passed) to append `.ccxray.user.json` to `.gitignore`.
+
+#### Scenario: Gitignore missing entry
+
+- **WHEN** ccxray creates `.ccxray.user.json` in a repo whose `.gitignore` does not list it
+- **THEN** ccxray SHALL prompt for permission to append `.ccxray.user.json` and reflect the choice in the next run
+
+#### Scenario: Gitignore already covers the file
+
+- **WHEN** ccxray creates `.ccxray.user.json` and `.gitignore` already contains an entry matching the file
+- **THEN** no prompt SHALL appear and the file SHALL be written silently
+
+### Requirement: Config error fails fast at startup
+
+Config syntax errors, schema violations, unresolved `${VAR}` references, and literal-secret matches SHALL cause ccxray to exit non-zero at startup with an actionable error message. ccxray SHALL NOT silently continue with a partial config.
+
+#### Scenario: Invalid JSON
+
+- **WHEN** `.ccxray.json` contains malformed JSON
+- **THEN** ccxray SHALL print a parse error citing the file path and the offending line/column, and SHALL exit non-zero
+
+#### Scenario: Schema violation
+
+- **WHEN** `.ccxray.json` sets `otel.tier` to an unknown value
+- **THEN** ccxray SHALL print a schema error naming the field and listing valid values, and SHALL exit non-zero
diff --git a/openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md b/openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md
new file mode 100644
index 0000000..f4c0112
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/specs/otel-export/spec.md
@@ -0,0 +1,128 @@
+## ADDED Requirements
+
+### Requirement: Client-side OTel SDK initialization
+
+OTel SDK initialization SHALL occur in the client process (the one running `ccxray claude` or similar) and SHALL NOT occur in the hub process. The hub SHALL remain a pure HTTP proxy and SSE broadcaster.
+
+#### Scenario: Client initializes OTel
+
+- **WHEN** a ccxray client process starts with tier ≥ 1
+- **THEN** the OTel SDK SHALL initialize within the client process and emit metrics tagged with that client's resource attributes
+
+#### Scenario: Hub does not emit business metrics
+
+- **WHEN** the ccxray hub forwards an HTTP request between a client and an upstream provider
+- **THEN** the hub SHALL NOT emit any business metric on behalf of the client, regardless of the client's tier setting
+
+### Requirement: `ccxray.*` namespace for all emitted metrics
+
+Every metric SHALL be named under the `ccxray.<system>.<aspect>` pattern. No ccxray metric SHALL share a name with a Claude Code CLI metric or with a metric defined by any other upstream OTel convention.
+
+#### Scenario: Metric naming
+
+- **WHEN** an OTel metric is registered
+- **THEN** its name SHALL start with the literal prefix `ccxray.`
+
+#### Scenario: Namespace collision prevention
+
+- **WHEN** code attempts to register a metric whose name matches a `claude_code.*` pattern
+- **THEN** registration SHALL fail and tests SHALL flag it
+
+### Requirement: Source resource attribute on every emit
+
+Every metric SHALL carry the resource attribute `ccxray.source="ccxray-proxy"` so that backends can filter ccxray-emitted data from data emitted by other OTel sources running on the same host.
+
+#### Scenario: Source attribute present
+
+- **WHEN** any metric is exported by ccxray
+- **THEN** its resource attributes SHALL include `ccxray.source="ccxray-proxy"`
+
+### Requirement: Cardinality budget enforcement
+
+Each metric SHALL declare its allowed attribute keys and a numeric cardinality budget per key. Attribute keys not in the allow-list SHALL be dropped via OTel View API. When the count of unique values for an allow-listed key reaches its budget, subsequent unique values SHALL be replaced with the literal string `_overflow_` and the sentinel counter `ccxray.metrics.overflow_total{metric,attribute}` SHALL increment.
+
+#### Scenario: Allowed attribute within budget
+
+- **WHEN** `ccxray.tool.invocations_total` receives an attribute `tool="Read"` and `Read` is the 3rd of 50 budgeted tool names
+- **THEN** the metric SHALL emit with `tool="Read"` and `ccxray.metrics.overflow_total` SHALL NOT increment
+
+#### Scenario: Budget exhausted
+
+- **WHEN** the cardinality budget for `tool` is 50 and a 51st unique tool name arrives
+- **THEN** the metric SHALL emit with `tool="_overflow_"` and `ccxray.metrics.overflow_total{metric="ccxray.tool.invocations_total",attribute="tool"}` SHALL increment by 1
+
+#### Scenario: Unallowed attribute key
+
+- **WHEN** code attempts to record `ccxray.tool.invocations_total` with attribute `bash_command="rm -rf /tmp/foo"` while `bash_command` is not in the allow-list
+- **THEN** the `bash_command` attribute SHALL be dropped before emission
+
+### Requirement: CLI OTel coexistence and complement mode
+
+ccxray SHALL detect the presence of `CLAUDE_CODE_ENABLE_TELEMETRY=1` in the environment and, when detected, SHALL emit all metrics with an additional attribute `ccxray.cli_otel_active=true`. ccxray SHALL print a startup notice explaining how to choose between ccxray and CLI metrics when both are active. ccxray SHALL NOT disable any of its own metrics based on CLI coexistence.
+
+#### Scenario: CLI OTel detected
+
+- **WHEN** ccxray starts with `CLAUDE_CODE_ENABLE_TELEMETRY=1` set
+- **THEN** ccxray SHALL print a startup notice indicating complement mode and SHALL add `ccxray.cli_otel_active=true` to all emitted metrics
+
+#### Scenario: CLI OTel not detected
+
+- **WHEN** ccxray starts without `CLAUDE_CODE_ENABLE_TELEMETRY`
+- **THEN** ccxray SHALL print a notice indicating standalone mode and the attribute `ccxray.cli_otel_active` SHALL NOT be set
+
+### Requirement: Internal invariant metrics; cross-source reconciliation is a downstream concern
+
+ccxray SHALL emit invariant metrics that describe ccxray-internal consistency only. ccxray SHALL NOT emit a cross-source diff metric (e.g. ccxray vs CLI token counts) as part of Phase 1. Cross-source reconciliation SHALL be performed by downstream consumers (recording rules, Grafana panels, sidecar processes) using `request_id` or `session_id` joins on per-request metrics emitted independently by ccxray and the CLI.
+
+Rationale: A pre-aggregated diff gauge cannot answer "which request diverged" and produces persistent non-zero values for legitimate reasons (SSE chunking boundaries, retries, prompt-caching edge cases), creating alert fatigue. ccxray's correct role is to emit faithful per-request signals; cross-source diff is an analytical task that belongs in the user's observability tier, where it can be expressed as a derived series.
+
+#### Scenario: Parser sum invariant
+
+- **WHEN** ccxray's parser extracts a sum of per-tool token attributions that differs from the upstream `usage` block totals for the same response
+- **THEN** `ccxray.invariants.parser_mismatch_total{type="token_sum"}` SHALL increment
+
+#### Scenario: SSE stream completeness invariant
+
+- **WHEN** ccxray observes the upstream SSE stream terminating without a `message_stop` (Anthropic) or `response.completed` (OpenAI Responses) terminal event
+- **THEN** `ccxray.invariants.sse_truncated_total{provider}` SHALL increment
+
+#### Scenario: No cross-source diff gauge is emitted
+
+- **WHEN** OTel is enabled at any tier
+- **THEN** no metric whose name matches `ccxray.reconciliation.*` SHALL be registered with the SDK in Phase 1
+
+### Requirement: Required metric families
+
+ccxray SHALL emit the following metric families when OTel is enabled:
+
+- **Cost**: `ccxray.tokens.input_total`, `ccxray.tokens.output_total`, `ccxray.tokens.cache_read_total`, `ccxray.tokens.cache_creation_total`, `ccxray.cost.usd_total`, `ccxray.cache.hit_ratio` (gauge).
+- **Usage**: `ccxray.tool.invocations_total{tool,provider}`, `ccxray.mcp.invocations_total{server,tool}`, `ccxray.skill.activations_total{skill,provider}`, `ccxray.sessions_total{provider}`, `ccxray.agent_type.invocations_total{type}`.
+- **Quality**: `ccxray.errors_total{type,provider}`, `ccxray.stop_reason_total{reason}`, `ccxray.latency_ms` (histogram, attributes: `model`, `provider`), `ccxray.max_tokens_hit_total{model}`.
+- **Patterns**: `ccxray.context.utilization_pct` (histogram), `ccxray.auto_compact.triggered_total`, `ccxray.subagent.invocations_total`, `ccxray.tools_per_turn` (histogram).
+- **Governance**: `ccxray.permission_mode.usage_total{mode}`, `ccxray.dangerous_tool.invocations_total{pattern}`, `ccxray.file_writes_total`, `ccxray.provider.distribution_total{provider}`.
+
+Each metric SHALL be registered with its allow-list of attribute keys and cardinality budget at SDK initialization.
+
+#### Scenario: Cost metric emission after a turn
+
+- **WHEN** ccxray completes forwarding a request and receives a usage block from the upstream provider
+- **THEN** `ccxray.tokens.input_total`, `ccxray.tokens.output_total`, and `ccxray.cost.usd_total` SHALL each increment by the corresponding value
+
+#### Scenario: Tool invocation metric
+
+- **WHEN** ccxray detects a `tool_use` block named `Bash` in a response
+- **THEN** `ccxray.tool.invocations_total` SHALL increment by 1 with attribute `tool="Bash"`
+
+### Requirement: Minimal optional dependencies
+
+The OTel-related Node.js dependencies SHALL be limited to `@opentelemetry/api`, `@opentelemetry/sdk-metrics`, `@opentelemetry/exporter-metrics-otlp-http`, and `@opentelemetry/resources`. Auto-instrumentation packages SHALL NOT be included. Dependencies SHALL be resolved lazily so that ccxray remains functional even when OTel packages are absent (tier 0 only).
+
+#### Scenario: OTel packages absent and tier 0
+
+- **WHEN** OTel packages are not installed and effective tier is 0
+- **THEN** ccxray SHALL start normally without referencing any OTel package
+
+#### Scenario: OTel packages absent and tier ≥ 1
+
+- **WHEN** OTel packages are not installed and effective tier is ≥ 1
+- **THEN** ccxray SHALL emit a clear error explaining which packages to install and SHALL exit non-zero
diff --git a/openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md b/openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md
new file mode 100644
index 0000000..42ad9c2
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/specs/otel-health/spec.md
@@ -0,0 +1,99 @@
+## ADDED Requirements
+
+### Requirement: Four-state OTel health machine
+
+ccxray SHALL maintain an OTel health state machine with exactly four states: `disabled`, `active`, `degraded`, and `circuit_open`. Transitions SHALL be driven exclusively by the conditions described in the subsequent requirements; no other code path SHALL mutate state.
+
+#### Scenario: Disabled at startup
+
+- **WHEN** effective tier is 0 or OTel packages are absent
+- **THEN** the state SHALL be `disabled`, no `ccxray.otel.state` gauge SHALL be emitted (no SDK is initialized in this state), and the state SHALL be observable only through `ccxray status --otel`
+
+#### Scenario: Active after successful init
+
+- **WHEN** effective tier is ≥ 1 and SDK initialization completes
+- **THEN** the state SHALL be `active`
+
+### Requirement: Bounded export queue with drop-oldest semantics
+
+The OTel export queue SHALL be bounded by a configurable size (default 2048 entries). When the queue is full and a new export is attempted, the oldest queued entry SHALL be dropped to make room. Each drop SHALL increment `ccxray.otel.exports_dropped_total{signal}`.
+
+#### Scenario: Queue under limit
+
+- **WHEN** the queue holds fewer than its configured maximum entries and a new export arrives
+- **THEN** the new entry SHALL be appended and no drop SHALL occur
+
+#### Scenario: Queue at limit
+
+- **WHEN** the queue is at its configured maximum and a new export arrives
+- **THEN** the oldest entry SHALL be removed, the new entry SHALL be appended, and `ccxray.otel.exports_dropped_total{signal="<signal name>"}` SHALL increment by 1
+
+### Requirement: Circuit breaker with exponential backoff
+
+After 5 consecutive export failures, the state SHALL transition to `circuit_open` and exports SHALL be paused. After an initial cooldown of 60 seconds, the breaker SHALL enter its `half_open` trial phase and a single export SHALL be attempted. `half_open` is a sub-phase of `circuit_open`, not a fifth health state: the reported health state SHALL remain `circuit_open` while the trial is in flight. Success SHALL return the state to `active`. Failure SHALL keep the state at `circuit_open` and the cooldown SHALL double up to a maximum of 600 seconds.
+
+#### Scenario: Trip on 5 consecutive failures
+
+- **WHEN** 5 consecutive export attempts return errors
+- **THEN** the state SHALL transition to `circuit_open` and no further exports SHALL be attempted until the cooldown elapses
+
+#### Scenario: Half-open success returns to active
+
+- **WHEN** the cooldown elapses, the breaker enters its `half_open` trial phase (the health state is still `circuit_open`), and the trial export succeeds
+- **THEN** the state SHALL transition back to `active` and the cooldown SHALL reset to 60 seconds
+
+#### Scenario: Half-open failure increases cooldown
+
+- **WHEN** the trial export in `half_open` fails
+- **THEN** the state SHALL remain `circuit_open` and the next cooldown SHALL be `min(previous_cooldown * 2, 600)` seconds
+
+### Requirement: Failure log on local disk
+
+Failed export attempts and state transitions SHALL be written to `~/.ccxray/otel.log` in append mode. The file SHALL be rotated once it exceeds a configurable size (default 1 MB). Rotated files SHALL be retained up to a configurable count (default 5).
+
+#### Scenario: Export error recorded
+
+- **WHEN** an export attempt fails with a network error
+- **THEN** a single line SHALL be appended to `~/.ccxray/otel.log` containing the timestamp, the error class, and the queue depth at time of failure
+
+#### Scenario: File rotated at size limit
+
+- **WHEN** `~/.ccxray/otel.log` exceeds the configured size limit (default 1 MB)
+- **THEN** it SHALL be renamed to `otel.log.1` (with existing rotations shifted), a fresh `otel.log` SHALL be created, and files beyond the retention count SHALL be deleted
+
+### Requirement: Never-block guarantee for the proxy
+
+OTel export operations SHALL NOT block the HTTP proxy path. All emit operations SHALL enqueue without awaiting export completion. SDK shutdown during process exit SHALL be capped at 2 seconds and SHALL NOT prevent clean exit on timeout.
+
+#### Scenario: Collector unreachable
+
+- **WHEN** the OTLP endpoint is unreachable for the duration of a proxy request
+- **THEN** the proxy SHALL forward the request and return the response with no additional latency from OTel
+
+#### Scenario: SDK shutdown timeout
+
+- **WHEN** the process is exiting and the OTel SDK flush has not completed within 2 seconds
+- **THEN** the flush SHALL be abandoned and the process SHALL exit cleanly without waiting further
+
+### Requirement: Config errors fail fast, init/runtime errors degrade
+
+Config parsing or schema errors SHALL cause non-zero process exit at startup with an actionable message. SDK initialization errors (e.g. invalid endpoint URL format) SHALL transition the state to `degraded` and SHALL NOT block ccxray startup. Runtime export errors SHALL be handled by the circuit breaker without affecting other ccxray behavior.
+
+#### Scenario: Bad endpoint URL
+
+- **WHEN** `.ccxray.json` sets `otel.endpoint` to a string that is not a valid URL
+- **THEN** ccxray SHALL continue to start, the state SHALL be `degraded`, the dashboard and proxy SHALL function normally, and `ccxray status --otel` SHALL display the error
+
+#### Scenario: Missing required field
+
+- **WHEN** `.ccxray.json` enables tier 1 but omits `otel.endpoint`
+- **THEN** ccxray SHALL exit non-zero at startup with an error pointing to the missing field
+
+### Requirement: Health state observable via metric and status command
+
+The current health state SHALL be observable through (a) a gauge `ccxray.otel.state{state}` (where possible — emitted only when state is `active` or `degraded`), and (b) the `ccxray status --otel` output regardless of state.
+
+#### Scenario: State visible in status command
+
+- **WHEN** an engineer runs `ccxray status --otel`
+- **THEN** the output SHALL include the current state, the last 3 state transitions with timestamps, and the current circuit breaker cooldown remaining (if applicable)
diff --git a/openspec/changes/add-otel-metrics-phase1/specs/otel-introspection/spec.md b/openspec/changes/add-otel-metrics-phase1/specs/otel-introspection/spec.md
new file mode 100644
index 0000000..53f1589
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/specs/otel-introspection/spec.md
@@ -0,0 +1,66 @@
+## ADDED Requirements
+
+### Requirement: `ccxray status --otel` shows effective configuration and health
+
+The `ccxray status --otel` command SHALL print:
+
+- The current effective tier (0/1/2) and which config files contributed.
+- The endpoint URL with any `${VAR}` masked.
+- The OTel health state (`disabled / active / degraded / circuit_open`) and last 3 state transitions with timestamps.
+- The circuit breaker cooldown remaining (when applicable).
+- Per-metric cardinality usage in `current / budget` format (e.g. `tool: 23/50`).
+- Total counts: exports succeeded, exports failed, exports dropped (last hour and last 24 hours).
+- The `opt_in_acknowledged_at` timestamp for tier 2 (when applicable).
+- CLI coexistence indicator: whether `CLAUDE_CODE_ENABLE_TELEMETRY` is detected.
+
+#### Scenario: Status at tier 1
+
+- **WHEN** ccxray is running at tier 1 with a healthy collector
+- **THEN** `ccxray status --otel` SHALL show `tier=1`, `state=active`, the endpoint, cardinality usage rows for each registered metric, and the export success/failure counts
+
+#### Scenario: Status at tier 0
+
+- **WHEN** ccxray is running at tier 0
+- **THEN** `ccxray status --otel` SHALL show `tier=0`, `state=disabled`, and SHALL NOT attempt to read OTel runtime state
+
+### Requirement: `ccxray otel preview` dry-run
+
+The `ccxray otel preview` command SHALL print the exact JSON body that would be sent to the OTel collector on the next export, including all attribute values and resource attributes, WITHOUT sending any network request. Secrets resolved from `${ENV_VAR}` SHALL be masked in the output.
+
+#### Scenario: Preview before enabling
+
+- **WHEN** an engineer runs `ccxray otel preview` after setting up `.ccxray.json`
+- **THEN** the command SHALL print a single JSON object representing the next export, with `Authorization` and similar header values masked (e.g. `Bearer abc1***`) rather than showing the resolved token
+
+#### Scenario: Preview with no recent metrics
+
+- **WHEN** ccxray has no queued metrics to export
+- **THEN** the command SHALL print a notice that no metrics are pending and SHALL exit zero
+
+### Requirement: Startup banner declares active tier and mode
+
+When ccxray starts at tier ≥ 1, it SHALL print a one-line banner to stderr summarizing: tier value, endpoint (without secret), and complement-mode status (if CLI OTel is active). The banner SHALL NOT print when tier is 0.
+
+#### Scenario: Banner at tier 1 standalone
+
+- **WHEN** ccxray starts at tier 1 without CLI OTel
+- **THEN** stderr SHALL contain a single line matching the pattern `ccxray OTel tier: 1 (anonymous) → <endpoint>` followed by no further banner output for that launch
+
+#### Scenario: Banner at tier 1 complement
+
+- **WHEN** ccxray starts at tier 1 with `CLAUDE_CODE_ENABLE_TELEMETRY=1`
+- **THEN** stderr SHALL contain a line indicating `tier: 1` and `complement-mode: true`
+
+#### Scenario: No banner at tier 0
+
+- **WHEN** ccxray starts at tier 0
+- **THEN** stderr SHALL NOT contain any OTel-related banner line
+
+### Requirement: Secrets masking in all introspection output
+
+`ccxray status --otel` and `ccxray otel preview` SHALL mask any value resolved from a `${VAR}` interpolation. Masked values SHALL display as the prefix (up to 4 characters) followed by `***`. The full unmasked value SHALL never be printed by any introspection command.
+
+#### Scenario: Auth header masked
+
+- **WHEN** the resolved auth header is `Bearer abc123longtokenvalue`
+- **THEN** introspection output SHALL display `Bearer abc1***` and SHALL NOT print the remainder of the token
diff --git a/openspec/changes/add-otel-metrics-phase1/specs/otel-tiers/spec.md b/openspec/changes/add-otel-metrics-phase1/specs/otel-tiers/spec.md
new file mode 100644
index 0000000..e10c2c9
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/specs/otel-tiers/spec.md
@@ -0,0 +1,79 @@
+## ADDED Requirements
+
+### Requirement: Three discrete tier values
+
+ccxray SHALL support exactly three tier values for OTel export:
+
+- **0 — disabled**: No SDK initialization, no network egress.
+- **1 — project anonymous**: Emit with project-level resource attributes (`project.name`, optional `team`) but no individual identity.
+- **2 — personal named**: Emit with `enduser.id` attached (a self-chosen string set by the engineer).
+
+#### Scenario: Tier 0 produces no egress
+
+- **WHEN** the effective tier resolves to 0
+- **THEN** no OTel package SHALL be loaded and no network connection SHALL be opened for telemetry
+
+#### Scenario: Tier 1 omits identity
+
+- **WHEN** the effective tier resolves to 1 and a request completes
+- **THEN** emitted metrics SHALL include `project.name` (if configured) but SHALL NOT include any `enduser.id` attribute
+
+#### Scenario: Tier 2 includes identity
+
+- **WHEN** the effective tier resolves to 2 and personal config provides `identity: "alice"`
+- **THEN** emitted metrics SHALL include `enduser.id="alice"` as a resource attribute
+
+### Requirement: Tier resolution rule
+
+The effective tier SHALL be `min(project_tier, personal_tier)`. If either side is absent, the present side SHALL be used. The minimum SHALL clamp downward; personal config SHALL NOT exceed project config.
+
+#### Scenario: Personal lower than project
+
+- **WHEN** project tier is 1 and personal tier is 0
+- **THEN** the effective tier SHALL be 0
+
+#### Scenario: Project lower than personal
+
+- **WHEN** project tier is 1 and personal tier is 2 without project authorization for tier 2
+- **THEN** the effective tier SHALL be 1 and ccxray SHALL emit a warning that personal tier is clamped
+
+#### Scenario: Equal tiers
+
+- **WHEN** project tier is 1 and personal tier is 1
+- **THEN** the effective tier SHALL be 1
+
+### Requirement: Engineer unilateral opt-out
+
+Any engineer SHALL be able to opt out of OTel emission for their own machine by setting `tier: 0` in `.ccxray.user.json`, regardless of the project config. This opt-out SHALL take effect on the next ccxray launch.
+
+#### Scenario: Opt-out overrides project tier
+
+- **WHEN** project config sets tier 2 and personal config sets tier 0
+- **THEN** the engineer's ccxray client SHALL emit no telemetry until personal config is changed
+
+### Requirement: Personal config gitignore enforcement
+
+The personal config file `.ccxray.user.json` SHALL be excluded from version control. ccxray SHALL refuse to load personal-tier identity from a file that is currently tracked by git and SHALL emit a warning explaining the risk.
+
+#### Scenario: Personal config tracked by git
+
+- **WHEN** `.ccxray.user.json` exists in the repo and is tracked by git
+- **THEN** ccxray SHALL print a warning recommending `git rm --cached` and SHALL refuse to apply the personal identity until the file is untracked or moved to `$HOME`
+
+### Requirement: Opt-in acknowledgment timestamp
+
+When personal config sets tier 2 for the first time, ccxray SHALL record an `opt_in_acknowledged_at` ISO 8601 timestamp in the file. This timestamp SHALL be displayed in `ccxray status --otel` so the engineer can confirm when they opted in.
+
+#### Scenario: First-time tier 2 opt-in
+
+- **WHEN** a user creates `.ccxray.user.json` with tier 2 for the first time
+- **THEN** ccxray SHALL write the current time into the file as `opt_in_acknowledged_at` and SHALL include it in subsequent `status --otel` output
+
+### Requirement: Tier distribution sentinel
+
+ccxray SHALL emit `ccxray.otel.tier_distribution{tier}` as a counter, labeled with the effective tier value and incremented once per process launch that initializes OTel. This metric is meant to show where documentation needs strengthening (e.g. a low tier 2 share suggests trust concerns).
+
+#### Scenario: Counter increments on launch
+
+- **WHEN** ccxray client process initializes at tier 1
+- **THEN** `ccxray.otel.tier_distribution{tier="1"}` SHALL increment by 1
diff --git a/openspec/changes/add-otel-metrics-phase1/specs/parser-schemas/spec.md b/openspec/changes/add-otel-metrics-phase1/specs/parser-schemas/spec.md
new file mode 100644
index 0000000..36e6b23
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/specs/parser-schemas/spec.md
@@ -0,0 +1,83 @@
+## ADDED Requirements
+
+### Requirement: Versioned parser schemas per concern and provider
+
+Detection logic for tool / MCP / skill / agent-type SHALL be expressed as JSON schemas under `server/parsers/`. There SHALL be at minimum one schema per (concern, provider) pair:
+
+- `parsers/anthropic-tools.schema.json`
+- `parsers/anthropic-skills.schema.json`
+- `parsers/anthropic-agent-types.schema.json`
+- `parsers/mcp-tools.schema.json` (provider-agnostic MCP naming convention)
+- `parsers/codex-tools.schema.json`
+
+Each schema SHALL include a `version` field (semver) and a `last_verified_against` field (ISO 8601 date). Inline string matching in `server/system-prompt.js`, `server/store.js`, or other code paths SHALL be removed in favor of the schema-driven parser.
+
+#### Scenario: Schema referenced at runtime
+
+- **WHEN** ccxray processes an Anthropic response containing a `tool_use` block
+- **THEN** the tool name SHALL be classified using `parsers/anthropic-tools.schema.json` and SHALL NOT be matched against any hardcoded list embedded in other files
+
+### Requirement: Snapshot fixtures per provider
+
+Test fixtures under `test/fixtures/parser/` SHALL cover at minimum the following cases per provider:
+
+- Basic tool invocation
+- Tool invocation with a skill marker active
+- Subagent invocation (Anthropic Task tool)
+- MCP server tool invocation
+- An intentional unknown tool name
+
+Each fixture SHALL pair an input (request or response JSON) with an expected parser output snapshot. Parser changes SHALL require committing new snapshots and SHALL pass review before merge.
+
+#### Scenario: Snapshot drift fails CI
+
+- **WHEN** parser code is changed in a way that alters fixture output
+- **THEN** the test suite SHALL fail with a diff between old and new snapshot until the snapshot is updated and reviewed
+
+### Requirement: Sentinel counters for unknown tokens
+
+When the parser encounters a token, marker, or block that does not match any registered pattern in the relevant schema, it SHALL increment whichever of the following counters corresponds to the kind of mismatch:
+
+- `ccxray.parser.unknown_tool_total{provider}`
+- `ccxray.parser.unknown_skill_marker_total{provider}`
+- `ccxray.parser.unknown_mcp_format_total`
+- `ccxray.parser.fallback_used_total{parser,reason}`
+
+The unknown event SHALL also be recorded with a short sample to `~/.ccxray/parser-drift.log` for later inspection via `ccxray parser report`.
+
+#### Scenario: Unknown tool name observed
+
+- **WHEN** ccxray sees a `tool_use` block whose `name` does not match any pattern in `parsers/anthropic-tools.schema.json`
+- **THEN** `ccxray.parser.unknown_tool_total{provider="anthropic"}` SHALL increment by 1 and a sample SHALL be appended to `~/.ccxray/parser-drift.log`
+
+### Requirement: Reconciliation invariants
+
+For every processed entry the parser SHALL verify the following invariants:
+
+- Number of `tool_use` blocks in the response equals the number of tool entries extracted by the parser.
+- Sum of input/output token counts attributed by the parser equals the corresponding values in the upstream usage block.
+
+When an invariant fails, `ccxray.parser.reconciliation_mismatch_total{type}` SHALL increment by 1 and the entry ID SHALL be appended to `~/.ccxray/parser-drift.log`. The mismatch SHALL NOT alter the entry's local log content.
+
+#### Scenario: Tool count mismatch
+
+- **WHEN** a response contains 3 `tool_use` blocks but the parser extracts only 2 tool entries
+- **THEN** `ccxray.parser.reconciliation_mismatch_total{type="tool_count"}` SHALL increment and the entry ID SHALL be recorded in the drift log
+
+### Requirement: Parser error isolation
+
+Parser code SHALL be wrapped in try/catch boundaries. On exception, `ccxray.parser.error_total{parser,error_type}` SHALL increment and the originating entry SHALL still be written to local logs. The OTel span/metric for the affected entry SHALL be tagged `ccxray.parser.degraded=true`. Parser failure SHALL NOT propagate to the proxy path or terminate ccxray.
+
+#### Scenario: Parser throws
+
+- **WHEN** the skill marker parser throws a runtime exception while processing a response
+- **THEN** ccxray SHALL log the exception locally, increment `ccxray.parser.error_total{parser="anthropic-skills",error_type="<class>"}`, write the entry to disk as usual, and continue forwarding subsequent requests
+
+### Requirement: `ccxray parser report` command
+
+The `ccxray parser report` command SHALL print the top unknown tokens by frequency from the last 7 days of `~/.ccxray/parser-drift.log`, grouped by category (tool / skill / MCP / fallback). The output SHALL include sample tokens and a GitHub issue body template the user can copy to file a drift report.
+
+#### Scenario: Reporting after seeing unknown markers
+
+- **WHEN** the engineer has accumulated unknown markers and runs `ccxray parser report`
+- **THEN** the command SHALL print a categorized summary, the most recent 5 unique samples per category, and a formatted GitHub issue body
diff --git a/openspec/changes/add-otel-metrics-phase1/tasks.md b/openspec/changes/add-otel-metrics-phase1/tasks.md
new file mode 100644
index 0000000..5c9af0e
--- /dev/null
+++ b/openspec/changes/add-otel-metrics-phase1/tasks.md
@@ -0,0 +1,105 @@
+## 1. Dependencies and package wiring
+
+- [x] 1.1 Add `@opentelemetry/api`, `@opentelemetry/sdk-metrics`, `@opentelemetry/exporter-metrics-otlp-http`, `@opentelemetry/resources` as `dependencies` in `package.json` (no auto-instrumentations)
+- [x] 1.2 Implement lazy require in a helper so ccxray still runs at tier 0 when OTel packages are absent
+- [x] 1.3 Update `package-lock.json` and confirm bundle size delta is within an acceptable bound
+
+## 2. Config loader (`server/config-loader.js`)
+
+- [ ] 2.1 Define JSON schema for `.ccxray.json` (project) and `.ccxray.user.json` (personal) covering: `otel.enabled`, `otel.tier`, `otel.endpoint`, `otel.headers`, `otel.resource_attributes`, `otel.cardinality_overrides`
+- [ ] 2.2 Implement schema validation with line/column error reporting
+- [ ] 2.3 Implement `${ENV_VAR}` interpolation across all string values; fail fast on an unresolved variable, naming it in the error
+- [ ] 2.4 Implement literal-secret detector (Bearer/JWT/`sk_*`/`ghp_*`) that rejects values not wrapped in `${...}`
+- [ ] 2.5 Implement project config lookup walking up from cwd to git root, taking the first `.ccxray.json` match
+- [ ] 2.6 Implement personal config lookup: cwd first, then `$HOME` fallback
+- [ ] 2.7 Implement tier resolution `effective = min(project_tier, personal_tier)` with downward clamp warning
+- [ ] 2.8 Implement `.gitignore` check and auto-amend with `--yes` flag for `.ccxray.user.json`
+- [ ] 2.9 Reject personal config that is currently tracked by git, with explanatory error
+- [ ] 2.10 Persist `opt_in_acknowledged_at` ISO 8601 timestamp on first tier 2 enable
+- [ ] 2.11 Unit tests covering all error paths, interpolation, secret rejection, tier resolution matrix
+
+## 3. OTel health module (`server/otel-health.js`)
+
+- [x] 3.1 Implement state machine with four states: `disabled / active / degraded / circuit_open` and transitions only via documented APIs
+- [ ] 3.2 Implement bounded export queue with drop-oldest semantics and `ccxray.otel.exports_dropped_total{signal}` increment per drop
+- [ ] 3.3 Implement circuit breaker: trips after 5 consecutive failures, 60s initial cooldown, half-open trial, exponential backoff up to a 600s maximum
+- [ ] 3.4 Implement `~/.ccxray/otel.log` append writer with size-based rotation (default 1 MB, 5 file retention)
+- [x] 3.5 Implement SDK shutdown with 2-second hard cap to never block process exit
+- [ ] 3.6 Surface state and metrics via a status reporter API consumed by the CLI status command
+- [ ] 3.7 Unit tests with mock collector (200 / 500 / timeout) covering queue overflow, circuit transitions, half-open recovery, and exponential backoff
+
+## 4. OTel SDK initialization (`server/otel.js`)
+
+- [x] 4.1 Implement SDK init for metrics only, with `ccxray.source="ccxray-proxy"` resource attribute
+- [ ] 4.2 Define metric registry with allow-list of attribute keys and cardinality budgets per metric (View API)
+- [ ] 4.3 Implement cardinality budget tracker with `_overflow_` fallback and `ccxray.metrics.overflow_total{metric,attribute}` sentinel
+- [ ] 4.4 Detect `CLAUDE_CODE_ENABLE_TELEMETRY=1` and apply `ccxray.cli_otel_active=true` attribute in complement mode
+- [ ] 4.5 Register all metric families per `otel-export/spec.md`: cost, usage, quality, patterns, governance
+- [ ] 4.6 Register sentinel metrics: overflow, parser unknowns, parser mismatches, otel state, reconciliation diff, tier distribution
+- [ ] 4.7 Implement export-time masking of any value resolved from `${ENV_VAR}` for log lines and trace dumps
+- [ ] 4.8 Implement internal invariant metrics (`ccxray.invariants.parser_mismatch_total{type}`, `ccxray.invariants.sse_truncated_total`) — cross-source diff against CLI is NOT in Phase 1; documented as downstream pattern instead
+- [ ] 4.9 Unit tests for namespace lint (no metric name starts with `claude_code.`), source attribute presence, budget enforcement, complement mode attribute, lazy SDK init at tier 0
+
+## 5. Parser schema-ization (`server/parsers/`)
+
+- [ ] 5.1 Define the JSON schema format (fields: `version`, `last_verified_against`, `patterns`, `examples`)
+- [ ] 5.2 Author `parsers/anthropic-tools.schema.json` covering current internal tool names
+- [ ] 5.3 Author `parsers/anthropic-skills.schema.json` covering known skill marker formats from `system-prompt.js`
+- [ ] 5.4 Author `parsers/anthropic-agent-types.schema.json` for general / explore / plan / known subagent types
+- [ ] 5.5 Author `parsers/mcp-tools.schema.json` for `mcp__<server>__<tool>` naming
+- [ ] 5.6 Author `parsers/codex-tools.schema.json` for OpenAI Responses tool patterns
+- [ ] 5.7 Implement parser dispatch in `server/parsers/index.js` consuming the schemas
+- [ ] 5.8 Replace inline string matching in `server/system-prompt.js`, `server/store.js`, and `server/helpers.js` with schema dispatch calls
+- [ ] 5.9 Implement sentinel emission for unknown tools / skills / MCP markers and `~/.ccxray/parser-drift.log` append writer
+- [ ] 5.10 Implement reconciliation invariants: tool_use block count equals extracted count; token attribution sums equal usage block values
+- [ ] 5.11 Wrap parser calls in try/catch with `ccxray.parser.error_total{parser,error_type}` increment and `ccxray.parser.degraded=true` attribute on the affected entry
+- [ ] 5.12 Author snapshot fixtures under `test/fixtures/parser/` for every (provider, scenario) pair listed in `parser-schemas/spec.md`
+- [ ] 5.13 Wire snapshot tests into `npm test`
+
+## 6. Wire metrics into forward / store paths
+
+- [ ] 6.1 In `server/forward.js`, emit cost / token / latency / error / stop_reason metrics after each completed forward, using the otel-health queue _(partial: `emit('entry_completed', { entry })` wired in all 3 forward paths with full entry payload; routing through the otel-health queue is pending §3.2)_
+- [ ] 6.2 In `server/store.js`, emit usage / pattern / governance metrics as session/tool/skill/MCP detection runs through the new parsers
+- [ ] 6.3 Ensure no emit path can throw into the proxy code path; all emits are best-effort
+- [ ] 6.4 Add a unit test that verifies forward.js continues to function with OTel disabled, init-failed (degraded), and circuit_open states
+
+## 7. CLI introspection commands
+
+- [ ] 7.1 Implement `ccxray status --otel` per `otel-introspection/spec.md`: tier, endpoint (masked), state, transitions, cooldown, cardinality usage rows, success/failure/dropped counts, opt_in_acknowledged_at, CLI coexistence flag
+- [ ] 7.2 Implement `ccxray otel preview` dry-run printing next-export JSON with secrets masked
+- [ ] 7.3 Implement `ccxray parser report` command summarizing top unknown tokens and generating a GitHub issue body template
+- [ ] 7.4 Add startup banner declaring tier and complement-mode status when tier ≥ 1
+- [ ] 7.5 Unit tests for each command and banner output
+
+## 8. Hub-side coexistence (minimal Phase 1 changes)
+
+- [ ] 8.1 Confirm the hub does NOT initialize OTel SDK for business metrics; document this explicitly in the hub module header comment
+- [ ] 8.2 Make `ccxray status` aware of per-client OTel state via hub's existing client registration channel (so cross-client visibility works)
+- [ ] 8.3 Defer `ccxray.hub.*` operational metrics to a follow-up change (per Open Questions in design.md)
+
+## 9. Documentation
+
+- [ ] 9.1 Add `docs/otel-ethics.md` (bilingual): why these metrics are not for individual performance evaluation; what acceptable uses look like
+- [ ] 9.2 Add `docs/otel-quickstart.md` (bilingual): 90-second Grafana onboarding with screenshots
+- [ ] 9.3 Link to the existing `docs/otel-integration.html` from the README as the design record
+- [ ] 9.4 Update README with a single section: "Optional: send metrics to your observability backend" linking to quickstart and ethics docs
+- [ ] 9.5 Update `CLAUDE.md` Architecture section to note the new modules and their roles
+- [ ] 9.6 Add `docs/otel-recon.md` (bilingual): why cross-source reconciliation is a downstream concern, recording-rule / Grafana-panel / sidecar recipes for diffing ccxray vs CLI counts on `request_id`
+
+## 10. Verification gates
+
+- [ ] 10.1 CI lint: every emitted metric name MUST exist in `server/otel.js` schema registry; new metrics without registry entries fail build
+- [ ] 10.2 CI lint: no metric name SHALL start with `claude_code.`; assertion runs across all `server/**/*.js`
+- [ ] 10.3 Integration test: spin a local OTLP collector (docker), run a synthetic ccxray session, assert collector received the expected metric families with correct attributes
+- [ ] 10.4 Integration test: simulate collector returning 500 → assert circuit opens, queue drops oldest, ccxray continues forwarding
+- [ ] 10.5 Integration test: simulate `CLAUDE_CODE_ENABLE_TELEMETRY=1` → assert `cli_otel_active` attribute appears on emitted metrics
+- [ ] 10.6 Manual usability test: 3 new engineers walk README + quickstart, target median time-to-first-metric < 5 minutes
+- [ ] 10.7 Set 3-month KPI gate in repo: track GitHub references to "otel" / "OTEL_EXPORTER"; if < 10 within 3 months of release, pause Phase 2 work and revisit
+
+## 11. Release prep
+
+- [ ] 11.1 Update CHANGELOG with new dependencies, default-off behavior, three-tier model, and link to design doc
+- [ ] 11.2 Confirm npm publish package size delta and document in PR description
+- [ ] 11.3 Open follow-up issue for Phase 2 (span emit + `/entry/:id` drill-back)
+- [ ] 11.4 Open follow-up issue for `--otel-demo` Docker Compose helper
+- [ ] 11.5 Open follow-up issue for `ccxray.hub.*` operational metrics
diff --git a/package-lock.json b/package-lock.json
index 3afd051..866b5e2 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,15 +1,19 @@
 {
   "name": "ccxray",
-  "version": "1.5.0",
+  "version": "1.9.2",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "ccxray",
-      "version": "1.5.0",
+      "version": "1.9.2",
       "license": "MIT",
       "dependencies": {
         "@anthropic-ai/tokenizer": "^0.0.4",
+        "@opentelemetry/api": "^1.9.0",
+        "@opentelemetry/exporter-metrics-otlp-http": "^0.205.0",
+        "@opentelemetry/resources": "^2.0.0",
+        "@opentelemetry/sdk-metrics": "^2.0.0",
         "ws": "^8.19.0"
       },
       "bin": {
@@ -497,6 +501,299 @@
         "node": ">= 8"
       }
     },
+    "node_modules/@opentelemetry/api": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.1.tgz",
+      "integrity": "sha512-gLyJlPHPZYdAk1JENA9LeHejZe1Ti77/pTeFm/nMXmQH/HFZlcS/O2XJB+L8fkbrNSqhdtlvjBVjxwUYanNH5Q==",
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=8.0.0"
+      }
+    },
+    "node_modules/@opentelemetry/api-logs": {
+      "version": "0.205.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.205.0.tgz",
+      "integrity": "sha512-wBlPk1nFB37Hsm+3Qy73yQSobVn28F4isnWIBvKpd5IUH/eat8bwcL02H9yzmHyyPmukeccSl2mbN5sDQZYnPg==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      },
+      "engines": {
+        "node": ">=8.0.0"
+      }
+    },
+    "node_modules/@opentelemetry/core": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.1.0.tgz",
+      "integrity": "sha512-RMEtHsxJs/GiHHxYT58IY57UXAQTuUnZVco6ymDEqTNlJKTimM4qPUPVe8InNFyBjhHBEAx4k3Q8LtNayBsbUQ==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/exporter-metrics-otlp-http": {
+      "version": "0.205.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/exporter-metrics-otlp-http/-/exporter-metrics-otlp-http-0.205.0.tgz",
+      "integrity": "sha512-fFxNQ/HbbpLmh1pgU6HUVbFD1kNIjrkoluoKJkh88+gnmpFD92kMQ8WFNjPnSbjg2mNVnEkeKXgCYEowNW+p1w==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/otlp-exporter-base": "0.205.0",
+        "@opentelemetry/otlp-transformer": "0.205.0",
+        "@opentelemetry/resources": "2.1.0",
+        "@opentelemetry/sdk-metrics": "2.1.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      }
+    },
+    "node_modules/@opentelemetry/exporter-metrics-otlp-http/node_modules/@opentelemetry/resources": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.1.0.tgz",
+      "integrity": "sha512-1CJjf3LCvoefUOgegxi8h6r4B/wLSzInyhGP2UmIBYNlo4Qk5CZ73e1eEyWmfXvFtm1ybkmfb2DqWvspsYLrWw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/exporter-metrics-otlp-http/node_modules/@opentelemetry/sdk-metrics": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-2.1.0.tgz",
+      "integrity": "sha512-J9QX459mzqHLL9Y6FZ4wQPRZG4TOpMCyPOh6mkr/humxE1W2S3Bvf4i75yiMW9uyed2Kf5rxmLhTm/UK8vNkAw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/resources": "2.1.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.9.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/otlp-exporter-base": {
+      "version": "0.205.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-exporter-base/-/otlp-exporter-base-0.205.0.tgz",
+      "integrity": "sha512-2MN0C1IiKyo34M6NZzD6P9Nv9Dfuz3OJ3rkZwzFmF6xzjDfqqCTatc9v1EpNfaP55iDOCLHFyYNCgs61FFgtUQ==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/otlp-transformer": "0.205.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      }
+    },
+    "node_modules/@opentelemetry/otlp-transformer": {
+      "version": "0.205.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-transformer/-/otlp-transformer-0.205.0.tgz",
+      "integrity": "sha512-KmObgqPtk9k/XTlWPJHdMbGCylRAmMJNXIRh6VYJmvlRDMfe+DonH41G7eenG8t4FXn3fxOGh14o/WiMRR6vPg==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api-logs": "0.205.0",
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/resources": "2.1.0",
+        "@opentelemetry/sdk-logs": "0.205.0",
+        "@opentelemetry/sdk-metrics": "2.1.0",
+        "@opentelemetry/sdk-trace-base": "2.1.0",
+        "protobufjs": "^7.3.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      }
+    },
+    "node_modules/@opentelemetry/otlp-transformer/node_modules/@opentelemetry/resources": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.1.0.tgz",
+      "integrity": "sha512-1CJjf3LCvoefUOgegxi8h6r4B/wLSzInyhGP2UmIBYNlo4Qk5CZ73e1eEyWmfXvFtm1ybkmfb2DqWvspsYLrWw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/otlp-transformer/node_modules/@opentelemetry/sdk-metrics": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-2.1.0.tgz",
+      "integrity": "sha512-J9QX459mzqHLL9Y6FZ4wQPRZG4TOpMCyPOh6mkr/humxE1W2S3Bvf4i75yiMW9uyed2Kf5rxmLhTm/UK8vNkAw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/resources": "2.1.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.9.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/resources": {
+      "version": "2.7.1",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.7.1.tgz",
+      "integrity": "sha512-DeT6KKolmC4e/dRQvMQ/RwlnzhaqeiFOXY5ngoOPJ07GgVVKxZOg9EcrNZb5aTzUn+iCrJldAgOfQm1O/QfPAQ==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.7.1",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/resources/node_modules/@opentelemetry/core": {
+      "version": "2.7.1",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.7.1.tgz",
+      "integrity": "sha512-QAqIj32AtK6+pEVNG7EOVxHdE06RP+FM5qpiEJ4RtDcFIqKUZHYhl7/7UY5efhwmwNAg7j8QbJVBLxMerc0+gw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-logs": {
+      "version": "0.205.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-logs/-/sdk-logs-0.205.0.tgz",
+      "integrity": "sha512-nyqhNQ6eEzPWQU60Nc7+A5LIq8fz3UeIzdEVBQYefB4+msJZ2vuVtRuk9KxPMw1uHoHDtYEwkr2Ct0iG29jU8w==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api-logs": "0.205.0",
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/resources": "2.1.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.4.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-logs/node_modules/@opentelemetry/resources": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.1.0.tgz",
+      "integrity": "sha512-1CJjf3LCvoefUOgegxi8h6r4B/wLSzInyhGP2UmIBYNlo4Qk5CZ73e1eEyWmfXvFtm1ybkmfb2DqWvspsYLrWw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-metrics": {
+      "version": "2.7.1",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-2.7.1.tgz",
+      "integrity": "sha512-MpDJdkiFDs3Pm1RHO3KByuZbuBdJEXEAkiC0+yJdsZGVCdf1RpHR6n+LHDcS7ffmfrt5kVCzJSCfm4z2C7v0uQ==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.7.1",
+        "@opentelemetry/resources": "2.7.1"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.9.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-metrics/node_modules/@opentelemetry/core": {
+      "version": "2.7.1",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-2.7.1.tgz",
+      "integrity": "sha512-QAqIj32AtK6+pEVNG7EOVxHdE06RP+FM5qpiEJ4RtDcFIqKUZHYhl7/7UY5efhwmwNAg7j8QbJVBLxMerc0+gw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-trace-base": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.1.0.tgz",
+      "integrity": "sha512-uTX9FBlVQm4S2gVQO1sb5qyBLq/FPjbp+tmGoxu4tIgtYGmBYB44+KX/725RFDe30yBSaA9Ml9fqphe1hbUyLQ==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/resources": "2.1.0",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-trace-base/node_modules/@opentelemetry/resources": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-2.1.0.tgz",
+      "integrity": "sha512-1CJjf3LCvoefUOgegxi8h6r4B/wLSzInyhGP2UmIBYNlo4Qk5CZ73e1eEyWmfXvFtm1ybkmfb2DqWvspsYLrWw==",
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "2.1.0",
+        "@opentelemetry/semantic-conventions": "^1.29.0"
+      },
+      "engines": {
+        "node": "^18.19.0 || >=20.6.0"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/semantic-conventions": {
+      "version": "1.41.1",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-1.41.1.tgz",
+      "integrity": "sha512-/UhIkaZgPutTFmQ7RnIJGgDXZmtEJ7Dvi86xNTFWcnRxVRNk/aotsqDJYeEvDP+FSMB2SdW+pQzNMcWP0rwuNA==",
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=14"
+      }
+    },
     "node_modules/@posthog/core": {
       "version": "1.10.0",
       "resolved": "https://registry.npmjs.org/@posthog/core/-/core-1.10.0.tgz",
@@ -507,6 +804,70 @@
         "cross-spawn": "^7.0.6"
       }
     },
+    "node_modules/@protobufjs/aspromise": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
+      "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/base64": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz",
+      "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/codegen": {
+      "version": "2.0.5",
+      "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.5.tgz",
+      "integrity": "sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/eventemitter": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz",
+      "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/fetch": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz",
+      "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==",
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@protobufjs/aspromise": "^1.1.1",
+        "@protobufjs/inquire": "^1.1.0"
+      }
+    },
+    "node_modules/@protobufjs/float": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz",
+      "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/inquire": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.1.tgz",
+      "integrity": "sha512-mnzgDV26ueAvk7rsbt9L7bE0SuAoqyuys/sMMrmVcN5x9VsxpcG3rqAUSgDyLp0UZlmNfIbQ4fHfCtreVBk8Ew==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/path": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz",
+      "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/pool": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz",
+      "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==",
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/utf8": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.1.tgz",
+      "integrity": "sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==",
+      "license": "BSD-3-Clause"
+    },
     "node_modules/@puppeteer/browsers": {
       "version": "2.13.0",
       "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.13.0.tgz",
@@ -1460,6 +1821,12 @@
         "url": "https://github.com/sponsors/sindresorhus"
       }
     },
+    "node_modules/long": {
+      "version": "5.3.2",
+      "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
+      "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==",
+      "license": "Apache-2.0"
+    },
     "node_modules/lru-cache": {
       "version": "7.18.3",
       "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz",
@@ -1771,6 +2138,30 @@
         "node": ">=0.4.0"
       }
     },
+    "node_modules/protobufjs": {
+      "version": "7.5.8",
+      "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.8.tgz",
+      "integrity": "sha512-dvpCIeLPbXZS/Ete7yLaO7RenOdken2NHKykBXbsaGxZT0UTltcarBciw+A78SRQs9iMAAVpsYA+l8b1hTePIA==",
+      "hasInstallScript": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@protobufjs/aspromise": "^1.1.2",
+        "@protobufjs/base64": "^1.1.2",
+        "@protobufjs/codegen": "^2.0.5",
+        "@protobufjs/eventemitter": "^1.1.0",
+        "@protobufjs/fetch": "^1.1.0",
+        "@protobufjs/float": "^1.0.2",
+        "@protobufjs/inquire": "^1.1.1",
+        "@protobufjs/path": "^1.1.2",
+        "@protobufjs/pool": "^1.1.0",
+        "@protobufjs/utf8": "^1.1.1",
+        "@types/node": ">=13.7.0",
+        "long": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=12.0.0"
+      }
+    },
     "node_modules/proxy-agent": {
       "version": "6.5.0",
       "resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz",
diff --git a/package.json b/package.json
index bd5b57c..767851a 100644
--- a/package.json
+++ b/package.json
@@ -37,6 +37,10 @@
   },
   "dependencies": {
     "@anthropic-ai/tokenizer": "^0.0.4",
+    "@opentelemetry/api": "^1.9.0",
+    "@opentelemetry/exporter-metrics-otlp-http": "^0.205.0",
+    "@opentelemetry/resources": "^2.0.0",
+    "@opentelemetry/sdk-metrics": "^2.0.0",
     "ws": "^8.19.0"
   },
   "devDependencies": {
diff --git a/server/cli.js b/server/cli.js
new file mode 100644
index 0000000..0124a36
--- /dev/null
+++ b/server/cli.js
@@ -0,0 +1,63 @@
+'use strict';
+
+// CLI argv parsing for ccxray. Splits flag detection from server/index.js so
+// new subcommands can be added without growing the entry-point file. Mutates
+// the argv array it is given (process.argv by default) in place to strip
+// consumed flags (existing behaviour).
+
+const providers = require('./providers');
+
+function parseArgs(argv = process.argv, env = process.env) {
+  const portIdx = argv.indexOf('--port');
+  let explicitPort = false;
+  let port = null;
+  if (portIdx !== -1) {
+    const portVal = argv[portIdx + 1];
+    const parsed = parseInt(portVal, 10);
+    if (!portVal || isNaN(parsed) || parsed < 1 || parsed > 65535) {
+      console.error('\x1b[31mError: --port requires a valid port number (1-65535)\x1b[0m');
+      process.exit(1);
+    }
+    port = parsed;
+    explicitPort = true;
+    argv.splice(portIdx, 2);
+  }
+
+  const hubMode = argv.includes('--hub-mode');
+  if (hubMode) argv.splice(argv.indexOf('--hub-mode'), 1);
+
+  const allowUpstreamLoop = argv.includes('--allow-upstream-loop') || env.CCXRAY_ALLOW_UPSTREAM_LOOP === '1';
+  if (argv.includes('--allow-upstream-loop')) argv.splice(argv.indexOf('--allow-upstream-loop'), 1);
+
+  const noBrowser = argv.includes('--no-browser');
+  if (noBrowser) argv.splice(argv.indexOf('--no-browser'), 1);
+
+  const cliCommand = argv[2];
+  const unknownCommand = cliCommand
+    && cliCommand !== 'status'
+    && !cliCommand.startsWith('-')
+    && !providers.isAgentProvider(cliCommand);
+  if (unknownCommand) {
+    console.error(`\x1b[31mError: unsupported provider "${cliCommand}". Supported providers: ${providers.supportedProviderList()}\x1b[0m`);
+    process.exit(1);
+  }
+
+  const agentCommand = providers.isAgentProvider(cliCommand) ? cliCommand : null;
+  const agentMode = Boolean(agentCommand);
+  const agentArgs = agentMode ? argv.slice(3) : [];
+  const displayName = providers.getDisplayName(agentCommand, env);
+
+  return {
+    port,
+    explicitPort,
+    hubMode,
+    allowUpstreamLoop,
+    noBrowser,
+    cliCommand,
+    agentCommand,
+    agentMode,
+    agentArgs,
+    displayName,
+  };
+}
+
+module.exports = { parseArgs };
diff --git a/server/config-loader.js b/server/config-loader.js
new file mode 100644
index 0000000..e108ac0
--- /dev/null
+++ b/server/config-loader.js
@@ -0,0 +1,64 @@
+'use strict';
+
+// Minimal config loader for the OTel rollout (Phase 2a slice).
+// This intentionally implements only the surface needed for the first
+// vertical slice: read .ccxray.json from cwd if present, return a default
+// shape otherwise. Env interpolation, literal-secret detection, gitignore
+// auto-amend, personal config (.ccxray.user.json), and walk-up-to-git-root
+// lookup all land in later Phase 2 sub-phases per the OpenSpec change.
+
+const fs = require('fs');
+const path = require('path');
+
+const DEFAULT_CONFIG = Object.freeze({
+  otel: Object.freeze({
+    enabled: false,
+    tier: 0,
+    endpoint: null,
+    headers: Object.freeze({}),
+    resource_attributes: Object.freeze({}),
+    cardinality_overrides: Object.freeze({}),
+  }),
+});
+
+function projectConfigPath(cwd) {
+  return path.join(cwd || process.cwd(), '.ccxray.json');
+}
+
+function readProjectConfig(cwd) {
+  const file = projectConfigPath(cwd);
+  let raw;
+  try {
+    raw = fs.readFileSync(file, 'utf8');
+  } catch (err) {
+    if (err.code === 'ENOENT') return { config: DEFAULT_CONFIG, source: null };
+    throw new Error(`config-loader: failed to read ${file}: ${err.message}`);
+  }
+  let parsed;
+  try {
+    parsed = JSON.parse(raw);
+  } catch (err) {
+    throw new Error(`config-loader: ${file} is not valid JSON (${err.message})`);
+  }
+  return { config: mergeWithDefaults(parsed), source: file };
+}
+
+function mergeWithDefaults(input) {
+  const otel = input && typeof input.otel === 'object' && input.otel ? input.otel : {};
+  return {
+    otel: {
+      enabled: otel.enabled === true,
+      tier: Number.isInteger(otel.tier) ? otel.tier : 0,
+      endpoint: typeof otel.endpoint === 'string' ? otel.endpoint : null,
+      headers: otel.headers && typeof otel.headers === 'object' ? { ...otel.headers } : {},
+      resource_attributes: otel.resource_attributes && typeof otel.resource_attributes === 'object'
+        ? { ...otel.resource_attributes }
+        : {},
+      cardinality_overrides: otel.cardinality_overrides && typeof otel.cardinality_overrides === 'object'
+        ? { ...otel.cardinality_overrides }
+        : {},
+    },
+  };
+}
+
+module.exports = { readProjectConfig, projectConfigPath, DEFAULT_CONFIG };
diff --git a/server/emit.js b/server/emit.js
new file mode 100644
index 0000000..050a239
--- /dev/null
+++ b/server/emit.js
@@ -0,0 +1,40 @@
+'use strict';
+
+// Internal event bus for OTel handlers, parser sentinels, and future status hooks.
+//
+// Phase D (OTel SDK init) registers subscribers; Phase E wires emit() calls in
+// forward.js / store.js. With no subscribers, emit() is an O(1) no-op — tier 0
+// pays zero cost.
+//
+// Handlers run synchronously and MUST NOT throw into the proxy code path; this
+// module wraps every dispatch in try/catch so a buggy subscriber cannot break
+// request forwarding.
+//
+// Defined events (payload shape stable across Phase 1):
+//   entry_completed   { entry }
+//   session_started   { sessionId, provider, inferred }
+//   parser_unknown    { provider, kind, token }
+//   parser_mismatch   { type, expected, got, entryId? }
+//   parser_error      { parser, errorType, message }
+
+const subscribers = new Map();
+
+// Subscribe `handler` to `event`.
+// Throws TypeError when `handler` is not a function. Returns a function that
+// removes this subscription again.
+function on(event, handler) {
+  if (typeof handler !== 'function') throw new TypeError('handler must be a function');
+  let handlers = subscribers.get(event);
+  if (handlers === undefined) {
+    handlers = new Set();
+    subscribers.set(event, handlers);
+  }
+  handlers.add(handler);
+  return () => {
+    const current = subscribers.get(event);
+    return current ? current.delete(handler) : undefined;
+  };
+}
+
+// Deliver `payload` to every handler subscribed to `event`.
+// Returns immediately when nobody is subscribed. A handler failure is logged
+// and never propagates to the caller; this covers both a synchronous throw
+// and a rejected promise returned by an async handler.
+function emit(event, payload) {
+  const set = subscribers.get(event);
+  if (!set || set.size === 0) return;
+  const report = (err) => {
+    try { console.error(`[emit] handler "${event}":`, err && err.message); } catch {}
+  };
+  for (const handler of set) {
+    try {
+      const result = handler(payload);
+      // An async handler cannot throw synchronously; its failure surfaces as a
+      // rejected promise. Observe it here so it does not become an unhandled
+      // rejection in the proxy process.
+      if (result && typeof result.then === 'function') {
+        result.then(undefined, report);
+      }
+    } catch (err) {
+      report(err);
+    }
+  }
+}
+
+module.exports = { on, emit };
diff --git a/server/forward.js b/server/forward.js
index 71a7360..23d3852 100644
--- a/server/forward.js
+++ b/server/forward.js
@@ -10,6 +10,7 @@ const helpers = require('./helpers');
 const { broadcast, broadcastSessionStatus, broadcastSessionTitleUpdate } = require('./sse-broadcast');
 const { appendSample, collectRatelimitHeaders } = require('./ratelimit-log');
 const hub = require('./hub');
+const emit = require('./emit');
 
 // For title-generator subagent responses, extract the clean title from the
 // JSON payload and (when attribution succeeds) stamp it onto the parent
@@ -80,8 +81,15 @@ function createTunnelAgent(proxyUrl) {
       }
       const tlsOpts = { socket, servername: options.servername || options.host };
       if (options.rejectUnauthorized !== undefined) tlsOpts.rejectUnauthorized = options.rejectUnauthorized;
-      const tlsSocket = tls.connect(tlsOpts, () => callback(null, tlsSocket));
-      tlsSocket.on('error', callback);
+      let connected = false;
+      const tlsSocket = tls.connect(tlsOpts, () => {
+        connected = true;
+        callback(null, tlsSocket);
+      });
+      tlsSocket.on('error', (err) => {
+        if (!connected) return callback(err);
+        console.error(`\x1b[31m❌ TUNNEL SOCKET ERROR: ${err.code || err.message}\x1b[0m`);
+      });
     });
 
     connectReq.on('error', callback);
@@ -377,6 +385,15 @@ function forwardRequest(ctx) {
     clientRes.end(JSON.stringify({ error: 'proxy_error', message: err.message }));
   });
 
+  // Late socket errors (EPIPE / ECONNRESET after the response has been received)
+  // are emitted on the underlying TLS/TCP socket and may not re-emit on the
+  // ClientRequest. Without a listener they crash the entire proxy process.
+  proxyReq.on('socket', (socket) => {
+    socket.on('error', (err) => {
+      console.error(`\x1b[31m❌ UPSTREAM SOCKET ERROR: ${err.code || err.message}\x1b[0m`);
+    });
+  });
+
   proxyReq.end(bodyToSend);
 }
 
@@ -599,6 +616,7 @@ function handleSSEResponse(ctx, proxyRes, clientRes) {
     store.trimEntries();
     store.propagateLoadedSkills(entry, sessionId);
     broadcast(entry);
+    emit.emit('entry_completed', { entry });
 
     // Persist to index (fire-and-forget after broadcast)
     const indexLine = JSON.stringify({
@@ -730,6 +748,7 @@ function handleOpenAISSE(ctx, proxyRes, clientRes) {
     store.entries.push(entry);
     store.trimEntries();
     broadcast(entry);
+    emit.emit('entry_completed', { entry });
 
     const indexLine = JSON.stringify({
       id, ts: ctx.ts, sessionId: reqSessionId,
@@ -875,6 +894,7 @@ function handleNonSSEResponse(ctx, proxyRes, clientRes) {
     store.trimEntries();
     store.propagateLoadedSkills(entry, sessionId);
     broadcast(entry);
+    emit.emit('entry_completed', { entry });
 
     const indexLine = JSON.stringify({
       id, ts: ctx.ts, sessionId,
diff --git a/server/index.js b/server/index.js
index c47ce40..2abc32a 100755
--- a/server/index.js
+++ b/server/index.js
@@ -18,40 +18,20 @@ const { authMiddleware } = require('./auth');
 const { extractAgentType, extractPromptAgentType, splitB2IntoBlocks } = require('./system-prompt');
 const { findSharedPrefix } = require('./delta-helpers');
 const providers = require('./providers');
-
-// ── CLI: parse flags and detect provider launchers ──
-const portIdx = process.argv.indexOf('--port');
-let explicitPort = false;
-if (portIdx !== -1) {
-  const portVal = process.argv[portIdx + 1];
-  const parsed = parseInt(portVal, 10);
-  if (!portVal || isNaN(parsed) || parsed < 1 || parsed > 65535) {
-    console.error('\x1b[31mError: --port requires a valid port number (1-65535)\x1b[0m');
-    process.exit(1);
-  }
-  config.PORT = parsed;
-  explicitPort = true;
-  process.argv.splice(portIdx, 2);
-}
-const hubMode = process.argv.includes('--hub-mode');
-if (hubMode) process.argv.splice(process.argv.indexOf('--hub-mode'), 1);
-const allowUpstreamLoop = process.argv.includes('--allow-upstream-loop') || process.env.CCXRAY_ALLOW_UPSTREAM_LOOP === '1';
-if (process.argv.includes('--allow-upstream-loop')) process.argv.splice(process.argv.indexOf('--allow-upstream-loop'), 1);
-const noBrowser = process.argv.includes('--no-browser');
-if (noBrowser) process.argv.splice(process.argv.indexOf('--no-browser'), 1);
-const cliCommand = process.argv[2];
-const unknownCommand = cliCommand
-  && cliCommand !== 'status'
-  && !cliCommand.startsWith('-')
-  && !providers.isAgentProvider(cliCommand);
-if (unknownCommand) {
-  console.error(`\x1b[31mError: unsupported provider "${cliCommand}". Supported providers: ${providers.supportedProviderList()}\x1b[0m`);
-  process.exit(1);
-}
-const agentCommand = providers.isAgentProvider(cliCommand) ? cliCommand : null;
-const agentMode = Boolean(agentCommand);
-const agentArgs = agentMode ? process.argv.slice(3) : [];
-const DISPLAY_NAME = providers.getDisplayName(agentCommand, process.env);
+const { parseArgs } = require('./cli');
+
+const {
+  port: cliPort,
+  explicitPort,
+  hubMode,
+  allowUpstreamLoop,
+  noBrowser,
+  agentCommand,
+  agentMode,
+  agentArgs,
+  displayName: DISPLAY_NAME,
+} = parseArgs();
+if (cliPort != null) config.PORT = cliPort;
 
 // In agent/hub mode, mute startup logs so they don't pollute output.
 const _origLog = console.log;
@@ -775,7 +755,6 @@ async function startServer() {
     if (acquired) hub.releaseForkLock();
     console.error(`\x1b[31m${err.message}\x1b[0m`);
     // Show last hub log lines so user doesn't have to open the file
-    const fs = require('fs');
     try {
       const log = fs.readFileSync(hub.HUB_LOG_PATH, 'utf8');
       const lines = log.trim().split('\n');
diff --git a/server/otel-health.js b/server/otel-health.js
new file mode 100644
index 0000000..f833696
--- /dev/null
+++ b/server/otel-health.js
@@ -0,0 +1,60 @@
+'use strict';
+
+// OTel export health state machine. Phase 2b: state shell only.
+// Bounded export queue (3.2), circuit breaker (3.3), log rotation (3.4),
+// and shutdown cap (3.5) land in later sub-phases of the OpenSpec change.
+//
+// States:
+//   disabled      — OTel never initialized (tier 0 or packages missing-and-tolerated)
+//   active        — SDK initialized, exports presumed working
+//   degraded      — SDK init failed or runtime non-recoverable; proxy continues
+//   circuit_open  — runtime export failures tripped the breaker; periodic half-open retry
+//
+// Only documented APIs may mutate state. Invalid transitions throw so bugs
+// surface in tests rather than silently corrupt observability.
+
+const STATES = Object.freeze(['disabled', 'active', 'degraded', 'circuit_open']);
+
+// Allowed next states, keyed by the current state. Every state may move to
+// any other state except that `disabled` cannot go straight to `circuit_open`.
+// Object.freeze is shallow: the table's keys are fixed, the Sets themselves
+// are not frozen.
+const VALID_TRANSITIONS = Object.freeze({
+  disabled: new Set(['active', 'degraded']),
+  active: new Set(['degraded', 'circuit_open', 'disabled']),
+  degraded: new Set(['active', 'circuit_open', 'disabled']),
+  circuit_open: new Set(['active', 'degraded', 'disabled']),
+});
+
+let currentState = 'disabled';
+let lastTransitionAt = Date.now();
+let lastReason = null;
+
+// Current health state; always one of the STATES entries.
+function getState() {
+  return currentState;
+}
+
+// Snapshot of the state machine: the current state, the Date.now() timestamp
+// of the latest transition (or reset / module load), and the stored reason.
+function getStatus() {
+  const snapshot = {};
+  snapshot.state = currentState;
+  snapshot.lastTransitionAt = lastTransitionAt;
+  snapshot.reason = lastReason;
+  return snapshot;
+}
+
+// Move the state machine to `to`.
+//   to      target state; must be listed in STATES
+//   reason  optional text, stored only when entering `degraded` or
+//           `circuit_open`; every other target clears the stored reason
+// Returns false when already in `to` (nothing is updated), true after a
+// successful change. Throws for an unknown state or a transition that
+// VALID_TRANSITIONS does not allow.
+function transition(to, { reason } = {}) {
+  if (STATES.indexOf(to) === -1) {
+    throw new Error(`otel-health: unknown state "${to}"`);
+  }
+  if (to === currentState) return false;
+
+  if (!VALID_TRANSITIONS[currentState].has(to)) {
+    throw new Error(`otel-health: invalid transition ${currentState} → ${to}`);
+  }
+
+  const isErrorState = to === 'degraded' || to === 'circuit_open';
+  currentState = to;
+  lastTransitionAt = Date.now();
+  lastReason = isErrorState && reason ? reason : null;
+  return true;
+}
+
+// Test hook: force the machine back to `disabled` without consulting the
+// transition table, clear the reason, and restamp the timestamp.
+function _resetForTests() {
+  [currentState, lastReason] = ['disabled', null];
+  lastTransitionAt = Date.now();
+}
+
+module.exports = { STATES, getState, getStatus, transition, _resetForTests };
diff --git a/server/otel-lazy.js b/server/otel-lazy.js
new file mode 100644
index 0000000..d796e68
--- /dev/null
+++ b/server/otel-lazy.js
@@ -0,0 +1,35 @@
+'use strict';
+
+// Lazy require for OpenTelemetry packages.
+// Phase 1 of the OTel rollout: ccxray must run at tier 0 even when the
+// @opentelemetry/* packages are absent (e.g. user installed via a minimal
+// distribution). Callers ask for a package by name; we return null if it
+// cannot be resolved instead of throwing.
+
+// Allow-list of package names tryRequire() will load; any other name is
+// rejected with an Error. isAvailable() reports true only when every entry
+// resolves.
+const KNOWN_PACKAGES = new Set([
+  '@opentelemetry/api',
+  '@opentelemetry/resources',
+  '@opentelemetry/sdk-metrics',
+  '@opentelemetry/exporter-metrics-otlp-http',
+]);
+
+// Load one allow-listed package.
+// Returns the module, or null when require() fails with MODULE_NOT_FOUND.
+// Throws for a name outside KNOWN_PACKAGES and rethrows any other load error.
+function tryRequire(name) {
+  if (!KNOWN_PACKAGES.has(name)) {
+    throw new Error(`otel-lazy: unknown package "${name}"`);
+  }
+  let loaded;
+  try {
+    loaded = require(name);
+  } catch (err) {
+    const missing = Boolean(err) && err.code === 'MODULE_NOT_FOUND';
+    if (!missing) throw err;
+    loaded = null;
+  }
+  return loaded;
+}
+
+// True only when every package in KNOWN_PACKAGES loads; stops at the first
+// one that does not.
+function isAvailable() {
+  return Array.from(KNOWN_PACKAGES).every((name) => tryRequire(name) != null);
+}
+
+module.exports = { tryRequire, isAvailable, KNOWN_PACKAGES };
diff --git a/server/otel.js b/server/otel.js
new file mode 100644
index 0000000..653a787
--- /dev/null
+++ b/server/otel.js
@@ -0,0 +1,199 @@
+'use strict';
+
+// OTel SDK init + emit.js subscribers.
+//
+// Vertical-slice scope (Phase 1, first cut): tier 0 = full no-op. tier ≥ 1 +
+// packages present + endpoint configured → real MeterProvider with OTLP HTTP
+// exporter and the first metric family (token usage). tier ≥ 1 with packages
+// present but no endpoint → active state with no exporter (useful for staging
+// the wiring before pointing at a collector).
+//
+// Metrics registered in this slice (aligned with otel-export/spec.md):
+//   ccxray.tokens.input_total          (counter, unit=tokens)
+//   ccxray.tokens.output_total         (counter, unit=tokens)
+//   ccxray.tokens.cache_read_total     (counter, unit=tokens)
+//   ccxray.tokens.cache_creation_total (counter, unit=tokens)
+// Each is recorded with { provider, model } attributes. Cardinality budgets,
+// View API allow-lists, sentinel metrics, and the full cost/usage/quality
+// families land in later slices (§4.2–§4.9 of the OpenSpec change).
+//
+// Resource attribute `ccxray.source=ccxray-proxy` is always set so downstream
+// consumers can distinguish ccxray-emitted metrics from `claude_code.*` CLI
+// metrics that the user may also be exporting.
+//
+// shutdown() returns synchronously to disabled state (so existing callers
+// don't need to await) and fires the SDK provider.shutdown() in the
+// background with a 2-second hard cap — never blocks process exit.
+//
+// init() never throws into the caller; any failure transitions to degraded
+// with a reason and the proxy continues running.
+
+const emit = require('./emit');
+const defaultOtelLazy = require('./otel-lazy');
+const health = require('./otel-health');
+
+let initialized = false;
+let unsubscribers = [];
+let sdkContext = null; // { provider, reader, instruments } | null
+
+// Initialise OTel and return the resulting health state.
+//
+//   config  project config; only `config.otel` is read
+//   deps    optional overrides; `deps.otelLazy` replaces the package loader
+//
+// Repeat calls are no-ops until shutdown() / _resetForTests() re-arm it.
+// Tier ≤ 0 (or a non-integer tier) leaves the state untouched. Missing
+// packages or any failure while loading / wiring the SDK end in `degraded`
+// with a reason instead of an exception reaching the caller.
+function init(config, deps = {}) {
+  if (initialized) return health.getState();
+  initialized = true;
+
+  const otelConfig = (config && config.otel) || {};
+  const tier = Number.isInteger(otelConfig.tier) ? otelConfig.tier : 0;
+  if (tier <= 0) {
+    return health.getState();
+  }
+
+  // The default value only covers `undefined`; tolerate an explicit null too.
+  const otelLazy = (deps && deps.otelLazy) || defaultOtelLazy;
+
+  try {
+    // The availability probe stays inside the try: the loader rethrows every
+    // load error other than MODULE_NOT_FOUND (for example a package that
+    // throws while being evaluated).
+    if (!otelLazy.isAvailable()) {
+      health.transition('degraded', { reason: 'opentelemetry packages not installed' });
+      return health.getState();
+    }
+    if (otelConfig.endpoint) {
+      sdkContext = initSdk(config, otelLazy);
+    }
+    registerHandlers();
+    health.transition('active');
+  } catch (err) {
+    // Undo partial wiring so a degraded instance keeps no live subscribers.
+    for (const off of unsubscribers) {
+      try { off(); } catch { /* ignore */ }
+    }
+    unsubscribers = [];
+    sdkContext = null;
+    health.transition('degraded', { reason: `SDK init failed: ${err && err.message || err}` });
+  }
+  return health.getState();
+}
+
+// Build the metrics pipeline: OTLP HTTP exporter → periodic reader →
+// MeterProvider, plus the four token counters.
+//
+//   config    project config; reads otel.endpoint, otel.headers and
+//             otel.resource_attributes
+//   otelLazy  package loader exposing tryRequire(name)
+//
+// Returns { provider, reader, instruments }. Throws when any of the three
+// packages resolves to a falsy value; errors raised by the SDK constructors
+// propagate as well (init() calls this inside its try/catch).
+function initSdk(config, otelLazy) {
+  const sdk = otelLazy.tryRequire('@opentelemetry/sdk-metrics');
+  const exp = otelLazy.tryRequire('@opentelemetry/exporter-metrics-otlp-http');
+  const res = otelLazy.tryRequire('@opentelemetry/resources');
+  if (!sdk || !exp || !res) {
+    throw new Error('required OTel package failed to resolve');
+  }
+
+  const exporter = new exp.OTLPMetricExporter({
+    url: config.otel.endpoint,
+    headers: config.otel.headers || {},
+  });
+
+  // Default 60s export interval, overridable for tests via env var.
+  // An unset, non-numeric, or zero value falls back to 60000 through `||`.
+  const intervalMs = Number(process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS) || 60000;
+  const reader = new sdk.PeriodicExportingMetricReader({
+    exporter,
+    exportIntervalMillis: intervalMs,
+  });
+
+  // NOTE(review): resource_attributes is spread after the default, so a
+  // `ccxray.source` key supplied in config replaces 'ccxray-proxy' — confirm
+  // that override is intended.
+  const resource = res.resourceFromAttributes({
+    'ccxray.source': 'ccxray-proxy',
+    ...(config.otel.resource_attributes || {}),
+  });
+
+  const provider = new sdk.MeterProvider({ resource, readers: [reader] });
+
+  const meter = provider.getMeter('ccxray', '1');
+  const instruments = {
+    inputTokens: meter.createCounter('ccxray.tokens.input_total', {
+      description: 'Input tokens per completed entry',
+      unit: 'tokens',
+    }),
+    outputTokens: meter.createCounter('ccxray.tokens.output_total', {
+      description: 'Output tokens per completed entry',
+      unit: 'tokens',
+    }),
+    cacheReadTokens: meter.createCounter('ccxray.tokens.cache_read_total', {
+      description: 'Cache-read input tokens per completed entry',
+      unit: 'tokens',
+    }),
+    cacheCreationTokens: meter.createCounter('ccxray.tokens.cache_creation_total', {
+      description: 'Cache-creation input tokens per completed entry',
+      unit: 'tokens',
+    }),
+  };
+
+  return { provider, reader, instruments };
+}
+
+// Subscribe to the internal event bus and remember each unsubscribe function
+// so shutdown() can detach them again.
+function registerHandlers() {
+  unsubscribers.push(emit.on('entry_completed', onEntryCompleted));
+  // Other event types land as later slices wire them up; until then each one
+  // gets its own no-op subscriber.
+  const stubbedEvents = ['session_started', 'parser_unknown', 'parser_mismatch', 'parser_error'];
+  for (const eventName of stubbedEvents) {
+    unsubscribers.push(emit.on(eventName, () => { /* tier ≥ 1 stub */ }));
+  }
+}
+
+// `entry_completed` subscriber: add the entry's token usage to the four
+// counters, tagged with { provider, model }.
+// Does nothing without an SDK context or when the payload carries no
+// `entry.usage`. Missing or non-numeric token fields count as 0.
+function onEntryCompleted(payload) {
+  if (!sdkContext) return;
+  const entry = payload && payload.entry;
+  const usage = entry && entry.usage;
+  if (!usage) return;
+
+  const attrs = {
+    provider: entry.provider || 'unknown',
+    model: entry.model || 'unknown',
+  };
+
+  // Convert every amount first, then record, in the same counter order.
+  const amounts = [
+    usage.input_tokens,
+    usage.output_tokens,
+    usage.cache_read_input_tokens,
+    usage.cache_creation_input_tokens,
+  ].map((value) => Number(value) || 0);
+
+  const counterKeys = ['inputTokens', 'outputTokens', 'cacheReadTokens', 'cacheCreationTokens'];
+  counterKeys.forEach((key, index) => {
+    sdkContext.instruments[key].add(amounts[index], attrs);
+  });
+}
+
+// Tear down OTel: detach bus subscribers, drop the SDK context, return the
+// health state to `disabled`, and re-arm init().
+//
+// Returns a Promise but is safe to ignore. Everything above the first await
+// runs synchronously, so callers that do not await still observe
+// `health.getState() === 'disabled'` right after the call. The provider's own
+// shutdown() is then awaited for at most 2 seconds; its errors are swallowed.
+async function shutdown() {
+  for (const off of unsubscribers) {
+    try { off(); } catch { /* ignore */ }
+  }
+  unsubscribers = [];
+
+  const ctx = sdkContext;
+  sdkContext = null;
+
+  if (health.getState() !== 'disabled') {
+    health.transition('disabled');
+  }
+  initialized = false;
+
+  if (ctx && ctx.provider && typeof ctx.provider.shutdown === 'function') {
+    let capTimer;
+    try {
+      await Promise.race([
+        ctx.provider.shutdown(),
+        new Promise((resolve) => { capTimer = setTimeout(resolve, 2000); }),
+      ]);
+    } catch {
+      /* never block process exit on shutdown errors */
+    } finally {
+      // Cancel the cap once the race settles; a pending timer would otherwise
+      // keep the event loop alive for the rest of the 2 seconds even though
+      // the provider has already finished.
+      clearTimeout(capTimer);
+    }
+  }
+}
+
+// Drain the reader on demand (used by tests, and intended for a future
+// `ccxray status --otel` command). Resolves whether or not the flush
+// succeeds, and is a no-op when no provider exists.
+async function flush() {
+  const provider = sdkContext && sdkContext.provider;
+  if (!provider) return;
+  try {
+    await provider.forceFlush();
+  } catch { /* ignore */ }
+}
+
+// Test hook: synchronously discard subscribers, SDK context and health state
+// for tests that do not await shutdown(). The provider is dropped without
+// being shut down.
+function _resetForTests() {
+  unsubscribers.forEach((off) => {
+    try { off(); } catch {}
+  });
+  unsubscribers = [];
+  sdkContext = null;
+  const needsTransition = health.getState() !== 'disabled';
+  if (needsTransition) health.transition('disabled');
+  initialized = false;
+  health._resetForTests();
+}
+
+module.exports = { init, shutdown, flush, _resetForTests };
diff --git a/test/config-loader.test.js b/test/config-loader.test.js
new file mode 100644
index 0000000..1d74e3a
--- /dev/null
+++ b/test/config-loader.test.js
@@ -0,0 +1,85 @@
+'use strict';
+
+const test = require('node:test');
+const assert = require('node:assert/strict');
+const fs = require('fs');
+const os = require('os');
+const path = require('path');
+
+const { readProjectConfig, DEFAULT_CONFIG } = require('../server/config-loader');
+const { tryRequire, isAvailable } = require('../server/otel-lazy');
+
+// Create a fresh temporary directory (prefix `ccxray-cfg-`) under the OS temp
+// dir and return its path. Callers remove it themselves.
+function mkTmp() {
+  const prefix = path.join(os.tmpdir(), 'ccxray-cfg-');
+  return fs.mkdtempSync(prefix);
+}
+
+test('config-loader: returns default config when .ccxray.json is absent', () => {
+  const dir = mkTmp();
+  try {
+    const { config, source } = readProjectConfig(dir);
+    assert.equal(source, null);
+    assert.deepEqual(config, DEFAULT_CONFIG);
+    assert.equal(config.otel.enabled, false);
+    assert.equal(config.otel.tier, 0);
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+test('config-loader: reads otel block from .ccxray.json', () => {
+  const dir = mkTmp();
+  try {
+    fs.writeFileSync(path.join(dir, '.ccxray.json'), JSON.stringify({
+      otel: {
+        enabled: true,
+        tier: 1,
+        endpoint: 'http://collector.local:4318',
+        headers: { 'x-team': 'platform' },
+        resource_attributes: { 'service.name': 'ccxray-proxy' },
+      },
+    }));
+    const { config, source } = readProjectConfig(dir);
+    assert.ok(source && source.endsWith('.ccxray.json'));
+    assert.equal(config.otel.enabled, true);
+    assert.equal(config.otel.tier, 1);
+    assert.equal(config.otel.endpoint, 'http://collector.local:4318');
+    assert.equal(config.otel.headers['x-team'], 'platform');
+    assert.equal(config.otel.resource_attributes['service.name'], 'ccxray-proxy');
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+test('config-loader: malformed JSON throws with a descriptive error', () => {
+  const dir = mkTmp();
+  try {
+    fs.writeFileSync(path.join(dir, '.ccxray.json'), '{ not valid json');
+    assert.throws(() => readProjectConfig(dir), /not valid JSON/);
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+test('config-loader: tier defaults to 0 when value is non-integer', () => {
+  const dir = mkTmp();
+  try {
+    fs.writeFileSync(path.join(dir, '.ccxray.json'), JSON.stringify({ otel: { tier: 'one' } }));
+    const { config } = readProjectConfig(dir);
+    assert.equal(config.otel.tier, 0);
+  } finally {
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+});
+
+test('otel-lazy: tryRequire returns the package object when installed', () => {
+  const api = tryRequire('@opentelemetry/api');
+  assert.ok(api && typeof api === 'object', 'expected @opentelemetry/api to resolve');
+});
+
+test('otel-lazy: tryRequire rejects unknown package names', () => {
+  assert.throws(() => tryRequire('@opentelemetry/not-real'), /unknown package/);
+});
+
+test('otel-lazy: isAvailable returns true once all known packages resolve', () => {
+  assert.equal(isAvailable(), true);
+});
diff --git a/test/otel-init.test.js b/test/otel-init.test.js
new file mode 100644
index 0000000..7f60877
--- /dev/null
+++ b/test/otel-init.test.js
@@ -0,0 +1,82 @@
+'use strict';
+
+const test = require('node:test');
+const assert = require('node:assert/strict');
+
+const emit = require('../server/emit');
+const otel = require('../server/otel');
+const health = require('../server/otel-health');
+
+test.beforeEach(() => otel._resetForTests());
+test.afterEach(() => otel._resetForTests());
+
+test('otel.init: tier 0 stays disabled and registers no subscribers', () => {
+  let entryCompletedFired = false;
+  const off = emit.on('entry_completed', () => { entryCompletedFired = true; });
+  try {
+    const state = otel.init({ otel: { tier: 0 } });
+    assert.equal(state, 'disabled');
+
+    // Only our test subscriber is attached; otel.init must not have added one.
+    emit.emit('entry_completed', { entry: { id: 'x' } });
+    assert.equal(entryCompletedFired, true, 'test subscriber should still fire');
+    assert.equal(health.getState(), 'disabled');
+  } finally {
+    off();
+  }
+});
+
+test('otel.init: tier ≥ 1 with packages present → active', () => {
+  const state = otel.init({ otel: { tier: 1 } });
+  assert.equal(state, 'active');
+  assert.equal(health.getState(), 'active');
+});
+
+test('otel.init: tier ≥ 1 with packages absent → degraded with reason', () => {
+  const fakeLazy = { isAvailable: () => false, tryRequire: () => null };
+  const state = otel.init({ otel: { tier: 1 } }, { otelLazy: fakeLazy });
+  assert.equal(state, 'degraded');
+  const status = health.getStatus();
+  assert.equal(status.state, 'degraded');
+  assert.match(status.reason || '', /not installed/i);
+});
+
+test('otel.init: idempotent — second call returns current state without crashing', () => {
+  const first = otel.init({ otel: { tier: 1 } });
+  const second = otel.init({ otel: { tier: 1 } });
+  assert.equal(first, 'active');
+  assert.equal(second, 'active');
+});
+
+test('otel.shutdown: returns state to disabled and unsubscribes', () => {
+  otel.init({ otel: { tier: 1 } });
+  assert.equal(health.getState(), 'active');
+
+  // Verify subscribers exist by spying on a known event — when we emit,
+  // the otel no-op handler fires but does not throw. The handler itself
+  // is a no-op, so we just confirm shutdown clears state without error.
+  otel.shutdown();
+  assert.equal(health.getState(), 'disabled');
+
+  // After shutdown, init can run again.
+  const reinit = otel.init({ otel: { tier: 1 } });
+  assert.equal(reinit, 'active');
+});
+
+test('otel-health: rejects unknown states', () => {
+  assert.throws(() => health.transition('flying'), /unknown state/);
+});
+
+test('otel-health: rejects invalid transitions', () => {
+  health._resetForTests();
+  // disabled → circuit_open is not in the allow-list
+  assert.throws(() => health.transition('circuit_open'), /invalid transition/);
+});
+
+test('otel-health: transition clears reason when leaving error states', () => {
+  health._resetForTests();
+  health.transition('degraded', { reason: 'boom' });
+  assert.equal(health.getStatus().reason, 'boom');
+  health.transition('active');
+  assert.equal(health.getStatus().reason, null);
+});
diff --git a/test/otel-vertical.test.js b/test/otel-vertical.test.js
new file mode 100644
index 0000000..1746dcc
--- /dev/null
+++ b/test/otel-vertical.test.js
@@ -0,0 +1,171 @@
+'use strict';
+
+// Vertical-slice integration: a real OTel MeterProvider posts to an in-process
+// mock OTLP HTTP collector. Proves the full chain — init → emit → record →
+// PeriodicExportingMetricReader → OTLPMetricExporter → HTTP — is wired.
+//
+// Body content (protobuf) is not decoded here. Asserting (1) at least one POST
+// arrived at `/v1/metrics`, (2) content-type is the OTLP HTTP signature, (3)
+// the body is non-empty is enough to demo the rail. Decoded-content assertions
+// land with §10.3 once a protobuf transformer is on the test path.
+
+const test = require('node:test');
+const assert = require('node:assert/strict');
+const http = require('node:http');
+
+const emit = require('../server/emit');
+const otel = require('../server/otel');
+const health = require('../server/otel-health');
+
+// Start an in-process HTTP server on an ephemeral 127.0.0.1 port that records
+// every request (method, url, content-type, body length) and answers 200.
+// Resolves to { url, requests, close } where `url` ends in /v1/metrics and
+// `close()` resolves once the server has closed.
+function startMockCollector() {
+  const requests = [];
+  const server = http.createServer((req, res) => {
+    const parts = [];
+    req.on('data', (chunk) => { parts.push(chunk); });
+    req.on('end', () => {
+      const body = Buffer.concat(parts);
+      requests.push({
+        method: req.method,
+        url: req.url,
+        contentType: req.headers['content-type'] || '',
+        contentLength: body.length,
+      });
+      res.writeHead(200, { 'Content-Type': 'application/x-protobuf' });
+      res.end();
+    });
+  });
+  const close = () => new Promise((done) => server.close(() => done()));
+  return new Promise((resolve) => {
+    server.listen(0, '127.0.0.1', () => {
+      const address = server.address();
+      const url = `http://127.0.0.1:${address.port}/v1/metrics`;
+      resolve({ url, requests, close });
+    });
+  });
+}
+
+test.beforeEach(() => otel._resetForTests());
+test.afterEach(async () => {
+  await otel.shutdown();
+});
+
+test('otel vertical slice: tier 1 + endpoint → exporter posts to collector', async () => {
+  const prevInterval = process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS;
+  // Long interval — we drain explicitly with flush() to avoid races.
+  process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS = '60000';
+  const collector = await startMockCollector();
+
+  try {
+    const state = otel.init({
+      otel: {
+        tier: 1,
+        endpoint: collector.url,
+        headers: {},
+        resource_attributes: { 'service.name': 'ccxray-test' },
+      },
+    });
+    assert.equal(state, 'active');
+    assert.equal(health.getState(), 'active');
+
+    emit.emit('entry_completed', {
+      entry: {
+        provider: 'anthropic',
+        model: 'claude-test-model',
+        usage: {
+          input_tokens: 100,
+          output_tokens: 50,
+          cache_read_input_tokens: 200,
+          cache_creation_input_tokens: 25,
+        },
+      },
+    });
+
+    await otel.flush();
+
+    // forceFlush triggers the exporter synchronously inside the reader. Give
+    // the HTTP request one tick to actually deliver to our server.
+    for (let i = 0; i < 50 && collector.requests.length === 0; i++) {
+      await new Promise((r) => setTimeout(r, 10));
+    }
+
+    assert.ok(collector.requests.length > 0, 'collector should have received at least one POST');
+    const first = collector.requests[0];
+    assert.equal(first.method, 'POST');
+    assert.equal(first.url, '/v1/metrics');
+    assert.match(first.contentType, /protobuf|json/);
+    assert.ok(first.contentLength > 0, 'collector POST body must be non-empty');
+  } finally {
+    await collector.close();
+    if (prevInterval === undefined) delete process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS;
+    else process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS = prevInterval;
+  }
+});
+
+test('otel vertical slice: tier 1 with no endpoint → active but no exporter', async () => {
+  const state = otel.init({ otel: { tier: 1 } });
+  assert.equal(state, 'active');
+
+  // No collector, no SDK context — emit must not throw, must not record.
+  emit.emit('entry_completed', {
+    entry: {
+      provider: 'anthropic',
+      model: 'claude-test-model',
+      usage: { input_tokens: 1, output_tokens: 1 },
+    },
+  });
+
+  await otel.flush(); // no-op, must not throw
+  assert.equal(health.getState(), 'active');
+});
+
+test('otel vertical slice: shutdown honors 2-second cap even when provider hangs', async () => {
+  const prevInterval = process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS;
+  process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS = '60000';
+
+  // Mock collector that hangs — never responds. Forces provider.shutdown() to
+  // block until the timeout race resolves.
+  const server = http.createServer((_req, _res) => { /* hang */ });
+  await new Promise((r) => server.listen(0, '127.0.0.1', r));
+  const { port } = server.address();
+  const url = `http://127.0.0.1:${port}/v1/metrics`;
+
+  try {
+    otel.init({ otel: { tier: 1, endpoint: url, headers: {} } });
+    emit.emit('entry_completed', {
+      entry: { provider: 'anthropic', model: 'm', usage: { input_tokens: 1, output_tokens: 1 } },
+    });
+
+    const t0 = Date.now();
+    await otel.shutdown();
+    const elapsed = Date.now() - t0;
+
+    // Hard cap is 2000ms; give 500ms scheduler slack.
+    assert.ok(elapsed < 2500, `shutdown took ${elapsed}ms, must respect 2s cap`);
+    assert.equal(health.getState(), 'disabled');
+  } finally {
+    // Forcibly close still-open sockets from the hung exporter request,
+    // otherwise server.close() waits for them to drain (~8s).
+    if (typeof server.closeAllConnections === 'function') server.closeAllConnections();
+    await new Promise((r) => server.close(() => r()));
+    if (prevInterval === undefined) delete process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS;
+    else process.env.CCXRAY_OTEL_EXPORT_INTERVAL_MS = prevInterval;
+  }
+});
+
+test('otel vertical slice: emit with no usage is a safe no-op', async () => {
+  const collector = await startMockCollector();
+  try {
+    otel.init({ otel: { tier: 1, endpoint: collector.url, headers: {} } });
+
+    // Entries without usage (e.g. proxy errors) must not break the handler.
+    emit.emit('entry_completed', { entry: { provider: 'anthropic', model: 'm' } });
+    emit.emit('entry_completed', { entry: null });
+    emit.emit('entry_completed', {});
+
+    await otel.flush();
+    assert.equal(health.getState(), 'active');
+  } finally {
+    await collector.close();
+  }
+});

面向	方案 A Metrics Only	方案 B + Synthetic Traces	方案 C 完整 payload	方案 D ★ Hybrid 反查
實作工時	1–2 天	3–5 天	1–2 週	2–3 天
使用者價值	中:cost / token 趨勢	高:turn timing 分析	看情境:power user	最高:聚合 + 細節都有
跟 CLI 內建 OTel 衝突	不衝突	span 重複	重複更嚴重	不衝突 (用 ccxray.* namespace)
Codex / Gemini 支援	是	是(唯一)	是(唯一)	是(唯一)
資料量 / 後端費用	很低	中	高,需取樣	低(~1KB/turn)
隱私風險	無	低	高	無(payload 不出機器)
取代 dashboard 的程度	完全不衝突	部分重疊	高度重疊	互補強化
需要使用者持續開 ccxray dashboard	不需要	不需要	不需要	反查時需要本地 log 還在
想 emit 的 metric	ccxray 現有來源	難度
`ccxray.tool.invocations_total`	response 內的 `tool_use` block(已經在解析)	低
`ccxray.mcp.invocations_total`	tool name 以 `mcp__<server>__<tool>` 為前綴(已有命名規則)	低
`ccxray.skill.activations_total`	system prompt 內的 skill 觸發 marker(`system-prompt.js` 已在解析)	中(需要確認 marker)
`ccxray.sessions_total`	`store.js` 的 session 推斷	低
`ccxray.tokens.* / cost.*`	`pricing.js` + response usage 欄位	低
依「使用者 / 團隊」拆分	需新增 `OTEL_RESOURCE_ATTRIBUTES=enduser.id=...` 設定指引	中(靠使用者設定環境變數)
輸入	結果
`"Authorization": "Bearer ${TOKEN}"` + `TOKEN=abc123`	✓ 載入,實際值 `Bearer abc123`
`"Authorization": "Bearer ${MISSING}"` + 未設 env	✗ Startup 失敗,訊息含 file path / line / 變數名 `MISSING`
`"Authorization": "Bearer abc123longtokenvalue..."`	✗ Schema 拒絕,建議改用 `${ENV_VAR}`
JSON syntax error	✗ Startup 失敗,訊息含 line / column
失敗類型	例子	處理
Config error	JSON syntax 錯誤、schema 違規、`${VAR}` 無法解析	啟動失敗(exit code != 0)
Init error	Endpoint URL 格式不合法	轉 degraded,ccxray 正常,status 顯示錯誤
Runtime error	Collector unreachable、auth fail、timeout	由 circuit breaker 處理,exponential backoff
家族	Metric	Attributes
Cost	`ccxray.tokens.input_total`	model, provider ^*
	`ccxray.tokens.output_total`	model, provider ^*
	`ccxray.tokens.cache_read_total`	model, provider ^*
	`ccxray.tokens.cache_creation_total`	model, provider ^*
	`ccxray.cost.usd_total`	model, provider ^*
	`ccxray.cache.hit_ratio`(gauge)	model, provider ^*
Usage	`ccxray.tool.invocations_total`	tool, provider
	`ccxray.mcp.invocations_total`	server, tool
	`ccxray.skill.activations_total`	skill, provider
	`ccxray.sessions_total`	provider
	`ccxray.agent_type.invocations_total`	type
Quality	`ccxray.errors_total`	type, provider
	`ccxray.stop_reason_total`	reason
	`ccxray.latency_ms`(histogram)	model, provider
	`ccxray.max_tokens_hit_total`	model
Patterns	`ccxray.context.utilization_pct`(histogram)
	`ccxray.auto_compact.triggered_total`
	`ccxray.subagent.invocations_total`
	`ccxray.tools_per_turn`(histogram)
Governance	`ccxray.permission_mode.usage_total`	mode
	`ccxray.dangerous_tool.invocations_total`	pattern
	`ccxray.file_writes_total`
	`ccxray.provider.distribution_total`	provider
Sentinels	`ccxray.metrics.overflow_total`	metric, attribute
	`ccxray.parser.unknown_tool_total`	provider
	`ccxray.parser.unknown_skill_marker_total`	provider
	`ccxray.parser.unknown_mcp_format_total`
	`ccxray.parser.fallback_used_total`	parser, reason
	`ccxray.parser.reconciliation_mismatch_total`	type
	`ccxray.parser.error_total`	parser, error_type
	`ccxray.otel.exports_dropped_total`	signal
	`ccxray.otel.state`(gauge)	state
CLI 對帳	`ccxray.reconciliation.token_diff_pct`(gauge)	model
Tier 觀測	`ccxray.otel.tier_distribution`	tier