Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<!-- Version can be overridden from the command line: -p:Version=0.3.1
AssemblyVersion and FileVersion are derived automatically by the SDK
(prerelease suffixes like -beta001 are stripped for assembly versions). -->
<Version>0.10.40</Version>
<Version>0.10.42</Version>
</PropertyGroup>

<!-- NuGet package metadata (shared across all packable projects) -->
Expand Down
1 change: 1 addition & 0 deletions src/RockBot.Agent/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -272,6 +272,7 @@ async Task<IChatClient> BuildClientForTierAsync(LlmTierConfig config, string tie
agent.WithFeedback();
agent.WithSkills();
agent.WithKnowledgeGraph();
agent.WithFailureClusterStore();
agent.WithDreaming();
agent.AddToolHandler();
agent.AddMcpToolProxy();
Expand Down
28 changes: 28 additions & 0 deletions src/RockBot.Host.Abstractions/ClusterKey.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
namespace RockBot.Host;

/// <summary>
/// Identifies a single tool-failure cluster by server, tool and error class.
/// The server and tool components are stored lowercased (invariant culture) so
/// that casing differences in MCP responses do not split one cluster into many;
/// the error class is stored exactly as supplied.
/// See <c>design/self-repair.md</c> Phase 5.
/// </summary>
/// <param name="Server">MCP server name (lowercased on construction).</param>
/// <param name="Tool">Tool name on that server (lowercased on construction).</param>
/// <param name="ErrorClass">Deterministic class of error — usually a missing field name extracted from the error string, or <c>"unknown"</c>.</param>
/// <exception cref="ArgumentNullException">Any component is <see langword="null"/>.</exception>
/// <exception cref="ArgumentException">Any component is empty or consists only of white space.</exception>
public sealed record ClusterKey(string Server, string Tool, string ErrorClass)
{
    // Initialisers run in declaration order, so Server is validated first,
    // then Tool, then ErrorClass.

    /// <summary>MCP server name, lowercased with the invariant culture.</summary>
    public string Server { get; } = RequireText(Server, nameof(Server)).ToLowerInvariant();

    /// <summary>Tool name, lowercased with the invariant culture.</summary>
    public string Tool { get; } = RequireText(Tool, nameof(Tool)).ToLowerInvariant();

    /// <summary>Error class, validated but otherwise stored unchanged (casing preserved).</summary>
    public string ErrorClass { get; } = RequireText(ErrorClass, nameof(ErrorClass));

    /// <summary>
    /// Returns <paramref name="text"/> unchanged after rejecting null, empty
    /// and white-space-only input.
    /// </summary>
    private static string RequireText(string text, string argumentName)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(text, argumentName);
        return text;
    }
}
21 changes: 21 additions & 0 deletions src/RockBot.Host.Abstractions/FailureCluster.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
namespace RockBot.Host;

/// <summary>
/// Aggregate state for a stream of post-recovery tool failures sharing the same
/// <see cref="ClusterKey"/>. Tracked in-process by <see cref="IFailureClusterStore"/>
/// and persisted to the PVC so cluster history survives agent restarts.
/// See <c>design/self-repair.md</c> Phase 5.
/// </summary>
/// <remarks>
/// This is a positional record, so the synthesized equality compares
/// <see cref="SessionIds"/> and <see cref="SampleErrorMessages"/> with the default
/// comparer for their interface types. Unless the concrete collection overrides
/// <c>Equals</c>, two clusters holding equal contents in different collection
/// instances therefore compare as not equal.
/// </remarks>
/// <param name="Key">Cluster identity (server, tool, error class).</param>
/// <param name="Count">Total number of failures recorded for this cluster.</param>
/// <param name="SessionIds">Distinct session ids that contributed at least one failure. Bounded by <see cref="FailureClusterOptions.MaxSessionIdsPerCluster"/>.</param>
/// <param name="FirstSeen">UTC timestamp of the first recorded failure.</param>
/// <param name="LastSeen">UTC timestamp of the most recent recorded failure.</param>
/// <param name="SampleErrorMessages">Most recent distinct error messages, oldest-first. Bounded by <see cref="FailureClusterOptions.MaxSampleMessages"/> with each entry truncated.</param>
public sealed record FailureCluster(
ClusterKey Key,
int Count,
IReadOnlySet<string> SessionIds,
DateTimeOffset FirstSeen,
DateTimeOffset LastSeen,
IReadOnlyList<string> SampleErrorMessages);
59 changes: 59 additions & 0 deletions src/RockBot.Host.Abstractions/FailureClusterOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
namespace RockBot.Host;

/// <summary>
/// Settings for the failure cluster store. A relative <see cref="BasePath"/>
/// is resolved under <see cref="AgentProfileOptions.BasePath"/>, in the same way
/// as <see cref="MemoryOptions"/>.
/// </summary>
public sealed class FailureClusterOptions
{
    // Default values, named so the property initialisers below read as intent.
    private const string DefaultBasePath = "telemetry";
    private const int DefaultFlushSeconds = 30;
    private const int DefaultMaxSampleMessages = 5;
    private const int DefaultMaxSampleMessageLength = 512;
    private const int DefaultMaxSessionIdsPerCluster = 64;
    private const int DefaultEscalationCountThreshold = 3;
    private const int DefaultEscalationSessionThreshold = 2;
    private const int DefaultEscalationWindowHours = 24;

    /// <summary>
    /// Directory that holds the cluster state files; <c>"telemetry"</c> by default.
    /// A relative value is resolved under the agent profile base path
    /// (<c>/data/agent/telemetry</c> in K8s).
    /// </summary>
    public string BasePath { get; set; } = DefaultBasePath;

    /// <summary>
    /// Interval between flushes of the in-memory cluster state to a snapshot
    /// file, at which point the JSONL log is truncated. 30 seconds by default.
    /// </summary>
    public TimeSpan FlushInterval { get; set; } = TimeSpan.FromSeconds(DefaultFlushSeconds);

    /// <summary>
    /// Upper bound on the sample error messages kept per cluster; 5 by default.
    /// The newest messages win: older ones are dropped on overflow.
    /// </summary>
    public int MaxSampleMessages { get; set; } = DefaultMaxSampleMessages;

    /// <summary>
    /// Upper bound, in characters, on each sample error message; 512 by default.
    /// Longer messages are truncated with an ellipsis.
    /// </summary>
    public int MaxSampleMessageLength { get; set; } = DefaultMaxSampleMessageLength;

    /// <summary>
    /// Upper bound on the distinct session ids kept per cluster; 64 by default.
    /// Past this cap, further sessions still increment
    /// <see cref="FailureCluster.Count"/> but are not added to the set.
    /// </summary>
    public int MaxSessionIdsPerCluster { get; set; } = DefaultMaxSessionIdsPerCluster;

    /// <summary>
    /// Smallest failure count at which a cluster is reported as escalatable.
    /// 3 by default (matches the Phase 5 acceptance criterion).
    /// </summary>
    public int EscalationCountThreshold { get; set; } = DefaultEscalationCountThreshold;

    /// <summary>
    /// Smallest number of distinct sessions at which a cluster is reported as
    /// escalatable. 2 by default.
    /// </summary>
    public int EscalationSessionThreshold { get; set; } = DefaultEscalationSessionThreshold;

    /// <summary>
    /// Largest permitted age of <see cref="FailureCluster.LastSeen"/> for a
    /// cluster to be reported as escalatable. 24 hours by default.
    /// </summary>
    public TimeSpan EscalationWindow { get; set; } = TimeSpan.FromHours(DefaultEscalationWindowHours);
}
46 changes: 46 additions & 0 deletions src/RockBot.Host.Abstractions/IFailureClusterStore.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
namespace RockBot.Host;

/// <summary>
/// Tracks tool failure clusters in-process for hot reads/writes, with PVC-backed
/// persistence for crash recovery. The MCP gateway records every post-recovery
/// failure here; the dream service reads clusters to drive repair tickets.
/// Auto-recovered calls are NOT recorded — they live in the recovery telemetry
/// metrics counter only. See <c>design/self-repair.md</c> Phase 5.
/// </summary>
public interface IFailureClusterStore
{
/// <summary>
/// Increments or creates the cluster identified by <paramref name="key"/>,
/// adding <paramref name="sessionId"/> (when non-null) to the set of sessions
/// that have produced this failure and appending <paramref name="errorMessage"/>
/// to the bounded sample buffer.
/// </summary>
/// <param name="key">Cluster identity.</param>
/// <param name="sessionId">Originating session, or null when the call was outside a session context.</param>
/// <param name="errorMessage">Raw error text (truncated by the store).</param>
/// <param name="at">Timestamp of the failure (typically <see cref="DateTimeOffset.UtcNow"/>).</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>A task that completes once the failure has been recorded.</returns>
Task RecordAsync(
ClusterKey key,
string? sessionId,
string errorMessage,
DateTimeOffset at,
CancellationToken cancellationToken = default);

/// <summary>
/// Returns a snapshot of every cluster currently tracked, ordered by
/// <see cref="FailureCluster.LastSeen"/> descending.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>All tracked clusters, most recently seen first.</returns>
Task<IReadOnlyList<FailureCluster>> GetAllAsync(CancellationToken cancellationToken = default);

/// <summary>
/// Returns the subset of clusters that meet the escalation thresholds
/// configured in <see cref="FailureClusterOptions"/> — by default
/// <c>Count >= 3 &amp;&amp; SessionIds.Count >= 2 &amp;&amp; (now - LastSeen) &lt; 24h</c>.
/// The three limits correspond to
/// <see cref="FailureClusterOptions.EscalationCountThreshold"/>,
/// <see cref="FailureClusterOptions.EscalationSessionThreshold"/> and
/// <see cref="FailureClusterOptions.EscalationWindow"/>.
/// </summary>
/// <param name="now">The reference time used to evaluate the recency window.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns>The clusters that currently satisfy every escalation threshold.</returns>
Task<IReadOnlyList<FailureCluster>> GetEscalatableAsync(
DateTimeOffset now,
CancellationToken cancellationToken = default);
}
22 changes: 22 additions & 0 deletions src/RockBot.Host/AgentMemoryExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,28 @@ public static AgentHostBuilder WithFeedback(
return builder;
}

/// <summary>
/// Adds the in-process, PVC-backed failure cluster store to the host. The store
/// records every post-recovery MCP tool failure so DreamService can spot
/// recurring patterns and open repair tickets. Opt-in — call after
/// <see cref="WithMemory"/>. See <c>design/self-repair.md</c> Phase 5.
/// </summary>
/// <param name="builder">Host builder whose service collection receives the registrations.</param>
/// <param name="configure">Optional callback that adjusts <see cref="FailureClusterOptions"/>; when omitted a no-op configuration is registered.</param>
/// <returns>The same <paramref name="builder"/>, to allow chaining.</returns>
public static AgentHostBuilder WithFailureClusterStore(
    this AgentHostBuilder builder,
    Action<FailureClusterOptions>? configure = null)
{
    // Always register an options configuration; fall back to a no-op when the
    // caller supplies none.
    builder.Services.Configure<FailureClusterOptions>(configure ?? (_ => { }));

    // One singleton instance is exposed under three service types: the concrete
    // store, the store abstraction, and the hosted-service contract.
    builder.Services
        .AddSingleton<FileFailureClusterStore>()
        .AddSingleton<IFailureClusterStore>(provider => provider.GetRequiredService<FileFailureClusterStore>())
        .AddSingleton<IHostedService>(provider => provider.GetRequiredService<FileFailureClusterStore>());

    return builder;
}

/// <summary>
/// Registers the file-backed knowledge graph store for entity-relationship reasoning.
/// </summary>
Expand Down
44 changes: 44 additions & 0 deletions src/RockBot.Host/FailureErrorClassifier.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using System.Text.RegularExpressions;

namespace RockBot.Host;

/// <summary>
/// Deterministically derives the <see cref="ClusterKey.ErrorClass"/> for an MCP
/// error string. Mirrors the patterns Phase 1 uses to extract a missing required
/// field name, and yields <c>"unknown"</c> when none of them matches.
/// See <c>design/self-repair.md</c> Phase 5.
/// </summary>
internal static class FailureErrorClassifier
{
    /// <summary>Error class returned when no field name can be extracted.</summary>
    public const string Unknown = "unknown";

    private const RegexOptions MatchOptions =
        RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

    // Tried in declaration order; each captures the field name in group "f".
    private static readonly Regex[] FieldPatterns =
    [
        new(@"Required\s+parameter\s+['""]?(?<f>[A-Za-z_][A-Za-z0-9_]*)['""]?", MatchOptions),
        new(@"['""]?(?<f>[A-Za-z_][A-Za-z0-9_]*)['""]?\s+is\s+required\b", MatchOptions),
        new(@"missing\s+required\s+argument\s+['""]?(?<f>[A-Za-z_][A-Za-z0-9_]*)['""]?", MatchOptions),
        new(@"expected\s+field\s+['""]?(?<f>[A-Za-z_][A-Za-z0-9_]*)['""]?", MatchOptions),
        new(@"['""]?(?<f>[A-Za-z_][A-Za-z0-9_]*)['""]?\s*:\s*must\s+be\s+provided", MatchOptions),
    ];

    /// <summary>
    /// Classifies <paramref name="errorText"/> by the first pattern that matches it.
    /// </summary>
    /// <param name="errorText">Raw error text; may be null, empty or white space.</param>
    /// <returns>
    /// The captured field name, in the casing it has in the message (matching
    /// itself is case-insensitive), or <see cref="Unknown"/> when the text is
    /// blank or no pattern matches.
    /// </returns>
    public static string Classify(string? errorText)
    {
        if (string.IsNullOrWhiteSpace(errorText))
        {
            return Unknown;
        }

        for (var index = 0; index < FieldPatterns.Length; index++)
        {
            var match = FieldPatterns[index].Match(errorText);
            if (!match.Success)
            {
                continue;
            }

            var fieldName = match.Groups["f"].Value;
            if (string.IsNullOrEmpty(fieldName))
            {
                continue;
            }

            return fieldName;
        }

        return Unknown;
    }
}
Loading
Loading