Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Directory.Build.props
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
<!-- Version can be overridden from the command line: -p:Version=0.3.1
AssemblyVersion and FileVersion are derived automatically by the SDK
(prerelease suffixes like -beta001 are stripped for assembly versions). -->
<Version>0.10.42</Version>
<Version>0.10.44</Version>
</PropertyGroup>

<!-- NuGet package metadata (shared across all packable projects) -->
Expand Down
1 change: 1 addition & 0 deletions src/RockBot.Agent/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,7 @@ async Task<IChatClient> BuildClientForTierAsync(LlmTierConfig config, string tie
agent.WithSkills();
agent.WithKnowledgeGraph();
agent.WithFailureClusterStore();
agent.WithRepairTickets();
agent.WithDreaming();
agent.AddToolHandler();
agent.AddMcpToolProxy();
Expand Down
38 changes: 38 additions & 0 deletions src/RockBot.Host.Abstractions/IRepairTargetApplier.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
using System.Text.Json;

namespace RockBot.Host;

/// <summary>
/// Contract for executing the <c>Change</c> payload of a <see cref="RepairTicket"/>
/// against the ticket's <see cref="RepairTicket.Target"/>. There is one implementation
/// per <see cref="RepairTarget"/> value, and the dream-service apply pass selects the
/// matching applier out of the registered enumerable.
/// See <c>design/self-repair.md</c> Phase 4.
/// </summary>
public interface IRepairTargetApplier
{
    /// <summary>
    /// The target kind serviced by this applier. This is the value the apply pass
    /// dispatches on.
    /// </summary>
    RepairTarget Target { get; }

    /// <summary>
    /// Carries out the change described by <paramref name="ticket"/>.
    /// </summary>
    /// <param name="ticket">The ticket whose <c>Change</c> payload should be applied.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>A <see cref="RepairApplyOutcome"/> describing what was done.</returns>
    /// <remarks>
    /// A malformed payload is reported by throwing. The dream-service apply pass
    /// catches the exception and records the failure as an Uncertain attempt instead
    /// of letting it propagate.
    /// </remarks>
    Task<RepairApplyOutcome> ApplyAsync(RepairTicket ticket, CancellationToken cancellationToken);
}

/// <summary>
/// The value returned by <see cref="IRepairTargetApplier.ApplyAsync"/>.
/// </summary>
/// <param name="AppliedDiff">
/// A structured description of the work the applier performed. It is stored on the
/// <see cref="RepairAttempt"/> so that later cycles can dedup by change-hash.
/// </param>
/// <param name="Revert">
/// An optional callback that rolls the change back, or <c>null</c> when the applier
/// has no reversal support (at present only <c>SkillBody</c> provides one). The
/// dream-service apply pass calls it when the post-apply verify fails, so that a bad
/// change cannot cascade.
/// </param>
public sealed record RepairApplyOutcome(
    JsonElement AppliedDiff,
    Func<CancellationToken, Task>? Revert);
24 changes: 24 additions & 0 deletions src/RockBot.Host.Abstractions/IRepairTicketStore.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
namespace RockBot.Host;

/// <summary>
/// Durable storage for <see cref="RepairTicket"/> artifacts. Each ticket lives in
/// its own JSON file on the PVC, and writes go through a temp file followed by a
/// rename so that concurrent updates are safe.
/// See <c>design/self-repair.md</c> Phase 4.
/// </summary>
public interface IRepairTicketStore
{
    /// <summary>
    /// Lists every ticket currently on disk, newest first by
    /// <see cref="RepairTicket.UpdatedAt"/>.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<IReadOnlyList<RepairTicket>> ListAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Lists only the tickets whose <see cref="RepairTicket.Status"/> is
    /// <see cref="RepairStatus.Open"/> or <see cref="RepairStatus.InProgress"/>.
    /// </summary>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<IReadOnlyList<RepairTicket>> ListOpenAsync(CancellationToken cancellationToken = default);

    /// <summary>
    /// Looks up a single ticket by id.
    /// </summary>
    /// <param name="id">Identifier of the ticket to load.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <returns>The ticket, or <c>null</c> when no file exists for <paramref name="id"/>.</returns>
    Task<RepairTicket?> GetAsync(string id, CancellationToken cancellationToken = default);

    /// <summary>
    /// Writes the ticket to disk, creating it or replacing the existing copy.
    /// The write is atomic (temp file + rename).
    /// </summary>
    /// <param name="ticket">The ticket to persist.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task SaveAsync(RepairTicket ticket, CancellationToken cancellationToken = default);

    /// <summary>
    /// Deletes the ticket. Does nothing when no file exists for <paramref name="id"/>.
    /// </summary>
    /// <param name="id">Identifier of the ticket to remove.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task DeleteAsync(string id, CancellationToken cancellationToken = default);
}
22 changes: 22 additions & 0 deletions src/RockBot.Host.Abstractions/IRepairTicketVerifier.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
namespace RockBot.Host;

/// <summary>
/// Runs a <see cref="VerifyShape"/> against the live system to determine whether the
/// <c>Change</c> applied for a <see cref="RepairTicket"/> resolved the underlying
/// failure cluster. Repair-ticket verification is never cached: each attempt has to
/// observe post-apply state rather than a result left over from an earlier cycle.
/// </summary>
public interface IRepairTicketVerifier
{
    /// <summary>
    /// Evaluates <paramref name="shape"/> using the implementation's default budget.
    /// </summary>
    /// <param name="shape">The verify shape to evaluate.</param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    Task<VerifyResult> VerifyAsync(VerifyShape shape, CancellationToken cancellationToken = default);

    /// <summary>
    /// Evaluates <paramref name="shape"/> under an explicit wallclock budget. The apply
    /// pass calls this overload to back off after repeated timeouts on slow verify
    /// shapes (e.g. tools that fan out across accounts).
    /// </summary>
    /// <param name="shape">The verify shape to evaluate.</param>
    /// <param name="budget">
    /// Wallclock budget for this call; <c>null</c> means the implementation's default
    /// budget is used.
    /// </param>
    /// <param name="cancellationToken">Token used to cancel the operation.</param>
    /// <remarks>
    /// The default implementation does not use <paramref name="budget"/>; it simply
    /// forwards to <see cref="VerifyAsync(VerifyShape, CancellationToken)"/>.
    /// Implementations that honour a custom budget must provide their own version.
    /// </remarks>
    Task<VerifyResult> VerifyAsync(VerifyShape shape, TimeSpan? budget, CancellationToken cancellationToken = default)
    {
        return VerifyAsync(shape, cancellationToken);
    }
}
8 changes: 8 additions & 0 deletions src/RockBot.Host.Abstractions/ISystemPromptBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,12 @@ public interface ISystemPromptBuilder
/// Builds the system prompt string.
/// </summary>
string Build(AgentProfile profile, AgentIdentity identity);

/// <summary>
/// Builds the system prompt string for a prompt category, optionally appending the
/// category-scoped hint from <c>{agent-profile}/prompt-hints/{category}.md</c>.
/// See <c>design/self-repair.md</c> Phase 4 — <see cref="RepairTarget.PromptBuilderHint"/>.
/// </summary>
/// <remarks>
/// The default implementation ignores <paramref name="category"/> and forwards to
/// <see cref="Build(AgentProfile, AgentIdentity)"/>, so implementations without hint
/// support keep producing the same prompt as the category-less overload.
/// </remarks>
string Build(AgentProfile profile, AgentIdentity identity, string? category)
{
    return Build(profile, identity);
}
}
16 changes: 16 additions & 0 deletions src/RockBot.Host.Abstractions/RepairAttempt.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
using System.Text.Json;

namespace RockBot.Host;

/// <summary>
/// A single apply+verify attempt made against a <see cref="RepairTicket"/>. Tickets
/// keep their complete attempt history, which lets escalation summaries and dedup
/// logic inspect earlier tries.
/// </summary>
/// <param name="At">When the attempt happened (UTC).</param>
/// <param name="AppliedDiff">
/// Structured diff of what the applier did; its shape varies with the target.
/// </param>
/// <param name="Result">
/// Result of evaluating the <see cref="VerifyShape"/> after the change was applied.
/// </param>
public sealed record RepairAttempt(
    DateTimeOffset At,
    JsonElement AppliedDiff,
    VerifyResult Result);
19 changes: 19 additions & 0 deletions src/RockBot.Host.Abstractions/RepairStatus.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
namespace RockBot.Host;

/// <summary>
/// Where a <see cref="RepairTicket"/> currently sits in its lifecycle.
/// See <c>design/self-repair.md</c> Phase 4.
/// </summary>
public enum RepairStatus
{
    /// <summary>
    /// The ticket exists but nothing has been applied yet, or it is waiting for
    /// another retry after a failed verify.
    /// </summary>
    Open = 0,

    /// <summary>
    /// An apply is underway for the current cycle. Once the cycle finishes the ticket
    /// moves on to Resolved, Open, or Escalated.
    /// </summary>
    InProgress = 1,

    /// <summary>
    /// Verify passed after the apply, so the change is treated as having fixed the cluster.
    /// </summary>
    Resolved = 2,

    /// <summary>
    /// All apply attempts (default 3) were used up without a successful verify.
    /// Surfaced via <c>repair-escalations-latest</c>.
    /// </summary>
    Escalated = 3,
}
21 changes: 21 additions & 0 deletions src/RockBot.Host.Abstractions/RepairTarget.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
namespace RockBot.Host;

/// <summary>
/// The kind of thing a <see cref="RepairTicket"/> changes when it is applied. Every
/// target is paired with an <see cref="IRepairTargetApplier"/> implementation that
/// knows how to interpret the ticket's <c>Change</c> payload.
/// See <c>design/self-repair.md</c> Phase 4.
/// </summary>
public enum RepairTarget
{
    /// <summary>
    /// Edits the body of a named skill: append, replace a section, or delete a section.
    /// </summary>
    SkillBody = 0,

    /// <summary>
    /// Removes working-memory entries, selected by key or by key-prefix.
    /// </summary>
    WorkingMemoryEvict = 1,

    /// <summary>
    /// Appends a default value to <c>/data/agent/tool-defaults/{server}.json</c>.
    /// </summary>
    ToolDefaultRegister = 2,

    /// <summary>
    /// Appends or replaces a hint section in <c>/data/agent/prompt-hints/{category}.md</c>.
    /// </summary>
    PromptBuilderHint = 3,
}
33 changes: 33 additions & 0 deletions src/RockBot.Host.Abstractions/RepairTicket.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
using System.Text.Json;

namespace RockBot.Host;

/// <summary>
/// One unit of self-repair work. The pipeline applies <see cref="Change"/> to
/// <see cref="Target"/> and then evaluates <see cref="Verify"/>: a successful verify
/// marks the ticket resolved, otherwise the ticket is retried up to
/// <c>RepairTicketOptions.MaxAttempts</c> times before it is escalated.
/// Each ticket is persisted to the PVC as its own JSON file, which keeps updates
/// atomic via temp+rename.
/// See <c>design/self-repair.md</c> Phase 4.
/// </summary>
/// <param name="Id">Stable identifier of the ticket; doubles as the filename and the dedup key.</param>
/// <param name="PatternKey">
/// The originating <see cref="ClusterKey"/> in canonical string form,
/// <c>"server|tool|errorClass"</c>. Ticket creation uses it to avoid opening a
/// second ticket for the same failure cluster.
/// </param>
/// <param name="Target">The apply contract invoked for this ticket.</param>
/// <param name="Change">JSON payload handed to the matching <see cref="IRepairTargetApplier"/> for interpretation.</param>
/// <param name="Verify">The predicate used to decide whether the change resolved the cluster.</param>
/// <param name="Attempts">Attempt history, append-only. A freshly created ticket has none.</param>
/// <param name="Status">Current lifecycle state.</param>
/// <param name="CreatedAt">Timestamp of when the ticket was first opened.</param>
/// <param name="UpdatedAt">Timestamp of the most recent persist (apply, verify, status change).</param>
public sealed record RepairTicket(
    string Id,
    string PatternKey,
    RepairTarget Target,
    JsonElement Change,
    VerifyShape Verify,
    IReadOnlyList<RepairAttempt> Attempts,
    RepairStatus Status,
    DateTimeOffset CreatedAt,
    DateTimeOffset UpdatedAt);
57 changes: 57 additions & 0 deletions src/RockBot.Host.Abstractions/RepairTicketOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
namespace RockBot.Host;

/// <summary>
/// Settings for the closed-loop repair-ticket pipeline. A relative
/// <see cref="BasePath"/> is resolved under <see cref="AgentProfileOptions.BasePath"/>,
/// the same way <see cref="FailureClusterOptions"/> resolves its path.
/// See <c>design/self-repair.md</c> Phase 4.
/// </summary>
public sealed class RepairTicketOptions
{
    /// <summary>
    /// Controls whether the repair-ticket passes (creation + apply) run in each dream
    /// cycle. Defaults to <c>true</c>. Even when <c>false</c>, the store and appliers
    /// stay registered so a later cycle can pick them up without a restart.
    /// </summary>
    public bool Enabled { get; set; } = true;

    /// <summary>
    /// Directory holding the ticket JSON files; defaults to <c>"repair-tickets"</c>.
    /// A relative value is resolved under the agent profile base path
    /// (<c>/data/agent/repair-tickets</c> in K8s).
    /// </summary>
    public string BasePath { get; set; } = "repair-tickets";

    /// <summary>
    /// How many failed verify attempts a ticket may accumulate before it is escalated.
    /// Defaults to 3. Uncertain verifies (gateway error, budget exceeded) are not counted.
    /// </summary>
    public int MaxAttempts { get; set; } = 3;

    /// <summary>
    /// Upper bound on the number of new tickets the LLM-driven creation pass may open
    /// within one dream cycle. Defaults to 5. Limits the blast radius of a bad LLM cycle.
    /// </summary>
    public int MaxTicketsPerCycle { get; set; } = 5;

    /// <summary>
    /// Working-memory key that receives the rolling escalation summary written by the
    /// apply pass. Defaults to <c>repair-escalations-latest</c>. The entry is
    /// overwritten on every cycle that produces an escalated ticket.
    /// </summary>
    public string EscalationWmKey { get; set; } = "repair-escalations-latest";

    /// <summary>
    /// Time-to-live of the escalation working-memory entry. Defaults to 7 days: long
    /// enough for the user to notice the escalation across several sessions, short
    /// enough for stale entries to purge themselves.
    /// </summary>
    public TimeSpan EscalationWmTtl { get; set; } = TimeSpan.FromDays(7);

    /// <summary>
    /// Location of the repair-ticket creation directive file, relative to
    /// <see cref="AgentProfileOptions.BasePath"/>. If the file is missing, a built-in
    /// fallback directive is used instead.
    /// </summary>
    public string CreationDirectivePath { get; set; } = "repair-ticket-creation.md";
}
9 changes: 8 additions & 1 deletion src/RockBot.Host.Abstractions/VerifyResult.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ namespace RockBot.Host;
/// </summary>
/// <param name="Outcome">Categorical outcome — drives whether the underlying claim is evicted, retained, or annotated.</param>
/// <param name="Detail">Optional diagnostic detail (error message, recovery trail) for logging and uncertainty annotations.</param>
/// <param name="TimedOut">
/// Set when <see cref="Outcome"/> is <see cref="VerifyOutcome.Uncertain"/> specifically because the
/// verifier exhausted its per-call wallclock budget. Lets the caller distinguish "tool is too slow"
/// from "executor missing" or "gateway error" so it can apply targeted retries (e.g. budget backoff)
/// rather than retrying every uncertain cause the same way.
/// </param>
public sealed record VerifyResult(
VerifyOutcome Outcome,
string? Detail = null);
string? Detail = null,
bool TimedOut = false);
21 changes: 20 additions & 1 deletion src/RockBot.Host/AgentContextBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ public async Task<List<ChatMessage>> BuildAsync(
string? systemPromptOverride = null)
{
var profile = profileHolder.Profile;
var systemPrompt = systemPromptOverride ?? promptBuilder.Build(profile, agent);
// Derive a category for prompt-hint injection (Phase 4 PromptBuilderHint).
// Categories are the top-level segment of the working-memory namespace —
// "session", "patrol", "subagent" — so a hint file like
// /data/agent/prompt-hints/patrol.md is injected into patrol-task prompts only.
var category = DerivePromptCategory(workingMemoryNamespace);
var systemPrompt = systemPromptOverride ?? promptBuilder.Build(profile, agent, category);
var chatMessages = new List<ChatMessage>
{
new(ChatRole.System, systemPrompt),
Expand Down Expand Up @@ -711,4 +716,18 @@ private static string ClaimAnnotation(IReadOnlyDictionary<string, string> uncert
uncertain.TryGetValue(id, out var detail)
? $" [verifier-uncertain: {detail}]"
: string.Empty;

/// <summary>
/// Maps a working-memory namespace to its prompt category, which is the text in
/// front of the first <c>'/'</c> (or the whole namespace when it has no <c>'/'</c>).
/// </summary>
/// <param name="workingMemoryNamespace">The working-memory namespace; may be null or blank.</param>
/// <returns>
/// <c>"session"</c> when <paramref name="workingMemoryNamespace"/> is null, empty, or
/// whitespace; <c>null</c> when the leading segment itself is empty or whitespace
/// (for example a namespace that starts with <c>'/'</c>); otherwise the leading
/// segment, returned without trimming.
/// </returns>
internal static string? DerivePromptCategory(string? workingMemoryNamespace)
{
    if (string.IsNullOrWhiteSpace(workingMemoryNamespace))
    {
        return "session";
    }

    var firstSeparator = workingMemoryNamespace.IndexOf('/');
    var topSegment = firstSeparator >= 0
        ? workingMemoryNamespace.Substring(0, firstSeparator)
        : workingMemoryNamespace;

    // A blank leading segment (e.g. "/patrol") yields no category at all.
    if (string.IsNullOrWhiteSpace(topSegment))
    {
        return null;
    }

    return topSegment;
}
}
28 changes: 28 additions & 0 deletions src/RockBot.Host/AgentMemoryExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,34 @@ public static AgentHostBuilder WithFailureClusterStore(
return builder;
}

/// <summary>
/// Wires up the closed-loop repair-ticket pipeline (Phase 4 self-repair): the
/// file-backed <see cref="IRepairTicketStore"/>, the four
/// <see cref="IRepairTargetApplier"/> implementations, and the cache-free
/// <see cref="IRepairTicketVerifier"/>. This is opt-in and should be called after
/// <see cref="WithFailureClusterStore"/>, <see cref="WithMemory"/>,
/// <see cref="WithSkills"/>, and <see cref="WithWorkingMemory"/>.
/// See <c>design/self-repair.md</c> Phase 4.
/// </summary>
/// <param name="builder">The host builder whose service collection receives the registrations.</param>
/// <param name="configure">
/// Optional callback that customises <see cref="RepairTicketOptions"/>. When omitted,
/// a no-op configuration is registered for the options type.
/// </param>
/// <returns>The same <paramref name="builder"/>, for chaining.</returns>
public static AgentHostBuilder WithRepairTickets(
    this AgentHostBuilder builder,
    Action<RepairTicketOptions>? configure = null)
{
    var services = builder.Services;

    // Always register an options configuration; fall back to a no-op when the
    // caller did not supply one.
    services.Configure<RepairTicketOptions>(configure ?? (_ => { }));

    services.AddSingleton<IRepairTicketStore, FileRepairTicketStore>();

    // One applier per RepairTarget value, all registered as singletons under
    // the shared IRepairTargetApplier service type.
    services.AddSingleton<IRepairTargetApplier, SkillBodyApplier>();
    services.AddSingleton<IRepairTargetApplier, WorkingMemoryEvictApplier>();
    services.AddSingleton<IRepairTargetApplier, ToolDefaultRegisterApplier>();
    services.AddSingleton<IRepairTargetApplier, PromptBuilderHintApplier>();

    services.AddSingleton<IRepairTicketVerifier, RepairTicketVerifier>();

    return builder;
}

/// <summary>
/// Registers the file-backed knowledge graph store for entity-relationship reasoning.
/// </summary>
Expand Down
6 changes: 4 additions & 2 deletions src/RockBot.Host/CapabilityClaimVerifier.cs
Original file line number Diff line number Diff line change
Expand Up @@ -98,8 +98,10 @@ public async Task<VerifyResult> VerifyAsync(VerifyShape shape, CancellationToken
}
catch (OperationCanceledException) when (!ct.IsCancellationRequested)
{
result = new VerifyResult(VerifyOutcome.Uncertain,
$"verify budget exceeded ({_budget.TotalSeconds:F1}s)");
result = new VerifyResult(
VerifyOutcome.Uncertain,
$"verify budget exceeded ({_budget.TotalSeconds:F1}s)",
TimedOut: true);
}
catch (OperationCanceledException)
{
Expand Down
Loading
Loading