Skip to content

Commit 465e6ae

Browse files
hkirat and claude committed
fix: detect dead agent processes and reduce stale timeout to 10 minutes
The agent could get stuck forever if the node process died mid-loop (OOM, network error, stuck execSync) without writing a done/error entry. The only recovery was a 1-hour safety valve.

Now syncAgentProgress checks if the agent process is actually alive via pgrep. If the process is dead but no done/error was logged, it cleans up immediately so the user can retry. Also reduced the safety valve from 1 hour to 10 minutes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent c751cb7 commit 465e6ae

1 file changed

Lines changed: 48 additions & 4 deletions

File tree

apps/server/src/services/agent.service.ts

Lines changed: 48 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@ const CONFIG_PATH = `${VENDI_DIR}/agent-config.json`;
1010
const SCRIPT_PATH = `${VENDI_DIR}/agent.mjs`;
1111
const LOG_PATH = `${VENDI_DIR}/agent-log.jsonl`;
1212

13-
const AGENT_STALE_MS = 60 * 60 * 1000; // 1 hour safety valve
13+
const AGENT_STALE_MS = 10 * 60 * 1000; // 10 minute safety valve
1414

1515
interface StartAgentConfig {
1616
sessionId: string;
@@ -168,7 +168,7 @@ export async function syncAgentProgress(sessionId: string): Promise<void> {
168168
data: {
169169
sessionId,
170170
role: "SYSTEM",
171-
content: "Agent timed out after 1 hour. You can send a new message to try again.",
171+
content: "Agent timed out. You can send a new message to try again.",
172172
},
173173
});
174174
await prisma.session.update({
@@ -181,21 +181,65 @@ export async function syncAgentProgress(sessionId: string): Promise<void> {
181181

182182
// Read log file from sandbox
183183
let logContent = "";
184+
let sandbox: Sandbox;
184185
try {
185-
const sandbox = await Sandbox.connect(session.sandboxId);
186+
sandbox = await Sandbox.connect(session.sandboxId);
186187
logContent = String(await sandbox.files.read(LOG_PATH));
187188
} catch {
188189
// Sandbox might not be reachable — skip this cycle
189190
return;
190191
}
191192

192193
const entries = parseLogFile(logContent);
193-
if (entries.length === 0) return;
194194

195195
// Check for done/error entry
196196
const doneEntry = entries.find((e) => e.type === "done");
197197
const errorEntry = entries.find((e) => e.type === "error");
198198

199+
// If no done/error yet, check if the agent process is actually still alive
200+
if (!doneEntry && !errorEntry) {
201+
try {
202+
const psResult = await sandbox.commands.run(
203+
"pgrep -f 'node.*agent\\.mjs' > /dev/null 2>&1 && echo ALIVE || echo DEAD",
204+
{ requestTimeoutMs: 5_000 }
205+
);
206+
if (psResult.stdout.trim() === "DEAD" && entries.length > 0) {
207+
console.log(`[Agent] Agent process died without writing done/error for session ${sessionId}`);
208+
// Treat as a crashed agent — synthesize an error entry
209+
const toolCalls: ToolCallEntry[] = entries
210+
.filter((e) => e.type === "tool_call")
211+
.map((e) => ({
212+
id: e.id || "",
213+
name: e.name || "",
214+
args: e.args || {},
215+
result: e.result || "",
216+
timestamp: e.timestamp || "",
217+
}));
218+
219+
if (session.agentWorkingMsgId) {
220+
await prisma.chatMessage.delete({ where: { id: session.agentWorkingMsgId } }).catch(() => {});
221+
}
222+
await prisma.chatMessage.create({
223+
data: {
224+
sessionId,
225+
role: "SYSTEM",
226+
content: "The agent encountered an unexpected error. You can send a new message to try again.",
227+
metadata: toolCalls.length > 0 ? JSON.parse(JSON.stringify({ toolCalls })) : undefined,
228+
},
229+
});
230+
await prisma.session.update({
231+
where: { id: sessionId },
232+
data: { agentRunId: null, agentWorkingMsgId: null, agentRunStartedAt: null },
233+
});
234+
return;
235+
}
236+
} catch {
237+
// pgrep check failed — ignore, will retry next sweep
238+
}
239+
240+
if (entries.length === 0) return;
241+
}
242+
199243
// Extract tool calls for display
200244
const toolCalls: ToolCallEntry[] = entries
201245
.filter((e) => e.type === "tool_call")

0 commit comments

Comments
 (0)