From ef9b48f5f5cb4b1bb801ea72fcc6437fb58c120e Mon Sep 17 00:00:00 2001 From: Russell Dempsey <1173416+SgtPooki@users.noreply.github.com> Date: Thu, 14 May 2026 14:44:28 -0400 Subject: [PATCH] fix(retrieval): surface silent IPNI timeouts in logs Two adjacent observability bugs left IPNI failures invisible whenever the outer pg-boss retrieval-job timeout fired: 1. IpniVerificationService.verify checked signal?.aborted before verificationSignal.aborted. On a race where both signals were asserted by the time the catch handler ran, signal.throwIfAborted() re-threw before the ipni_verification_timed_out log could fire. Inner check now runs first so the inner timeout log fires whenever the inner timeout signal aborted, regardless of outer state. 2. RetrievalService.verifyIpniForRetrieval's catch returned a silent failure.timedout when signal?.aborted, with no log. Added retrieval_ipni_verification_timed_out so the retrieval-side caller records the abort. Tracks dealbot#524 option C. Pairs with #538 (parallel IPNI + transport). --- .../ipni/ipni-verification.service.spec.ts | 31 +++++++++++++++++++ .../src/ipni/ipni-verification.service.ts | 11 ++++--- .../src/retrieval/retrieval.service.spec.ts | 19 ++++++++++++ .../src/retrieval/retrieval.service.ts | 10 ++++++ 4 files changed, 67 insertions(+), 4 deletions(-) diff --git a/apps/backend/src/ipni/ipni-verification.service.spec.ts b/apps/backend/src/ipni/ipni-verification.service.spec.ts index e80a29be..c5825f9f 100644 --- a/apps/backend/src/ipni/ipni-verification.service.spec.ts +++ b/apps/backend/src/ipni/ipni-verification.service.spec.ts @@ -117,6 +117,37 @@ describe("IpniVerificationService", () => { expect(result.rootCIDVerified).toBe(false); }); + it("logs ipni_verification_timed_out even when external signal also aborted in the same tick", async () => { + const service = new IpniVerificationService(); + const loggerError = vi.spyOn((service as unknown as { logger: { error: (m: object) => void } }).logger, "error"); + const abortController = new AbortController(); + waitForIpniProviderResultsMock.mockImplementation( + async (_cid: CID, options: { signal?: AbortSignal } | undefined) => + await new Promise((_resolve, reject) => { + options?.signal?.addEventListener( + "abort", + () => { + // Race: outer signal aborts before the catch handler runs. + abortController.abort(new Error("outer aborted in same tick")); + reject(new Error("inner aborted")); + }, + { once: true }, + ); + }), + ); + + const result = await service.verify({ + rootCid, + storageProvider: buildStorageProvider(), + timeoutMs: 20, + pollIntervalMs: 2_000, + signal: abortController.signal, + }); + + expect(result.rootCIDVerified).toBe(false); + expect(loggerError).toHaveBeenCalledWith(expect.objectContaining({ event: "ipni_verification_timed_out" })); + }); + it("is capped by the external deal signal", async () => { const service = new IpniVerificationService(); const abortController = new AbortController(); diff --git a/apps/backend/src/ipni/ipni-verification.service.ts b/apps/backend/src/ipni/ipni-verification.service.ts index 3d7d52f9..10a7e276 100644 --- a/apps/backend/src/ipni/ipni-verification.service.ts +++ b/apps/backend/src/ipni/ipni-verification.service.ts @@ -69,10 +69,10 @@ export class IpniVerificationService { expectedProviders, signal: verificationSignal, }).catch((error) => { - if (signal?.aborted) { - signal.throwIfAborted(); - } - if (verificationSignal.aborted) { + // Inner-timeout check runs first so the `ipni_verification_timed_out` event + // fires even when the outer job signal aborts in the same tick. Otherwise + // the inner-timeout log path is unreachable whenever outer < inner. + if (timeoutSignal.aborted) { failureReason = `IPNI verification timed out after ${timeoutMs}ms`; this.logger.error({ event: "ipni_verification_timed_out", @@ -89,6 +89,9 @@ export class IpniVerificationService { }); return false; } + if (signal?.aborted) { + signal.throwIfAborted(); + } const errorMessage = error instanceof Error ? error.message : String(error); failureReason = errorMessage; this.logger.error({ diff --git a/apps/backend/src/retrieval/retrieval.service.spec.ts b/apps/backend/src/retrieval/retrieval.service.spec.ts index 97a9381a..a467e92c 100644 --- a/apps/backend/src/retrieval/retrieval.service.spec.ts +++ b/apps/backend/src/retrieval/retrieval.service.spec.ts @@ -505,6 +505,25 @@ describe("RetrievalService parallel IPNI + transport", () => { expect(mockDiscoverabilityMetrics.recordStatus).toHaveBeenCalledWith(labels, "failure.timedout"); }); + it("logs retrieval_ipni_verification_timed_out when outer signal aborts and IPNI verify throws", async () => { + service = await createService(); + setupCommonMocks(); + const loggerWarn = vi.spyOn((service as unknown as { logger: { warn: (m: object) => void } }).logger, "warn"); + const abortController = new AbortController(); + mockIpniVerificationService.verify.mockImplementation(async () => { + abortController.abort(new Error("outer timeout")); + throw new Error("ipni aborted"); + }); + mockRetrievalAddonsService.testAllRetrievalMethods.mockResolvedValue(successfulTransport); + + await service.performAllRetrievals(buildDealWithIpni(), abortController.signal); + + expect(loggerWarn).toHaveBeenCalledWith( + expect.objectContaining({ event: "retrieval_ipni_verification_timed_out" }), + ); + expect(mockRetrievalMetrics.recordStatus).toHaveBeenCalledWith(labels, "failure.timedout"); + }); + it("runs IPNI and transport concurrently", async () => { service = await createService(); setupCommonMocks(); diff --git a/apps/backend/src/retrieval/retrieval.service.ts b/apps/backend/src/retrieval/retrieval.service.ts index 3de73eaf..fea7c897 100644 --- a/apps/backend/src/retrieval/retrieval.service.ts +++ b/apps/backend/src/retrieval/retrieval.service.ts @@ -473,6 +473,16 @@ export class RetrievalService { } catch (error) { if (signal?.aborted) { const failureStatus = "failure.timedout"; + this.logger.warn({ + event: "retrieval_ipni_verification_timed_out", + message: "Retrieval IPNI verification aborted by outer job timeout", + dealId, + providerId: provider.providerId, + providerName: provider.name, + providerAddress: provider.address, + ipfsRootCID: ipniContext.rootCid.toString(), + error: toStructuredError(error), + }); this.discoverabilityMetrics.recordStatus(providerLabels, failureStatus); return { ok: false, failureStatus }; }