diff --git a/cli/golem-cli/src/command_handler/interactive.rs b/cli/golem-cli/src/command_handler/interactive.rs index e24761bcea..e97289d85e 100644 --- a/cli/golem-cli/src/command_handler/interactive.rs +++ b/cli/golem-cli/src/command_handler/interactive.rs @@ -228,6 +228,14 @@ impl InteractiveHandler { ) } + pub fn confirm_interrupt_ephemeral_agent(&self) -> anyhow::Result { + self.confirm( + false, + "The target agent is ephemeral. Interrupting it will stop the current invocation and it cannot be resumed. Continue?", + None, + ) + } + pub fn confirm_reset_allow_incompatible_component_update( &self, component_name: &ComponentName, diff --git a/cli/golem-cli/src/command_handler/worker/mod.rs b/cli/golem-cli/src/command_handler/worker/mod.rs index 387a97f0d2..e7ecfe2711 100644 --- a/cli/golem-cli/src/command_handler/worker/mod.rs +++ b/cli/golem-cli/src/command_handler/worker/mod.rs @@ -1123,6 +1123,20 @@ impl WorkerCommandHandler { .component_by_agent_name_match(&agent_name_match) .await?; + if component + .metadata + .agent_types() + .iter() + .find(|agent_type| agent_type.type_name == agent_name_match.agent_type_name) + .is_some_and(|agent_type| agent_type.mode == AgentMode::Ephemeral) + && !self + .ctx + .interactive_handler() + .confirm_interrupt_ephemeral_agent()? + { + bail!(NonSuccessfulExit); + } + log_action( "Interrupting", format!("agent {}", format_agent_name_match(&agent_name_match)), diff --git a/cli/golem-cli/wit/deps/golem-1.x/golem-oplog.wit b/cli/golem-cli/wit/deps/golem-1.x/golem-oplog.wit index d8f57d7ad4..4121788c31 100644 --- a/cli/golem-cli/wit/deps/golem-1.x/golem-oplog.wit +++ b/cli/golem-cli/wit/deps/golem-1.x/golem-oplog.wit @@ -432,6 +432,19 @@ interface oplog { resource-name: string } + record ephemeral-sleep-too-long { + requested-nanos: u64, + max-nanos: u64 + } + + record ephemeral-fuel-exhausted { + overdraft-limit: u64 + } + + record ephemeral-cannot-suspend { + reason: string + } + /// Describes the error that occurred in the agent variant worker-error { unknown(string), @@ -448,7 +461,10 @@ interface oplog { exceeded-rpc-call-limit, node-out-of-filesystem-storage, agent-exceeded-filesystem-storage-limit, - agent-terminated-by-quota(agent-terminated-by-quota-error) + agent-terminated-by-quota(agent-terminated-by-quota-error), + ephemeral-sleep-too-long(ephemeral-sleep-too-long), + ephemeral-fuel-exhausted(ephemeral-fuel-exhausted), + ephemeral-cannot-suspend(ephemeral-cannot-suspend) } record raw-create-parameters { diff --git a/golem-api-grpc/proto/golem/worker/worker_error.proto b/golem-api-grpc/proto/golem/worker/worker_error.proto index 6161822027..78918bf6d9 100644 --- a/golem-api-grpc/proto/golem/worker/worker_error.proto +++ b/golem-api-grpc/proto/golem/worker/worker_error.proto @@ -21,6 +21,9 @@ message AgentError { ExceededHttpCallLimit exceeded_http_call_limit = 13; ExceededRpcCallLimit exceeded_rpc_call_limit = 14; AgentTerminatedByQuota agent_terminated_by_quota = 15; + EphemeralSleepTooLong ephemeral_sleep_too_long = 16; + EphemeralFuelExhausted ephemeral_fuel_exhausted = 17; + EphemeralCannotSuspend ephemeral_cannot_suspend = 18; } } @@ -68,3 +71,16 @@ message AgentTerminatedByQuota { golem.common.EnvironmentId environment_id = 1; string resource_name = 2; } + +message EphemeralSleepTooLong { + uint64 requested_nanos = 1; + uint64 max_nanos = 2; +} + +message EphemeralFuelExhausted { + uint64 overdraft_limit = 1; +} + +message EphemeralCannotSuspend { + string reason = 1; +} diff --git a/golem-common/src/model/oplog/protobuf.rs b/golem-common/src/model/oplog/protobuf.rs index eb2a2ab6d3..eb48c220f2 100644 --- a/golem-common/src/model/oplog/protobuf.rs +++ b/golem-common/src/model/oplog/protobuf.rs @@ -50,7 +50,8 @@ use crate::model::oplog::public_oplog_entry::{ SnapshotParams, StartSpanParams, SuccessfulUpdateParams, SuspendParams, }; use crate::model::oplog::{ - AgentTerminatedByQuotaError, DurableFunctionType, OplogEntry, PersistenceLevel, + AgentTerminatedByQuotaError, DurableFunctionType, EphemeralCannotSuspendError, + EphemeralFuelExhaustedError, EphemeralSleepTooLongError, OplogEntry, PersistenceLevel, }; use crate::model::quota::ResourceName; use crate::model::regions::OplogRegion; @@ -129,6 +130,22 @@ impl TryFrom for AgentError { resource_name: ResourceName(inner.resource_name), })) } + Error::EphemeralSleepTooLong(inner) => { + Ok(Self::EphemeralSleepTooLong(EphemeralSleepTooLongError { + requested_nanos: inner.requested_nanos, + max_nanos: inner.max_nanos, + })) + } + Error::EphemeralFuelExhausted(inner) => { + Ok(Self::EphemeralFuelExhausted(EphemeralFuelExhaustedError { + overdraft_limit: inner.overdraft_limit, + })) + } + Error::EphemeralCannotSuspend(inner) => { + Ok(Self::EphemeralCannotSuspend(EphemeralCannotSuspendError { + reason: inner.reason, + })) + } } } } @@ -184,6 +201,21 @@ impl From for golem_api_grpc::proto::golem::worker::AgentError { resource_name: details.resource_name.0, }) } + AgentError::EphemeralSleepTooLong(EphemeralSleepTooLongError { + requested_nanos, + max_nanos, + }) => Error::EphemeralSleepTooLong(grpc_worker::EphemeralSleepTooLong { + requested_nanos, + max_nanos, + }), + AgentError::EphemeralFuelExhausted(EphemeralFuelExhaustedError { overdraft_limit }) => { + Error::EphemeralFuelExhausted(grpc_worker::EphemeralFuelExhausted { + overdraft_limit, + }) + } + AgentError::EphemeralCannotSuspend(EphemeralCannotSuspendError { reason }) => { + Error::EphemeralCannotSuspend(grpc_worker::EphemeralCannotSuspend { reason }) + } }; Self { error: Some(error) } } diff --git a/golem-common/src/model/oplog/raw_types.rs b/golem-common/src/model/oplog/raw_types.rs index ecd467316e..117e5a9dd7 100644 --- a/golem-common/src/model/oplog/raw_types.rs +++ b/golem-common/src/model/oplog/raw_types.rs @@ -355,6 +355,25 @@ pub struct AgentTerminatedByQuotaError { pub resource_name: ResourceName, } +#[derive(Clone, Debug, PartialEq, Eq, Hash, BinaryCodec, IntoValue, FromValue)] +#[wit(name = "ephemeral-sleep-too-long", owner = "golem:api@1.5.0/oplog")] +pub struct EphemeralSleepTooLongError { + pub requested_nanos: u64, + pub max_nanos: u64, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash, BinaryCodec, IntoValue, FromValue)] +#[wit(name = "ephemeral-fuel-exhausted", owner = "golem:api@1.5.0/oplog")] +pub struct EphemeralFuelExhaustedError { + pub overdraft_limit: u64, +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash, BinaryCodec, IntoValue, FromValue)] +#[wit(name = "ephemeral-cannot-suspend", owner = "golem:api@1.5.0/oplog")] +pub struct EphemeralCannotSuspendError { + pub reason: String, +} + /// Describes the error that occurred in the worker #[derive(Clone, Debug, PartialEq, Eq, Hash, BinaryCodec, IntoValue, FromValue)] #[wit(name = "worker-error", owner = "golem:api@1.5.0/oplog")] @@ -381,6 +400,12 @@ pub enum AgentError { AgentExceededFilesystemStorageLimit, // The agent was terminated by a quota with the terminae enforcement action (permanent) AgentTerminatedByQuota(AgentTerminatedByQuotaError), + // Ephemeral agents cannot suspend and the requested sleep exceeded the configured maximum + EphemeralSleepTooLong(EphemeralSleepTooLongError), + // Ephemeral agent exhausted its per-invocation fuel overdraft allowance + EphemeralFuelExhausted(EphemeralFuelExhaustedError), + // Ephemeral agent reached a condition that would suspend a durable agent + EphemeralCannotSuspend(EphemeralCannotSuspendError), } impl AgentError { @@ -401,6 +426,9 @@ impl AgentError { Self::NodeOutOfFilesystemStorage => "Out of storage space", Self::AgentExceededFilesystemStorageLimit => "Exceeded plan storage limit", Self::AgentTerminatedByQuota(_) => "Terminated by quota", + Self::EphemeralSleepTooLong(_) => "Ephemeral sleep too long", + Self::EphemeralFuelExhausted(_) => "Ephemeral fuel exhausted", + Self::EphemeralCannotSuspend(_) => "Ephemeral agent cannot suspend", } } diff --git a/golem-common/wit/deps/golem-1.x/golem-oplog.wit b/golem-common/wit/deps/golem-1.x/golem-oplog.wit index d8f57d7ad4..4121788c31 100644 --- a/golem-common/wit/deps/golem-1.x/golem-oplog.wit +++ b/golem-common/wit/deps/golem-1.x/golem-oplog.wit @@ -432,6 +432,19 @@ interface oplog { resource-name: string } + record ephemeral-sleep-too-long { + requested-nanos: u64, + max-nanos: u64 + } + + record ephemeral-fuel-exhausted { + overdraft-limit: u64 + } + + record ephemeral-cannot-suspend { + reason: string + } + /// Describes the error that occurred in the agent variant worker-error { unknown(string), @@ -448,7 +461,10 @@ interface oplog { exceeded-rpc-call-limit, node-out-of-filesystem-storage, agent-exceeded-filesystem-storage-limit, - agent-terminated-by-quota(agent-terminated-by-quota-error) + agent-terminated-by-quota(agent-terminated-by-quota-error), + ephemeral-sleep-too-long(ephemeral-sleep-too-long), + ephemeral-fuel-exhausted(ephemeral-fuel-exhausted), + ephemeral-cannot-suspend(ephemeral-cannot-suspend) } record raw-create-parameters { diff --git a/golem-debugging-service/config/debug-worker-executor.sample.env b/golem-debugging-service/config/debug-worker-executor.sample.env index c90557307d..32d43b7364 100644 --- a/golem-debugging-service/config/debug-worker-executor.sample.env +++ b/golem-debugging-service/config/debug-worker-executor.sample.env @@ -38,6 +38,7 @@ GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MAX_DELAY="2s" GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MAX_JITTER_FACTOR=0.15 GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MIN_DELAY="100ms" GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MULTIPLIER=2.0 +GOLEM__LIMITS__EPHEMERAL_FUEL_OVERDRAFT_MULTIPLIER=100 GOLEM__LIMITS__EPOCH_INTERVAL="10ms" GOLEM__LIMITS__EPOCH_TICKS=1 GOLEM__LIMITS__EVENT_BROADCAST_CAPACITY=1024 @@ -112,6 +113,7 @@ GOLEM__RETRY__MAX_JITTER_FACTOR=0.15 GOLEM__RETRY__MIN_DELAY="100ms" GOLEM__RETRY__MULTIPLIER=3.0 GOLEM__SCHEDULER__REFRESH_INTERVAL="2s" +GOLEM__SUSPEND__EPHEMERAL_MAX_SLEEP="1m" GOLEM__SUSPEND__SUSPEND_AFTER="10s" GOLEM__TRACING__CONSOLE=false GOLEM__TRACING__DTOR_FRIENDLY=false @@ -196,6 +198,7 @@ GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MAX_DELAY="2s" GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MAX_JITTER_FACTOR=0.15 GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MIN_DELAY="100ms" GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MULTIPLIER=2.0 +GOLEM__LIMITS__EPHEMERAL_FUEL_OVERDRAFT_MULTIPLIER=100 GOLEM__LIMITS__EPOCH_INTERVAL="10ms" GOLEM__LIMITS__EPOCH_TICKS=1 GOLEM__LIMITS__EVENT_BROADCAST_CAPACITY=1024 @@ -270,6 +273,7 @@ GOLEM__RETRY__MAX_JITTER_FACTOR=0.15 GOLEM__RETRY__MIN_DELAY="100ms" GOLEM__RETRY__MULTIPLIER=3.0 GOLEM__SCHEDULER__REFRESH_INTERVAL="2s" +GOLEM__SUSPEND__EPHEMERAL_MAX_SLEEP="1m" GOLEM__SUSPEND__SUSPEND_AFTER="10s" GOLEM__TRACING__CONSOLE=false GOLEM__TRACING__DTOR_FRIENDLY=false diff --git a/golem-debugging-service/config/debug-worker-executor.toml b/golem-debugging-service/config/debug-worker-executor.toml index fbdeddd7b0..fa42d101d9 100644 --- a/golem-debugging-service/config/debug-worker-executor.toml +++ b/golem-debugging-service/config/debug-worker-executor.toml @@ -71,6 +71,7 @@ min_delay = "100ms" multiplier = 2.0 [limits] +ephemeral_fuel_overdraft_multiplier = 100 epoch_interval = "10ms" epoch_ticks = 1 event_broadcast_capacity = 1024 @@ -187,6 +188,7 @@ multiplier = 3.0 refresh_interval = "2s" [suspend] +ephemeral_max_sleep = "1m" suspend_after = "10s" [tracing] @@ -314,6 +316,7 @@ without_time = false # multiplier = 2.0 # # [limits] +# ephemeral_fuel_overdraft_multiplier = 100 # epoch_interval = "10ms" # epoch_ticks = 1 # event_broadcast_capacity = 1024 @@ -430,6 +433,7 @@ without_time = false # refresh_interval = "2s" # # [suspend] +# ephemeral_max_sleep = "1m" # suspend_after = "10s" # # [tracing] diff --git a/golem-debugging-service/src/debug_context.rs b/golem-debugging-service/src/debug_context.rs index 98f7d98331..9318028ebd 100644 --- a/golem-debugging-service/src/debug_context.rs +++ b/golem-debugging-service/src/debug_context.rs @@ -25,7 +25,7 @@ use golem_common::model::component::ComponentRevision; use golem_common::model::invocation_context::{ self, AttributeValue, InvocationContextStack, SpanId, }; -use golem_common::model::oplog::TimestampedUpdateDescription; +use golem_common::model::oplog::{AgentError, TimestampedUpdateDescription}; use golem_common::model::{ AgentId, AgentInvocation, AgentInvocationOutput, AgentStatusRecord, IdempotencyKey, OwnedAgentId, Timestamp, @@ -98,8 +98,8 @@ impl DurableWorkerCtxView for DebugContext { #[async_trait] impl FuelManagement for DebugContext { - fn ensure_fuel(&mut self, _current_level: u64) -> bool { - true + fn ensure_fuel(&mut self, _current_level: u64) -> Result<(), AgentError> { + Ok(()) } fn return_fuel(&mut self, _current_level: u64) -> u64 { diff --git a/golem-wasm/wit/deps/golem-1.x/golem-oplog.wit b/golem-wasm/wit/deps/golem-1.x/golem-oplog.wit index d8f57d7ad4..4121788c31 100644 --- a/golem-wasm/wit/deps/golem-1.x/golem-oplog.wit +++ b/golem-wasm/wit/deps/golem-1.x/golem-oplog.wit @@ -432,6 +432,19 @@ interface oplog { resource-name: string } + record ephemeral-sleep-too-long { + requested-nanos: u64, + max-nanos: u64 + } + + record ephemeral-fuel-exhausted { + overdraft-limit: u64 + } + + record ephemeral-cannot-suspend { + reason: string + } + /// Describes the error that occurred in the agent variant worker-error { unknown(string), @@ -448,7 +461,10 @@ interface oplog { exceeded-rpc-call-limit, node-out-of-filesystem-storage, agent-exceeded-filesystem-storage-limit, - agent-terminated-by-quota(agent-terminated-by-quota-error) + agent-terminated-by-quota(agent-terminated-by-quota-error), + ephemeral-sleep-too-long(ephemeral-sleep-too-long), + ephemeral-fuel-exhausted(ephemeral-fuel-exhausted), + ephemeral-cannot-suspend(ephemeral-cannot-suspend) } record raw-create-parameters { diff --git a/golem-worker-executor-test-utils/src/lib.rs b/golem-worker-executor-test-utils/src/lib.rs index 72dcd72620..723c868855 100644 --- a/golem-worker-executor-test-utils/src/lib.rs +++ b/golem-worker-executor-test-utils/src/lib.rs @@ -39,8 +39,8 @@ use golem_common::model::invocation_context::{ AttributeValue, InvocationContextSpan, InvocationContextStack, SpanId, }; use golem_common::model::oplog::{ - OplogEntry, PayloadId, PersistenceLevel, RawOplogPayload, TimestampedUpdateDescription, - types::ObjectMetadata, + AgentError, OplogEntry, PayloadId, PersistenceLevel, RawOplogPayload, + TimestampedUpdateDescription, types::ObjectMetadata, }; use golem_common::model::plan::PlanId; use golem_common::model::worker::{AgentConfigEntryDto, AgentMetadataDto}; @@ -733,8 +733,8 @@ impl wasmtime_wasi::p2::bindings::cli::environment::Host for TestWorkerCtx { #[async_trait] impl FuelManagement for TestWorkerCtx { - fn ensure_fuel(&mut self, _current_level: u64) -> bool { - true + fn ensure_fuel(&mut self, _current_level: u64) -> Result<(), AgentError> { + Ok(()) } fn return_fuel(&mut self, _current_level: u64) -> u64 { diff --git a/golem-worker-executor/config/worker-executor.sample.env b/golem-worker-executor/config/worker-executor.sample.env index fc2b9e88e6..8c24b34873 100644 --- a/golem-worker-executor/config/worker-executor.sample.env +++ b/golem-worker-executor/config/worker-executor.sample.env @@ -55,6 +55,7 @@ GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MAX_DELAY="2s" GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MAX_JITTER_FACTOR=0.15 GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MIN_DELAY="100ms" GOLEM__KEY_VALUE_STORAGE__CONFIG__RETRIES__MULTIPLIER=2.0 +GOLEM__LIMITS__EPHEMERAL_FUEL_OVERDRAFT_MULTIPLIER=100 GOLEM__LIMITS__EPOCH_INTERVAL="10ms" GOLEM__LIMITS__EPOCH_TICKS=1 GOLEM__LIMITS__EVENT_BROADCAST_CAPACITY=1024 @@ -148,6 +149,7 @@ GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MAX_JITTER_FACTOR=0.15 GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MIN_DELAY="100ms" GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MULTIPLIER=2.0 GOLEM__SHARD_MANAGER__TLS__TYPE="Disabled" +GOLEM__SUSPEND__EPHEMERAL_MAX_SLEEP="1m" GOLEM__SUSPEND__SUSPEND_AFTER="10s" GOLEM__TRACING__CONSOLE=false GOLEM__TRACING__DTOR_FRIENDLY=false @@ -264,6 +266,7 @@ GOLEM__INDEXED_STORAGE__CONFIG__RETRIES__MAX_JITTER_FACTOR=0.15 GOLEM__INDEXED_STORAGE__CONFIG__RETRIES__MIN_DELAY="100ms" GOLEM__INDEXED_STORAGE__CONFIG__RETRIES__MULTIPLIER=2.0 GOLEM__KEY_VALUE_STORAGE__TYPE="InMemory" +GOLEM__LIMITS__EPHEMERAL_FUEL_OVERDRAFT_MULTIPLIER=100 GOLEM__LIMITS__EPOCH_INTERVAL="10ms" GOLEM__LIMITS__EPOCH_TICKS=1 GOLEM__LIMITS__EVENT_BROADCAST_CAPACITY=1024 @@ -357,6 +360,7 @@ GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MAX_JITTER_FACTOR=0.15 GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MIN_DELAY="100ms" GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MULTIPLIER=2.0 GOLEM__SHARD_MANAGER__TLS__TYPE="Disabled" +GOLEM__SUSPEND__EPHEMERAL_MAX_SLEEP="1m" GOLEM__SUSPEND__SUSPEND_AFTER="10s" GOLEM__TRACING__CONSOLE=false GOLEM__TRACING__DTOR_FRIENDLY=false @@ -443,6 +447,7 @@ GOLEM__HTTP_CLIENT__CONFIG__MAX_IDLE_PER_HOST=8 GOLEM__HTTP_CLIENT__CONFIG__MAX_TOTAL_CONNECTIONS=200 GOLEM__INDEXED_STORAGE__TYPE="InMemory" GOLEM__KEY_VALUE_STORAGE__TYPE="InMemory" +GOLEM__LIMITS__EPHEMERAL_FUEL_OVERDRAFT_MULTIPLIER=100 GOLEM__LIMITS__EPOCH_INTERVAL="10ms" GOLEM__LIMITS__EPOCH_TICKS=1 GOLEM__LIMITS__EVENT_BROADCAST_CAPACITY=1024 @@ -536,6 +541,7 @@ GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MAX_JITTER_FACTOR=0.15 GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MIN_DELAY="100ms" GOLEM__SHARD_MANAGER__RETRIES_ON_UNAVAILABLE__MULTIPLIER=2.0 GOLEM__SHARD_MANAGER__TLS__TYPE="Disabled" +GOLEM__SUSPEND__EPHEMERAL_MAX_SLEEP="1m" GOLEM__SUSPEND__SUSPEND_AFTER="10s" GOLEM__TRACING__CONSOLE=false GOLEM__TRACING__DTOR_FRIENDLY=false diff --git a/golem-worker-executor/config/worker-executor.toml b/golem-worker-executor/config/worker-executor.toml index 3a00c999c4..27558e5b9c 100644 --- a/golem-worker-executor/config/worker-executor.toml +++ b/golem-worker-executor/config/worker-executor.toml @@ -100,6 +100,7 @@ min_delay = "100ms" multiplier = 2.0 [limits] +ephemeral_fuel_overdraft_multiplier = 100 epoch_interval = "10ms" epoch_ticks = 1 event_broadcast_capacity = 1024 @@ -247,6 +248,7 @@ type = "Disabled" [shard_manager.tls.config] [suspend] +ephemeral_max_sleep = "1m" suspend_after = "10s" [tracing] @@ -417,6 +419,7 @@ without_time = false # [key_value_storage.config] # # [limits] +# ephemeral_fuel_overdraft_multiplier = 100 # epoch_interval = "10ms" # epoch_ticks = 1 # event_broadcast_capacity = 1024 @@ -564,6 +567,7 @@ without_time = false # [shard_manager.tls.config] # # [suspend] +# ephemeral_max_sleep = "1m" # suspend_after = "10s" # # [tracing] @@ -704,6 +708,7 @@ without_time = false # [key_value_storage.config] # # [limits] +# ephemeral_fuel_overdraft_multiplier = 100 # epoch_interval = "10ms" # epoch_ticks = 1 # event_broadcast_capacity = 1024 @@ -851,6 +856,7 @@ without_time = false # [shard_manager.tls.config] # # [suspend] +# ephemeral_max_sleep = "1m" # suspend_after = "10s" # # [tracing] diff --git a/golem-worker-executor/src/durable_host/durability.rs b/golem-worker-executor/src/durable_host/durability.rs index 0a177a52e2..d1909bb1c0 100644 --- a/golem-worker-executor/src/durable_host/durability.rs +++ b/golem-worker-executor/src/durable_host/durability.rs @@ -715,7 +715,8 @@ impl DurabilityHost for DurableWorkerCtx { let retry_config = self.state.config.retry.clone(); let in_atomic_region = !self.state.active_atomic_regions.is_empty(); - let trap_type = TrapType::from_error::(&failure, current_retry_point); + let trap_type = + TrapType::from_error::(&failure, current_retry_point, self.agent_mode()); // Try semantic policy resolution first let policies = self.state.named_retry_policies().await; diff --git a/golem-worker-executor/src/durable_host/io/poll.rs b/golem-worker-executor/src/durable_host/io/poll.rs index 49c9dbfc92..635ba57d02 100644 --- a/golem-worker-executor/src/durable_host/io/poll.rs +++ b/golem-worker-executor/src/durable_host/io/poll.rs @@ -14,17 +14,19 @@ use crate::durable_host::durability::InFunctionRetryHost; use crate::durable_host::{Durability, DurabilityHost, DurableWorkerCtx, SuspendForSleep}; +use crate::metrics::ephemeral::{dec_promise_waiting, inc_promise_waiting}; use crate::workerctx::WorkerCtx; use chrono::{Duration, Utc}; -use futures::future::Either; use futures::pin_mut; use golem_common::model::Timestamp; +use golem_common::model::agent::AgentMode; use golem_common::model::oplog::host_functions::{IoPollPoll, IoPollReady}; +use golem_common::model::oplog::{AgentError, EphemeralSleepTooLongError}; use golem_common::model::oplog::{ DurableFunctionType, HostRequestNoInput, HostRequestPollCount, HostResponsePollReady, HostResponsePollResult, }; -use golem_service_base::error::worker_executor::InterruptKind; +use golem_service_base::error::worker_executor::{InterruptKind, WorkerExecutorError}; use tracing::debug; use wasmtime::component::Resource; use wasmtime_wasi::IoView as _; @@ -112,7 +114,7 @@ impl Host for DurableWorkerCtx { // check if all pollables are promise backed. In this case we can suspend immediately // This check only needs to be done in live mode, as we will never even persist the oplog entry for polling // if we suspended in the last pass. Doing it this way also prevents us from initializing the promises until we are actually in live mode. - if self.durable_execution_state().is_live { + if self.durable_execution_state().is_live && self.agent_mode() != AgentMode::Ephemeral { let promise_backed_pollables = self.state.promise_backed_pollables.read().await; let mut all_blocked = true; @@ -148,6 +150,32 @@ impl Host for DurableWorkerCtx { .create_await_interrupt_signal(); let count = in_.len(); + let record_ephemeral_promise_wait = if self.agent_mode() == AgentMode::Ephemeral { + let promise_backed_pollables = self.state.promise_backed_pollables.read().await; + let mut all_blocked = true; + + for res in &in_ { + if let Some(promise_handle) = promise_backed_pollables.get(&res.rep()) { + let ready = promise_handle.get_handle().await.is_ready().await; + if ready { + all_blocked = false; + break; + } + } else { + all_blocked = false; + break; + } + } + + all_blocked && !in_.is_empty() + } else { + false + }; + let ephemeral_poll_timeout = if self.agent_mode() == AgentMode::Ephemeral { + Some(self.state.config.suspend.ephemeral_max_sleep) + } else { + None + }; let result = { let mut view = self.as_wasi_view(); @@ -155,11 +183,32 @@ impl Host for DurableWorkerCtx { let poll = Host::poll(&mut io_data, in_); pin_mut!(poll); - let either_result = futures::future::select(poll, interrupt_signal).await; - match either_result { - Either::Left((result, _)) => result, - Either::Right((interrupt_kind, _)) => { - return Err(wasmtime::Error::from_anyhow(interrupt_kind.into())); + let _promise_waiting = PromiseWaiting::new(record_ephemeral_promise_wait); + + if let Some(timeout_duration) = ephemeral_poll_timeout { + let timeout = tokio::time::sleep(timeout_duration); + pin_mut!(timeout); + + tokio::select! { + result = &mut poll => { + result + } + interrupt_kind = interrupt_signal => { + return Err(wasmtime::Error::from_anyhow(interrupt_kind.into())); + } + _ = &mut timeout => { + let max_nanos = std_duration_to_nanos(timeout_duration); + return Err(ephemeral_sleep_too_long_error(max_nanos, max_nanos)); + } + } + } else { + tokio::select! { + result = &mut poll => { + result + } + interrupt_kind = interrupt_signal => { + return Err(wasmtime::Error::from_anyhow(interrupt_kind.into())); + } } } }; @@ -183,15 +232,63 @@ impl Host for DurableWorkerCtx { match result { Ok(result) => result.result.map_err(wasmtime::Error::msg), Err(duration) => { - self.state.sleep_until(Utc::now() + duration).await?; - Err(wasmtime::Error::from_anyhow( - InterruptKind::Suspend(Timestamp::now_utc()).into(), - )) + if self.agent_mode() == AgentMode::Ephemeral { + let max = self.state.config.suspend.ephemeral_max_sleep; + Err(ephemeral_sleep_too_long_error( + duration_to_nanos(duration), + std_duration_to_nanos(max), + )) + } else { + self.state.sleep_until(Utc::now() + duration).await?; + Err(wasmtime::Error::from_anyhow( + InterruptKind::Suspend(Timestamp::now_utc()).into(), + )) + } } } } } +struct PromiseWaiting(bool); + +impl PromiseWaiting { + fn new(enabled: bool) -> Self { + if enabled { + inc_promise_waiting(); + } + Self(enabled) + } +} + +impl Drop for PromiseWaiting { + fn drop(&mut self) { + if self.0 { + dec_promise_waiting(); + } + } +} + +fn ephemeral_sleep_too_long_error(requested_nanos: u64, max_nanos: u64) -> wasmtime::Error { + wasmtime::Error::from_anyhow(anyhow::anyhow!(WorkerExecutorError::InvocationFailed { + error: AgentError::EphemeralSleepTooLong(EphemeralSleepTooLongError { + requested_nanos, + max_nanos, + }), + stderr: String::new(), + })) +} + +fn duration_to_nanos(duration: Duration) -> u64 { + duration + .to_std() + .map(std_duration_to_nanos) + .unwrap_or(u64::MAX) +} + +fn std_duration_to_nanos(duration: std::time::Duration) -> u64 { + duration.as_nanos().min(u64::MAX as u128) as u64 +} + fn is_suspend_for_sleep(result: &Result) -> Option { if let Err(err) = result { // Walk the error source chain, since wasmtime::Error may wrap the original error diff --git a/golem-worker-executor/src/durable_host/mod.rs b/golem-worker-executor/src/durable_host/mod.rs index 86da9bf391..8ddb669228 100644 --- a/golem-worker-executor/src/durable_host/mod.rs +++ b/golem-worker-executor/src/durable_host/mod.rs @@ -38,6 +38,7 @@ use self::golem::v1x::GetPromiseResultEntry; use crate::durable_host::durability::collect_named_retry_policies; use crate::durable_host::io::{ManagedStdErr, ManagedStdIn, ManagedStdOut}; use crate::durable_host::replay_state::{OplogEntryLookupResult, ReplayState}; +use crate::metrics::ephemeral::record_non_suspending_failure; use crate::metrics::storage::{ STORAGE_TYPE_FILESYSTEM, record_storage_bytes_deleted, record_storage_bytes_written, }; @@ -336,6 +337,10 @@ impl DurableWorkerCtx { let stdin = ManagedStdIn::disabled(); let stdout = ManagedStdOut::from_stdout(tokio::io::stdout()); let stderr = ManagedStdErr::from_stderr(tokio::io::stderr()); + let suspend_threshold = match execution_status.read().unwrap().agent_mode() { + AgentMode::Durable => config.suspend.suspend_after, + AgentMode::Ephemeral => config.suspend.ephemeral_max_sleep, + }; let (wasi, io_ctx, table) = wasi_host::create_context( &[] as &[&str], worker_dir.path().to_path_buf(), @@ -343,7 +348,7 @@ impl DurableWorkerCtx { stdout, stderr, |duration| wasmtime::Error::from(SuspendForSleep(duration)), - config.suspend.suspend_after, + suspend_threshold, ) .map_err(|e| WorkerExecutorError::runtime(format!("Could not create WASI context: {e}")))?; let mut wasi_http = WasiHttpCtx::new(); @@ -738,6 +743,13 @@ impl DurableWorkerCtx { error: AgentError::AgentTerminatedByQuota(_), .. } => RetryDecision::None, + TrapType::Error { + error: + AgentError::EphemeralSleepTooLong(_) + | AgentError::EphemeralFuelExhausted(_) + | AgentError::EphemeralCannotSuspend(_), + .. + } => RetryDecision::None, TrapType::Error { error: AgentError::InternalError(_), .. @@ -820,6 +832,9 @@ impl DurableWorkerCtx { AgentError::Unknown(_) => "unknown", AgentError::TransientError(_) => "transient-error", AgentError::AgentTerminatedByQuota(_) => "agent-terminated-by-quota", + AgentError::EphemeralSleepTooLong(_) => "ephemeral-sleep-too-long", + AgentError::EphemeralFuelExhausted(_) => "ephemeral-fuel-exhausted", + AgentError::EphemeralCannotSuspend(_) => "ephemeral-cannot-suspend", } } @@ -2227,6 +2242,21 @@ impl InvocationHooks for DurableWorkerCtx { ) -> RetryDecision { let current_idempotency_key = self.get_current_idempotency_key().await; + if let TrapType::Error { error, .. } = trap_type { + match error { + AgentError::EphemeralSleepTooLong(_) => { + record_non_suspending_failure("sleep-too-long") + } + AgentError::EphemeralFuelExhausted(_) => { + record_non_suspending_failure("fuel-exhausted") + } + AgentError::EphemeralCannotSuspend(_) => { + record_non_suspending_failure("cannot-suspend") + } + _ => {} + } + } + // Special case: jumping is always immediate and may not have a non-detached status. if matches!(trap_type, TrapType::Interrupt(InterruptKind::Jump)) { return RetryDecision::Immediate; @@ -2832,6 +2862,7 @@ impl ExternalOperations for DurableWorkerCtx { Err(error) => Some(TrapType::from_error::( &anyhow!(error), OplogIndex::INITIAL, + store.as_context().data().agent_mode(), )), }; let decision = match trap_type { diff --git a/golem-worker-executor/src/durable_host/quota/mod.rs b/golem-worker-executor/src/durable_host/quota/mod.rs index b369a5b560..f99843aa2c 100644 --- a/golem-worker-executor/src/durable_host/quota/mod.rs +++ b/golem-worker-executor/src/durable_host/quota/mod.rs @@ -23,6 +23,7 @@ use crate::preview2::golem_quota::types::QuotaTokenRecord; use crate::services::quota::LeaseInterest; use crate::workerctx::WorkerCtx; use chrono::{DateTime, TimeDelta, TimeZone, Utc}; +use golem_common::model::agent::AgentMode; use golem_common::model::environment::EnvironmentId; use golem_common::model::oplog::DurableFunctionType; use golem_common::model::oplog::host_functions; @@ -193,12 +194,15 @@ impl HostQuotaToken for DurableWorkerCtx { let agent_created_by = self.created_by(); let owned_agent_id = self.owned_agent_id().clone(); let scheduler_service = self.scheduler_service(); + let agent_mode = self.agent_mode(); debug!( "Throttling agent due to failed quota reservation ({estimated_wait_nanos:?})" ); let quota_token = self.table().get(&self_)?; - if let Some(estimated_wait_nanos) = estimated_wait_nanos { + if agent_mode == AgentMode::Durable + && let Some(estimated_wait_nanos) = estimated_wait_nanos + { // schedule a continuation for when we expect the quota to be ready to serve us. If it still doesn't have capacity // we will just end up suspending again. let estimated_wait_nanos_i64 = if estimated_wait_nanos > i64::MAX as u64 diff --git a/golem-worker-executor/src/metrics.rs b/golem-worker-executor/src/metrics.rs index 26421bcbe1..591c5b6870 100644 --- a/golem-worker-executor/src/metrics.rs +++ b/golem-worker-executor/src/metrics.rs @@ -347,6 +347,11 @@ pub mod resources { register_counter!("fuel_borrow_total", "Total amount of fuel borrowed").unwrap(); static ref FUEL_RETURN_TOTAL: Counter = register_counter!("fuel_return_total", "Total amount of fuel returned").unwrap(); + static ref EPHEMERAL_OVERDRAFT_FUEL_TOTAL: Counter = register_counter!( + "ephemeral_overdraft_fuel_total", + "Total amount of ephemeral overdraft fuel consumed" + ) + .unwrap(); } pub fn record_fuel_borrow(amount: u64) { @@ -356,6 +361,43 @@ pub mod resources { pub fn record_fuel_return(amount: u64) { FUEL_RETURN_TOTAL.inc_by(amount as f64); } + + pub fn record_ephemeral_overdraft_fuel(amount: u64) { + EPHEMERAL_OVERDRAFT_FUEL_TOTAL.inc_by(amount as f64); + } +} + +pub mod ephemeral { + use lazy_static::lazy_static; + use prometheus::*; + + lazy_static! { + static ref EPHEMERAL_PROMISE_WAITING: Gauge = register_gauge!( + "ephemeral_promise_waiting", + "Number of ephemeral agents currently waiting on promises" + ) + .unwrap(); + static ref EPHEMERAL_NON_SUSPENDING_FAILURE_TOTAL: CounterVec = register_counter_vec!( + "ephemeral_non_suspending_failure_total", + "Number of ephemeral failures that replace suspension", + &["reason"] + ) + .unwrap(); + } + + pub fn inc_promise_waiting() { + EPHEMERAL_PROMISE_WAITING.inc(); + } + + pub fn dec_promise_waiting() { + EPHEMERAL_PROMISE_WAITING.dec(); + } + + pub fn record_non_suspending_failure(reason: &'static str) { + EPHEMERAL_NON_SUSPENDING_FAILURE_TOTAL + .with_label_values(&[reason]) + .inc(); + } } pub mod storage { diff --git a/golem-worker-executor/src/model/mod.rs b/golem-worker-executor/src/model/mod.rs index 787013a216..7cb1de8c6c 100644 --- a/golem-worker-executor/src/model/mod.rs +++ b/golem-worker-executor/src/model/mod.rs @@ -22,7 +22,9 @@ use golem_common::model::component::ComponentRevision; use golem_common::model::invocation_context::{ AttributeValue, InvocationContextSpan, InvocationContextStack, SpanId, TraceId, }; -use golem_common::model::oplog::{AgentError, AgentTerminatedByQuotaError, PersistenceLevel}; +use golem_common::model::oplog::{ + AgentError, AgentTerminatedByQuotaError, EphemeralCannotSuspendError, PersistenceLevel, +}; use golem_common::model::regions::DeletedRegions; use golem_common::model::worker::TypedAgentConfigEntry; use golem_common::model::{ @@ -251,7 +253,11 @@ pub enum TrapType { } impl TrapType { - pub fn from_error(error: &anyhow::Error, retry_from: OplogIndex) -> TrapType { + pub fn from_error( + error: &anyhow::Error, + retry_from: OplogIndex, + agent_mode: AgentMode, + ) -> TrapType { use crate::durable_host::durability::{ClassifiedHostError, HostFailureKind}; match error.root_cause().downcast_ref::() { @@ -316,14 +322,35 @@ impl TrapType { retry_from, } } - // Monthly budget exhausted → suspend and retry when replenished. - // Maps to TryStop which writes a Suspend oplog entry and transitions - // the worker to Suspended status. Some(GolemSpecificWasmTrap::WorkerMonthlyHttpCallBudgetExhausted) => { - TrapType::Interrupt(InterruptKind::Suspend(Timestamp::now_utc())) + match agent_mode { + AgentMode::Durable => TrapType::Interrupt(InterruptKind::Suspend( + Timestamp::now_utc(), + )), + AgentMode::Ephemeral => TrapType::Error { + error: AgentError::EphemeralCannotSuspend( + EphemeralCannotSuspendError { + reason: "monthly HTTP budget exhausted".to_string(), + }, + ), + retry_from, + }, + } } Some(GolemSpecificWasmTrap::WorkerMonthlyRpcCallBudgetExhausted) => { - TrapType::Interrupt(InterruptKind::Suspend(Timestamp::now_utc())) + match agent_mode { + AgentMode::Durable => TrapType::Interrupt(InterruptKind::Suspend( + Timestamp::now_utc(), + )), + AgentMode::Ephemeral => TrapType::Error { + error: AgentError::EphemeralCannotSuspend( + EphemeralCannotSuspendError { + reason: "monthly RPC budget exhausted".to_string(), + }, + ), + retry_from, + }, + } } Some(GolemSpecificWasmTrap::AgentTerminatedByQuota { environment_id, @@ -339,7 +366,19 @@ impl TrapType { }, Some(GolemSpecificWasmTrap::AgentThrottledByQuota { timestamp, .. - }) => TrapType::Interrupt(InterruptKind::Suspend(*timestamp)), + }) => match agent_mode { + AgentMode::Durable => { + TrapType::Interrupt(InterruptKind::Suspend(*timestamp)) + } + AgentMode::Ephemeral => TrapType::Error { + error: AgentError::EphemeralCannotSuspend( + EphemeralCannotSuspendError { + reason: "throttled by quota".to_string(), + }, + ), + retry_from, + }, + }, None => match error.root_cause().downcast_ref::() { Some(WorkerExecutorError::InvalidRequest { details }) => { TrapType::Error { @@ -359,6 +398,12 @@ impl TrapType { retry_from, } } + Some(WorkerExecutorError::InvocationFailed { error, .. }) => { + TrapType::Error { + error: error.clone(), + retry_from, + } + } _ => { // Search the full error chain for ClassifiedHostError if let Some(classified) = error @@ -771,6 +816,41 @@ mod tests { use tracing::info; use uuid::Uuid; + #[test] + fn monthly_http_budget_suspends_durable_agents() { + let trap = TrapType::from_error::( + &anyhow::anyhow!( + golem_service_base::error::worker_executor::GolemSpecificWasmTrap::WorkerMonthlyHttpCallBudgetExhausted + ), + OplogIndex::INITIAL, + AgentMode::Durable, + ); + + assert!(matches!( + trap, + TrapType::Interrupt(InterruptKind::Suspend(_)) + )); + } + + #[test] + fn monthly_http_budget_fails_ephemeral_agents_without_suspend() { + let trap = TrapType::from_error::( + &anyhow::anyhow!( + golem_service_base::error::worker_executor::GolemSpecificWasmTrap::WorkerMonthlyHttpCallBudgetExhausted + ), + OplogIndex::INITIAL, + AgentMode::Ephemeral, + ); + + assert!(matches!( + trap, + TrapType::Error { + error: AgentError::EphemeralCannotSuspend(EphemeralCannotSuspendError { reason }), + .. + } if reason == "monthly HTTP budget exhausted" + )); + } + fn example_trace_id_1() -> TraceId { TraceId::from_string("4bf92f3577b34da6a3ce929d0e0e4736").unwrap() } diff --git a/golem-worker-executor/src/model/public_oplog/wit.rs b/golem-worker-executor/src/model/public_oplog/wit.rs index a73b2ca317..066c16ad0e 100644 --- a/golem-worker-executor/src/model/public_oplog/wit.rs +++ b/golem-worker-executor/src/model/public_oplog/wit.rs @@ -32,7 +32,8 @@ use golem_common::model::oplog::public_oplog_entry::{ WriteRemoteTransactionParameters, }; use golem_common::model::oplog::{ - AgentInvocationOutputParameters, AgentTerminatedByQuotaError, FallibleResultParameters, + AgentInvocationOutputParameters, AgentTerminatedByQuotaError, EphemeralCannotSuspendError, + EphemeralFuelExhaustedError, EphemeralSleepTooLongError, FallibleResultParameters, JsonSnapshotData, MultipartPartData, MultipartSnapshotData, PublicOplogEntry, PublicSnapshotData, PublicUpdateDescription, RawSnapshotData, SaveSnapshotResultParameters, SnapshotBasedUpdateParameters, @@ -709,6 +710,22 @@ impl From for golem_common::model::oplog::AgentError { resource_name: ResourceName(inner.resource_name), }) } + oplog::WorkerError::EphemeralSleepTooLong(inner) => { + Self::EphemeralSleepTooLong(EphemeralSleepTooLongError { + requested_nanos: inner.requested_nanos, + max_nanos: inner.max_nanos, + }) + } + oplog::WorkerError::EphemeralFuelExhausted(inner) => { + Self::EphemeralFuelExhausted(EphemeralFuelExhaustedError { + overdraft_limit: inner.overdraft_limit, + }) + } + oplog::WorkerError::EphemeralCannotSuspend(inner) => { + Self::EphemeralCannotSuspend(EphemeralCannotSuspendError { + reason: inner.reason, + }) + } } } } diff --git a/golem-worker-executor/src/services/golem_config.rs b/golem-worker-executor/src/services/golem_config.rs index f584f2b01f..6cd17bc452 100644 --- a/golem-worker-executor/src/services/golem_config.rs +++ b/golem-worker-executor/src/services/golem_config.rs @@ -288,6 +288,7 @@ pub struct Limits { pub event_broadcast_capacity: usize, pub event_history_size: usize, pub fuel_to_borrow: u64, + pub ephemeral_fuel_overdraft_multiplier: u64, #[serde(with = "humantime_serde")] pub epoch_interval: Duration, pub epoch_ticks: u64, @@ -324,6 +325,11 @@ impl SafeDisplay for Limits { self.event_history_size ); let _ = writeln!(&mut result, "fuel to borrow: {}", self.fuel_to_borrow); + let _ = writeln!( + &mut result, + "ephemeral fuel overdraft multiplier: {}", + self.ephemeral_fuel_overdraft_multiplier + ); let _ = writeln!( &mut result, "epoch interval: {}", @@ -451,11 +457,16 @@ impl GolemConfig { pub struct SuspendConfig { #[serde(with = "humantime_serde")] pub suspend_after: Duration, + #[serde(with = "humantime_serde")] + pub ephemeral_max_sleep: Duration, } impl SafeDisplay for SuspendConfig { fn to_safe_string(&self) -> String { - format!("suspend after: {:?}", self.suspend_after) + format!( + "suspend after: {:?}, ephemeral max sleep: {:?}", + self.suspend_after, self.ephemeral_max_sleep + ) } } @@ -1339,6 +1350,7 @@ impl Default for Limits { event_broadcast_capacity: 1024, event_history_size: 128, fuel_to_borrow: 10000, + ephemeral_fuel_overdraft_multiplier: 100, epoch_interval: Duration::from_millis(10), epoch_ticks: 1, max_oplog_query_pages_size: 100, @@ -1370,6 +1382,7 @@ impl Default for SuspendConfig { fn default() -> Self { Self { suspend_after: Duration::from_secs(10), + ephemeral_max_sleep: Duration::from_secs(60), } } } diff --git a/golem-worker-executor/src/services/resource_limits.rs b/golem-worker-executor/src/services/resource_limits.rs index 809409280e..37ac45e40c 100644 --- a/golem-worker-executor/src/services/resource_limits.rs +++ b/golem-worker-executor/src/services/resource_limits.rs @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::metrics::resources::{record_fuel_borrow, record_fuel_return}; +use crate::metrics::resources::{ + record_ephemeral_overdraft_fuel, record_fuel_borrow, record_fuel_return, +}; use crate::services::golem_config::ResourceLimitsConfig; use async_trait::async_trait; use chrono::Utc; @@ -243,6 +245,10 @@ impl AtomicResourceEntry { } } + pub fn has_effective_fuel(&self) -> bool { + self.effective_fuel() > 0 + } + pub fn return_fuel(&self, amount: u64) { let amt_i64 = amount.min(i64::MAX as u64) as i64; self.delta @@ -253,6 +259,20 @@ impl AtomicResourceEntry { record_fuel_return(amount); } + pub fn record_overdraft_debt(&self, amount: u64) { + if amount == 0 { + return; + } + + let amt_i64 = amount.min(i64::MAX as u64) as i64; + self.delta + .fetch_update(Ordering::AcqRel, Ordering::Acquire, |d| { + Some(d.saturating_add(amt_i64)) + }) + .ok(); + record_ephemeral_overdraft_fuel(amount); + } + pub fn max_memory_limit(&self) -> usize { self.max_memory.load(Ordering::Acquire) } @@ -811,6 +831,15 @@ mod tests { let _ = entry.delta.load(Ordering::Acquire); } + #[test] + fn record_overdraft_debt_increases_delta_by_actual_consumed_amount() { + let entry = AtomicResourceEntry::new(1000, 0, usize::MAX, u64::MAX, u64::MAX); + entry.record_overdraft_debt(2000); + + assert_eq!(entry.delta.load(Ordering::Acquire), 2000); + assert_eq!(entry.effective_fuel(), 3000); + } + #[test] fn max_memory_limit_returns_stored_value() { let entry = AtomicResourceEntry::new(0, 65536, usize::MAX, u64::MAX, u64::MAX); diff --git a/golem-worker-executor/src/worker/invocation.rs b/golem-worker-executor/src/worker/invocation.rs index c69824ee19..b94463f3ba 100644 --- a/golem-worker-executor/src/worker/invocation.rs +++ b/golem-worker-executor/src/worker/invocation.rs @@ -16,8 +16,8 @@ use crate::metrics::wasm::{record_invocation, record_invocation_consumption}; use crate::model::TrapType; use crate::workerctx::{PublicWorkerIo, WorkerCtx}; use golem_common::model::agent::AgentError as AgentInvocationError; -use golem_common::model::agent::ParsedAgentId; use golem_common::model::agent::UntypedDataValue; +use golem_common::model::agent::{AgentMode, ParsedAgentId}; use golem_common::model::agent::{ DataSchema, ElementSchema, NamedElementSchema, UntypedElementValue, }; @@ -347,11 +347,13 @@ async fn invoke( } Err(err) => { let retry_from = store.data().get_current_retry_point().await; + let agent_mode = store.data().agent_mode(); let err: anyhow::Error = err.into(); Ok(InvokeResult::from_error::( consumed_fuel, &err, retry_from, + agent_mode, )) } } @@ -411,11 +413,13 @@ async fn drop_resource( Ok(_) => wrap_output_as_agent_result(kind, None, consumed_fuel), Err(err) => { let retry_from = store.data().get_current_retry_point().await; + let agent_mode = store.data().agent_mode(); let err: anyhow::Error = err.into(); Ok(InvokeResult::from_error::( consumed_fuel, &err, retry_from, + agent_mode, )) } } @@ -503,8 +507,9 @@ impl InvokeResult { consumed_fuel: u64, error: &anyhow::Error, retry_from: OplogIndex, + agent_mode: AgentMode, ) -> Self { - match TrapType::from_error::(error, retry_from) { + match TrapType::from_error::(error, retry_from, agent_mode) { TrapType::Interrupt(kind) => InvokeResult::Interrupted { consumed_fuel, interrupt_kind: kind, diff --git a/golem-worker-executor/src/worker/invocation_loop.rs b/golem-worker-executor/src/worker/invocation_loop.rs index d5739d4e20..0023b56b51 100644 --- a/golem-worker-executor/src/worker/invocation_loop.rs +++ b/golem-worker-executor/src/worker/invocation_loop.rs @@ -906,6 +906,7 @@ impl Invocation<'_, Ctx> { Err(error) => Some(TrapType::from_error::( &anyhow!(error), OplogIndex::INITIAL, + self.parent.agent_mode(), )), }; let decision = match trap_type { diff --git a/golem-worker-executor/src/worker/mod.rs b/golem-worker-executor/src/worker/mod.rs index 1bb7b5aa2a..5fc1525abd 100644 --- a/golem-worker-executor/src/worker/mod.rs +++ b/golem-worker-executor/src/worker/mod.rs @@ -2634,9 +2634,18 @@ impl RunningWorker { store.epoch_deadline_callback(move |mut store| { let current_level = store.get_fuel().unwrap_or(0); let data_mut = store.data_mut(); - if !data_mut.ensure_fuel(current_level) { - warn!("Could not borrow more fuel, suspending"); - return Err(InterruptKind::Suspend(Timestamp::now_utc()).into()); + if let Err(error) = data_mut.ensure_fuel(current_level) { + if data_mut.agent_mode() == AgentMode::Ephemeral { + warn!(error = ?error, "Could not borrow more fuel for ephemeral agent"); + return Err(WorkerExecutorError::InvocationFailed { + error, + stderr: String::new(), + } + .into()); + } else { + warn!("Could not borrow more fuel, suspending"); + return Err(InterruptKind::Suspend(Timestamp::now_utc()).into()); + } } match data_mut.check_interrupt() { diff --git a/golem-worker-executor/src/worker/status.rs b/golem-worker-executor/src/worker/status.rs index 452b58d9d9..72543cfa79 100644 --- a/golem-worker-executor/src/worker/status.rs +++ b/golem-worker-executor/src/worker/status.rs @@ -1048,6 +1048,9 @@ fn is_worker_error_retriable( AgentError::NodeOutOfFilesystemStorage => true, AgentError::AgentExceededFilesystemStorageLimit => false, AgentError::AgentTerminatedByQuota(_) => false, + AgentError::EphemeralSleepTooLong(_) => false, + AgentError::EphemeralFuelExhausted(_) => false, + AgentError::EphemeralCannotSuspend(_) => false, } } diff --git a/golem-worker-executor/src/workerctx/default.rs b/golem-worker-executor/src/workerctx/default.rs index c3dbf3453d..dc5a0da30c 100644 --- a/golem-worker-executor/src/workerctx/default.rs +++ b/golem-worker-executor/src/workerctx/default.rs @@ -60,7 +60,10 @@ use golem_common::model::component::{CanonicalFilePath, ComponentRevision}; use golem_common::model::invocation_context::{ self, AttributeValue, InvocationContextStack, SpanId, }; -use golem_common::model::oplog::TimestampedUpdateDescription; +use golem_common::model::oplog::{ + AgentError, EphemeralCannotSuspendError, EphemeralFuelExhaustedError, + TimestampedUpdateDescription, +}; use golem_common::model::{ AgentId, AgentInvocation, AgentInvocationOutput, AgentStatusRecord, IdempotencyKey, OwnedAgentId, @@ -105,14 +108,23 @@ struct FuelTracker { pub(self) prepaid_gauge_floor: u64, /// Number of fuel units borrowed from the account pool per batch. pub(self) fuel_to_borrow: u64, + /// Maximum fuel that can be locally overdrafted by one ephemeral invocation. + pub(self) ephemeral_overdraft_limit: u64, + /// Locally prepaid overdraft fuel for the current invocation. + pub(self) ephemeral_overdraft_prepaid: u64, + /// Whether the currently outstanding partial batch came from local ephemeral overdraft. + pub(self) last_borrow_was_ephemeral_overdraft: bool, } impl FuelTracker { - pub(self) fn new(fuel_to_borrow: u64) -> Self { + pub(self) fn new(fuel_to_borrow: u64, ephemeral_overdraft_limit: u64) -> Self { Self { gauge_at_last_return: u64::MAX, prepaid_gauge_floor: u64::MAX, fuel_to_borrow, + ephemeral_overdraft_limit, + ephemeral_overdraft_prepaid: 0, + last_borrow_was_ephemeral_overdraft: false, } } @@ -140,6 +152,52 @@ impl FuelTracker { self.prepaid_gauge_floor = current_gauge.saturating_sub(self.fuel_to_borrow); } + pub(self) fn on_account_borrow_success(&mut self, current_gauge: u64) { + self.on_borrow_success(current_gauge); + self.last_borrow_was_ephemeral_overdraft = false; + } + + pub(self) fn try_borrow_ephemeral_overdraft( + &mut self, + current_gauge: u64, + amount: u64, + ) -> Result<(), AgentError> { + let next_prepaid = self.ephemeral_overdraft_prepaid.saturating_add(amount); + if next_prepaid > self.ephemeral_overdraft_limit { + Err(AgentError::EphemeralFuelExhausted( + EphemeralFuelExhaustedError { + overdraft_limit: self.ephemeral_overdraft_limit, + }, + )) + } else { + self.ephemeral_overdraft_prepaid = next_prepaid; + self.on_borrow_success(current_gauge); + self.last_borrow_was_ephemeral_overdraft = true; + Ok(()) + } + } + + pub(self) fn overdraft_limit(&self) -> u64 { + self.ephemeral_overdraft_limit + } + + pub(self) fn last_borrow_was_ephemeral_overdraft(&self) -> bool { + self.last_borrow_was_ephemeral_overdraft + } + + pub(self) fn consumed_ephemeral_overdraft(&mut self, unused: u64) -> u64 { + let consumed_overdraft = if self.last_borrow_was_ephemeral_overdraft { + self.ephemeral_overdraft_prepaid.saturating_sub(unused) + } else { + self.ephemeral_overdraft_prepaid + }; + + self.ephemeral_overdraft_prepaid = 0; + self.last_borrow_was_ephemeral_overdraft = false; + + consumed_overdraft + } + /// How much unused pre-paid fuel to return to the account pool at invocation end. /// /// Because we only borrow when the previous batch is exhausted, at invocation @@ -183,7 +241,13 @@ impl Context { Self { durable_ctx: golem_ctx, resource_limit_entry, - fuel_tracker: FuelTracker::new(config.limits.fuel_to_borrow), + fuel_tracker: FuelTracker::new( + config.limits.fuel_to_borrow, + config + .limits + .fuel_to_borrow + .saturating_mul(config.limits.ephemeral_fuel_overdraft_multiplier), + ), } } @@ -212,27 +276,59 @@ impl DurableWorkerCtxView for Context { #[async_trait] impl FuelManagement for Context { - fn ensure_fuel(&mut self, current_level: u64) -> bool { + fn ensure_fuel(&mut self, current_level: u64) -> Result<(), AgentError> { if !self.fuel_tracker.needs_borrow(current_level) { - return true; + return Ok(()); } let amount_to_borrow = self.fuel_tracker.determine_amount_to_borrow(current_level); let success = self.resource_limit_entry.borrow_fuel(amount_to_borrow); if success { - self.fuel_tracker.on_borrow_success(current_level); - debug!("borrowed {amount_to_borrow} fuel"); + self.fuel_tracker.on_account_borrow_success(current_level); + debug!(amount = amount_to_borrow, "Borrowed fuel"); + Ok(()) + } else if self.agent_mode() == AgentMode::Ephemeral { + if !self.resource_limit_entry.has_effective_fuel() { + return Err(AgentError::EphemeralFuelExhausted( + EphemeralFuelExhaustedError { + overdraft_limit: self.fuel_tracker.overdraft_limit(), + }, + )); + } + + self.fuel_tracker + .try_borrow_ephemeral_overdraft(current_level, amount_to_borrow) + .inspect(|_| { + debug!( + amount = amount_to_borrow, + "Borrowed ephemeral overdraft fuel" + ); + }) + } else { + Err(AgentError::EphemeralCannotSuspend( + EphemeralCannotSuspendError { + reason: "fuel exhausted".to_string(), + }, + )) } - success } fn return_fuel(&mut self, current_level: u64) -> u64 { let unused = self.fuel_tracker.unused_to_return(current_level); - if unused > 0 { + if unused > 0 && !self.fuel_tracker.last_borrow_was_ephemeral_overdraft() { self.resource_limit_entry.return_fuel(unused); - debug!("returned {} fuel", unused); + debug!(amount = unused, "Returned fuel"); + } + let consumed_overdraft = self.fuel_tracker.consumed_ephemeral_overdraft(unused); + if consumed_overdraft > 0 { + self.resource_limit_entry + .record_overdraft_debt(consumed_overdraft); + debug!( + amount = consumed_overdraft, + "Recorded ephemeral overdraft fuel debt" + ); } let consumed = self.fuel_tracker.on_return(current_level); - debug!("reset fuel mark to {}", current_level); + debug!(current_level, "Reset fuel mark"); consumed } } @@ -911,7 +1007,7 @@ mod tests { const INITIAL: u64 = u64::MAX; // wasmtime gauge starting value fn fuel_tracker() -> FuelTracker { - FuelTracker::new(FUEL_TO_BORROW) + FuelTracker::new(FUEL_TO_BORROW, FUEL_TO_BORROW * 100) } #[test] diff --git a/golem-worker-executor/src/workerctx/mod.rs b/golem-worker-executor/src/workerctx/mod.rs index 0fa6ba8a39..213f989272 100644 --- a/golem-worker-executor/src/workerctx/mod.rs +++ b/golem-worker-executor/src/workerctx/mod.rs @@ -49,7 +49,7 @@ use golem_common::model::component::{CanonicalFilePath, ComponentRevision}; use golem_common::model::invocation_context::{ AttributeValue, InvocationContextSpan, InvocationContextStack, SpanId, }; -use golem_common::model::oplog::TimestampedUpdateDescription; +use golem_common::model::oplog::{AgentError, TimestampedUpdateDescription}; use golem_common::model::{ AgentId, AgentInvocation, AgentInvocationOutput, AgentStatusRecord, IdempotencyKey, OplogIndex, OwnedAgentId, @@ -222,8 +222,8 @@ pub trait WorkerCtx: pub trait FuelManagement { /// Ensures fuel is available for continued execution, borrowing a new batch /// from the account pool if the current pre-paid batch is exhausted. - /// Returns false if the account has no remaining fuel. - fn ensure_fuel(&mut self, current_level: u64) -> bool; + /// Returns an error if the account has no remaining fuel. + fn ensure_fuel(&mut self, current_level: u64) -> Result<(), AgentError>; /// Returns the amount of fuel consumed since the last call to return_fuel. fn return_fuel(&mut self, current_level: u64) -> u64; diff --git a/plugins/otlp-exporter.wasm b/plugins/otlp-exporter.wasm index 1767cb43d2..ac16fe667d 100644 Binary files a/plugins/otlp-exporter.wasm and b/plugins/otlp-exporter.wasm differ diff --git a/plugins/otlp-exporter/AGENTS.md b/plugins/otlp-exporter/AGENTS.md index f91830eb43..5c6165baf9 100644 --- a/plugins/otlp-exporter/AGENTS.md +++ b/plugins/otlp-exporter/AGENTS.md @@ -67,6 +67,7 @@ This project includes coding-agent skills in `.agents/skills/`. Load a skill whe | `golem-undo-agent-state` | Reverting agent state by undoing operations | | `golem-interrupt-resume-agent` | Interrupting and resuming a Golem agent | | `golem-test-crash-recovery` | Simulating a crash on an agent for testing crash recovery | +| `golem-integration-test-setup` | Setting up a dedicated Golem environment for integration testing — isolated local server, test environment in golem.yaml, dynamic port discovery, and non-interactive deploys | | `golem-cancel-queued-invocation` | Canceling a pending (queued) invocation on an agent | | `golem-delete-agent` | Deleting an agent instance | | `golem-interactive-repl-rust` | Using the Golem REPL for interactive testing and scripting of agents | @@ -89,6 +90,20 @@ Key concepts: - Invocations are processed **sequentially in a single thread** — no concurrency within a single agent, no need for locks - Agents can **spawn other agents** and communicate with them via **RPC** (see Agent-to-Agent Communication) - An agent is created implicitly on first invocation — no separate creation step needed +- **Futures cannot outlive invocations** — every `Future` spawned during an invocation must complete (be `.await`ed or driven to completion) before the invocation returns; do not store unresolved futures in agent state to poll them from a later invocation + +## Durability & Automatic Retries + +Golem **automatically retries** failed operations using durable execution. **Do not add manual retry loops, `loop { match ... }` retry patterns, or backoff utilities in agent code** — let operations fail and Golem will retry them. A built-in default policy (3 retries, exponential backoff with jitter, clamped to [100ms, 1s]) applies when no user-defined policy matches. + +The following are retried transparently: + +- **HTTP requests** to external services (via `wstd::http`, `golem-wasi-http`, `wasi:http`, etc.) +- **RPC calls** between agents +- **Database / storage calls** — `golem:rdbms/postgres`, `golem:rdbms/mysql`, `golem:rdbms/ignite2`, `wasi:blobstore`, `wasi:keyvalue` +- **Panics** at the top level of an agent method — the worker is restarted and the invocation is replayed from the oplog, with all previously-recorded side effects skipped + +Only customize when the *strategy* needs to change (different backoff, give-up conditions, per-status-code policies). For that, see the `golem-retry-policies-rust` skill. ## Project Structure diff --git a/plugins/otlp-exporter/components-rust/otlp-exporter/src/helpers.rs b/plugins/otlp-exporter/components-rust/otlp-exporter/src/helpers.rs index e740486d7a..7ecd82c2cf 100644 --- a/plugins/otlp-exporter/components-rust/otlp-exporter/src/helpers.rs +++ b/plugins/otlp-exporter/components-rust/otlp-exporter/src/helpers.rs @@ -82,5 +82,8 @@ pub(crate) fn worker_error_to_string(e: &WorkerError) -> String { WorkerError::NodeOutOfFilesystemStorage => "node out of filesystem storage".to_string(), WorkerError::AgentExceededFilesystemStorageLimit => "agent exceeded filesystem storage limit".to_string(), WorkerError::AgentTerminatedByQuota(_) => "agent terminated by quota".to_string(), + WorkerError::EphemeralSleepTooLong(_) => "ephemeral sleep too long".to_string(), + WorkerError::EphemeralFuelExhausted(_) => "ephemeral fuel exhausted".to_string(), + WorkerError::EphemeralCannotSuspend(_) => "ephemeral cannot suspend".to_string(), } } diff --git a/plugins/otlp-exporter/components-rust/otlp-exporter/src/processing.rs b/plugins/otlp-exporter/components-rust/otlp-exporter/src/processing.rs index 174da14dd3..aceffe1bec 100644 --- a/plugins/otlp-exporter/components-rust/otlp-exporter/src/processing.rs +++ b/plugins/otlp-exporter/components-rust/otlp-exporter/src/processing.rs @@ -275,6 +275,15 @@ fn worker_error_variant_name(e: &golem_rust::bindings::golem::api::oplog::Worker golem_rust::bindings::golem::api::oplog::WorkerError::AgentTerminatedByQuota(_) => { "AgentTerminatedByQuota".to_string() } + golem_rust::bindings::golem::api::oplog::WorkerError::EphemeralSleepTooLong(_) => { + "EphemeralSleepTooLong".to_string() + } + golem_rust::bindings::golem::api::oplog::WorkerError::EphemeralFuelExhausted(_) => { + "EphemeralFuelExhausted".to_string() + } + golem_rust::bindings::golem::api::oplog::WorkerError::EphemeralCannotSuspend(_) => { + "EphemeralCannotSuspend".to_string() + } } } diff --git a/sdks/moonbit/golem_sdk/wit/deps/golem-1.x/golem-oplog.wit b/sdks/moonbit/golem_sdk/wit/deps/golem-1.x/golem-oplog.wit index d8f57d7ad4..4121788c31 100644 --- a/sdks/moonbit/golem_sdk/wit/deps/golem-1.x/golem-oplog.wit +++ b/sdks/moonbit/golem_sdk/wit/deps/golem-1.x/golem-oplog.wit @@ -432,6 +432,19 @@ interface oplog { resource-name: string } + record ephemeral-sleep-too-long { + requested-nanos: u64, + max-nanos: u64 + } + + record ephemeral-fuel-exhausted { + overdraft-limit: u64 + } + + record ephemeral-cannot-suspend { + reason: string + } + /// Describes the error that occurred in the agent variant worker-error { unknown(string), @@ -448,7 +461,10 @@ interface oplog { exceeded-rpc-call-limit, node-out-of-filesystem-storage, agent-exceeded-filesystem-storage-limit, - agent-terminated-by-quota(agent-terminated-by-quota-error) + agent-terminated-by-quota(agent-terminated-by-quota-error), + ephemeral-sleep-too-long(ephemeral-sleep-too-long), + ephemeral-fuel-exhausted(ephemeral-fuel-exhausted), + ephemeral-cannot-suspend(ephemeral-cannot-suspend) } record raw-create-parameters { diff --git a/sdks/rust/golem-rust/wit/deps/golem-1.x/golem-oplog.wit b/sdks/rust/golem-rust/wit/deps/golem-1.x/golem-oplog.wit index d8f57d7ad4..4121788c31 100644 --- a/sdks/rust/golem-rust/wit/deps/golem-1.x/golem-oplog.wit +++ b/sdks/rust/golem-rust/wit/deps/golem-1.x/golem-oplog.wit @@ -432,6 +432,19 @@ interface oplog { resource-name: string } + record ephemeral-sleep-too-long { + requested-nanos: u64, + max-nanos: u64 + } + + record ephemeral-fuel-exhausted { + overdraft-limit: u64 + } + + record ephemeral-cannot-suspend { + reason: string + } + /// Describes the error that occurred in the agent variant worker-error { unknown(string), @@ -448,7 +461,10 @@ interface oplog { exceeded-rpc-call-limit, node-out-of-filesystem-storage, agent-exceeded-filesystem-storage-limit, - agent-terminated-by-quota(agent-terminated-by-quota-error) + agent-terminated-by-quota(agent-terminated-by-quota-error), + ephemeral-sleep-too-long(ephemeral-sleep-too-long), + ephemeral-fuel-exhausted(ephemeral-fuel-exhausted), + ephemeral-cannot-suspend(ephemeral-cannot-suspend) } record raw-create-parameters { diff --git a/sdks/scala/wit/deps/golem-1.x/golem-oplog.wit b/sdks/scala/wit/deps/golem-1.x/golem-oplog.wit index d8f57d7ad4..4121788c31 100644 --- a/sdks/scala/wit/deps/golem-1.x/golem-oplog.wit +++ b/sdks/scala/wit/deps/golem-1.x/golem-oplog.wit @@ -432,6 +432,19 @@ interface oplog { resource-name: string } + record ephemeral-sleep-too-long { + requested-nanos: u64, + max-nanos: u64 + } + + record ephemeral-fuel-exhausted { + overdraft-limit: u64 + } + + record ephemeral-cannot-suspend { + reason: string + } + /// Describes the error that occurred in the agent variant worker-error { unknown(string), @@ -448,7 +461,10 @@ interface oplog { exceeded-rpc-call-limit, node-out-of-filesystem-storage, agent-exceeded-filesystem-storage-limit, - agent-terminated-by-quota(agent-terminated-by-quota-error) + agent-terminated-by-quota(agent-terminated-by-quota-error), + ephemeral-sleep-too-long(ephemeral-sleep-too-long), + ephemeral-fuel-exhausted(ephemeral-fuel-exhausted), + ephemeral-cannot-suspend(ephemeral-cannot-suspend) } record raw-create-parameters { diff --git a/sdks/scala/wit/dts/golem_api_1_5_0_oplog.d.ts b/sdks/scala/wit/dts/golem_api_1_5_0_oplog.d.ts index 2b729782f1..b991fcb4bb 100644 --- a/sdks/scala/wit/dts/golem_api_1_5_0_oplog.d.ts +++ b/sdks/scala/wit/dts/golem_api_1_5_0_oplog.d.ts @@ -476,6 +476,16 @@ declare module 'golem:api/oplog@1.5.0' { environmentId: EnvironmentId; resourceName: string; }; + export type EphemeralSleepTooLong = { + requestedNanos: bigint; + maxNanos: bigint; + }; + export type EphemeralFuelExhausted = { + overdraftLimit: bigint; + }; + export type EphemeralCannotSuspend = { + reason: string; + }; /** * Describes the error that occurred in the agent */ @@ -531,6 +541,18 @@ declare module 'golem:api/oplog@1.5.0' { { tag: 'agent-terminated-by-quota' val: AgentTerminatedByQuotaError + } | + { + tag: 'ephemeral-sleep-too-long' + val: EphemeralSleepTooLong + } | + { + tag: 'ephemeral-fuel-exhausted' + val: EphemeralFuelExhausted + } | + { + tag: 'ephemeral-cannot-suspend' + val: EphemeralCannotSuspend }; export type RawCreateParameters = { timestamp: Datetime; diff --git a/sdks/ts/packages/golem-ts-sdk/types/golem_api_1_5_0_oplog.d.ts b/sdks/ts/packages/golem-ts-sdk/types/golem_api_1_5_0_oplog.d.ts index 2b729782f1..b991fcb4bb 100644 --- a/sdks/ts/packages/golem-ts-sdk/types/golem_api_1_5_0_oplog.d.ts +++ b/sdks/ts/packages/golem-ts-sdk/types/golem_api_1_5_0_oplog.d.ts @@ -476,6 +476,16 @@ declare module 'golem:api/oplog@1.5.0' { environmentId: EnvironmentId; resourceName: string; }; + export type EphemeralSleepTooLong = { + requestedNanos: bigint; + maxNanos: bigint; + }; + export type EphemeralFuelExhausted = { + overdraftLimit: bigint; + }; + export type EphemeralCannotSuspend = { + reason: string; + }; /** * Describes the error that occurred in the agent */ @@ -531,6 +541,18 @@ declare module 'golem:api/oplog@1.5.0' { { tag: 'agent-terminated-by-quota' val: AgentTerminatedByQuotaError + } | + { + tag: 'ephemeral-sleep-too-long' + val: EphemeralSleepTooLong + } | + { + tag: 'ephemeral-fuel-exhausted' + val: EphemeralFuelExhausted + } | + { + tag: 'ephemeral-cannot-suspend' + val: EphemeralCannotSuspend }; export type RawCreateParameters = { timestamp: Datetime; diff --git a/sdks/ts/wit/deps/golem-1.x/golem-oplog.wit b/sdks/ts/wit/deps/golem-1.x/golem-oplog.wit index d8f57d7ad4..4121788c31 100644 --- a/sdks/ts/wit/deps/golem-1.x/golem-oplog.wit +++ b/sdks/ts/wit/deps/golem-1.x/golem-oplog.wit @@ -432,6 +432,19 @@ interface oplog { resource-name: string } + record ephemeral-sleep-too-long { + requested-nanos: u64, + max-nanos: u64 + } + + record ephemeral-fuel-exhausted { + overdraft-limit: u64 + } + + record ephemeral-cannot-suspend { + reason: string + } + /// Describes the error that occurred in the agent variant worker-error { unknown(string), @@ -448,7 +461,10 @@ interface oplog { exceeded-rpc-call-limit, node-out-of-filesystem-storage, agent-exceeded-filesystem-storage-limit, - agent-terminated-by-quota(agent-terminated-by-quota-error) + agent-terminated-by-quota(agent-terminated-by-quota-error), + ephemeral-sleep-too-long(ephemeral-sleep-too-long), + ephemeral-fuel-exhausted(ephemeral-fuel-exhausted), + ephemeral-cannot-suspend(ephemeral-cannot-suspend) } record raw-create-parameters { diff --git a/wit/deps/golem-1.x/golem-oplog.wit b/wit/deps/golem-1.x/golem-oplog.wit index d8f57d7ad4..4121788c31 100644 --- a/wit/deps/golem-1.x/golem-oplog.wit +++ b/wit/deps/golem-1.x/golem-oplog.wit @@ -432,6 +432,19 @@ interface oplog { resource-name: string } + record ephemeral-sleep-too-long { + requested-nanos: u64, + max-nanos: u64 + } + + record ephemeral-fuel-exhausted { + overdraft-limit: u64 + } + + record ephemeral-cannot-suspend { + reason: string + } + /// Describes the error that occurred in the agent variant worker-error { unknown(string), @@ -448,7 +461,10 @@ interface oplog { exceeded-rpc-call-limit, node-out-of-filesystem-storage, agent-exceeded-filesystem-storage-limit, - agent-terminated-by-quota(agent-terminated-by-quota-error) + agent-terminated-by-quota(agent-terminated-by-quota-error), + ephemeral-sleep-too-long(ephemeral-sleep-too-long), + ephemeral-fuel-exhausted(ephemeral-fuel-exhausted), + ephemeral-cannot-suspend(ephemeral-cannot-suspend) } record raw-create-parameters {