Skip to content

Commit b644e54

Browse files
authored
Allow cleaning up old replicasets while the new one is not ready (#59)
1 parent dbcdd7a commit b644e54

File tree

5 files changed

+57
-10
lines changed

5 files changed

+57
-10
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "restate-operator"
3-
version = "1.8.1"
3+
version = "1.8.2"
44
authors = ["restate.dev"]
55
edition = "2021"
66
rust-version = "1.86"

charts/restate-operator-helm/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@ apiVersion: v2
22
name: restate-operator-helm
33
description: An operator for Restate clusters
44
type: application
5-
version: "1.8.1"
5+
version: "1.8.2"

src/controllers/restatedeployment/controller.rs

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -330,13 +330,44 @@ impl RestateDeployment {
330330
.cloned()
331331
.unwrap_or_default()
332332
}) {
333-
if let Some(cluster_name) = &self.spec.restate.register.cluster {
334-
// wait for the cluster to be ready before registering to it
335-
validate_cluster_status(rsc_api, cluster_name).await?;
336-
};
333+
let valid = async {
334+
if let Some(cluster_name) = &self.spec.restate.register.cluster {
335+
// wait for the cluster to be ready before registering to it
336+
validate_cluster_status(rsc_api, cluster_name).await?;
337+
}
337338

338-
// wait for the replicaset to be ready before registering it
339-
validate_replica_set_status(replicaset.status.as_ref(), self.spec.replicas)?;
339+
// wait for the replicaset to be ready before registering it
340+
validate_replica_set_status(replicaset.status.as_ref(), self.spec.replicas)?;
341+
342+
Ok(())
343+
}
344+
.await;
345+
346+
match valid {
347+
Ok(()) => {}
348+
// there is a chicken and egg situation if the cluster is out of capacity; the new version can't become ready until
349+
// old versions are removed. so we remove them aggressively here
350+
Err(ready_err @ Error::DeploymentNotReady { .. }) => {
351+
match reconcilers::replicaset::cleanup_old_replicasets(
352+
namespace,
353+
&ctx,
354+
&rs_api,
355+
&my_uid,
356+
self,
357+
&deployments,
358+
Some(&versioned_name), // exclude the replicaset which may not be registered
359+
)
360+
.await
361+
{
362+
Ok((_, _)) => return Err(ready_err),
363+
Err(cleanup_err) => {
364+
error!("Failed to clean up old replicasets while waiting for current replicaset to become ready: {cleanup_err}");
365+
return Err(ready_err);
366+
}
367+
}
368+
}
369+
Err(err) => return Err(err),
370+
}
340371

341372
// Register the latest version with Restate cluster using the service URL
342373
let deployment_id = self
@@ -385,6 +416,7 @@ impl RestateDeployment {
385416
&my_uid,
386417
self,
387418
&deployments,
419+
Some(&versioned_name),
388420
)
389421
.await?;
390422

@@ -667,6 +699,7 @@ impl RestateDeployment {
667699
&my_uid,
668700
self,
669701
&deployments,
702+
None,
670703
)
671704
.await?;
672705

src/controllers/restatedeployment/reconcilers/replicaset.rs

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,15 +162,29 @@ pub async fn cleanup_old_replicasets(
162162
rsd_uid: &str,
163163
rsd: &RestateDeployment,
164164
deployments: &HashMap<String, bool>,
165+
except_rs: Option<&str>,
165166
) -> Result<(i32, Option<chrono::DateTime<chrono::Utc>>)> {
166167
let replicasets_cell = std::cell::Cell::new(Vec::new());
167168

168169
let _ = ctx.replicasets_store.find(|rs| {
170+
let rs_namespace = match &rs.metadata.namespace.as_deref() {
171+
Some("") | None => "default",
172+
Some(ns) => ns,
173+
};
169174
// replicasets in the same ns
170-
if rs.metadata.namespace.as_deref() != Some(namespace) {
175+
if rs_namespace != namespace {
171176
return false;
172177
}
173178

179+
// not the current version if we are actively trying to register it
180+
if let Some(except_rs) = except_rs {
181+
let rs_name = rs.name_any();
182+
183+
if rs_name == except_rs {
184+
return false;
185+
}
186+
}
187+
174188
// replicasets owned by this restatedeployment (we make no attempt to handle orphaned ones if a rsd was deleted with --cascade=orphan and then recreated)
175189
if !rs.owner_references().iter().any(|reference| {
176190
reference.uid == rsd_uid && reference.kind == RestateDeployment::kind(&())

0 commit comments

Comments
 (0)