From 56cce7d9618c96be32b6decf860a2a3477220e32 Mon Sep 17 00:00:00 2001 From: Satbir Chahal Date: Mon, 24 Nov 2025 13:00:38 -0800 Subject: [PATCH 1/6] fix(target-allocator): Extend target allocator CA certificate duration to prevent renewal race --- .chloggen/fix-ca-cert-renewal-race.yaml | 19 +++++++++++++++++++ .../manifests/targetallocator/certificate.go | 5 +++++ .../targetallocator/certificate_test.go | 4 ++++ 3 files changed, 28 insertions(+) create mode 100644 .chloggen/fix-ca-cert-renewal-race.yaml diff --git a/.chloggen/fix-ca-cert-renewal-race.yaml b/.chloggen/fix-ca-cert-renewal-race.yaml new file mode 100644 index 0000000000..59a09d9683 --- /dev/null +++ b/.chloggen/fix-ca-cert-renewal-race.yaml @@ -0,0 +1,19 @@ +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. collector, target allocator, auto-instrumentation, opamp, github action) +component: target allocator + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Fix CA certificate renewal race condition by extending CA certificate duration to 1 year + +# One or more tracking issues related to the change +issues: [4441] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: | + The CA certificate now has a 1-year duration (instead of the default 90 days) to prevent race conditions + where client and server certificates could be signed by different CA versions during simultaneous renewal. + This ensures the CA remains stable while dependent certificates renew regularly. diff --git a/internal/manifests/targetallocator/certificate.go b/internal/manifests/targetallocator/certificate.go index 46d0bd710f..8eba3d92f2 100644 --- a/internal/manifests/targetallocator/certificate.go +++ b/internal/manifests/targetallocator/certificate.go @@ -5,6 +5,7 @@ package targetallocator import ( "fmt" + "time" cmv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" cmmeta "github.com/cert-manager/cert-manager/pkg/apis/meta/v1" @@ -28,6 +29,10 @@ func CACertificate(params Params) *cmv1.Certificate { Spec: cmv1.CertificateSpec{ IsCA: true, CommonName: naming.CACertificate(params.TargetAllocator.Name), + // Set CA certificate to 1 year (much longer than the default 90-day duration of client/server certs) + // to prevent renewal race conditions where client and server certificates might be signed by different + // CA versions during simultaneous renewal. This ensures the CA remains stable while dependent certificates renew. + Duration: &metav1.Duration{Duration: 8760 * time.Hour}, // 1 year Subject: &cmv1.X509Subject{ OrganizationalUnits: []string{"opentelemetry-operator"}, }, diff --git a/internal/manifests/targetallocator/certificate_test.go b/internal/manifests/targetallocator/certificate_test.go index bf6b0e280a..fa1638d5ed 100644 --- a/internal/manifests/targetallocator/certificate_test.go +++ b/internal/manifests/targetallocator/certificate_test.go @@ -5,6 +5,7 @@ package targetallocator import ( "testing" + "time" "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -83,6 +84,9 @@ func TestCACertificate(t *testing.T) { assert.Equal(t, "Issuer", caCert.Spec.IssuerRef.Kind) assert.Equal(t, []string{"opentelemetry-operator"}, caCert.Spec.Subject.OrganizationalUnits) assert.Equal(t, tt.expectedLabels, caCert.Labels) + // Verify CA certificate has 1 year duration to prevent renewal race conditions + assert.NotNil(t, caCert.Spec.Duration) + assert.Equal(t, 8760*time.Hour, caCert.Spec.Duration.Duration) }) } } From cfe2797ca3ef40071b6ac13524df1378eb3cf500 Mon Sep 17 00:00:00 2001 From: Satbir Chahal Date: Mon, 24 Nov 2025 13:25:27 -0800 Subject: [PATCH 2/6] fix TestBuildTargetAllocator/mtls test --- internal/controllers/builder_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/controllers/builder_test.go b/internal/controllers/builder_test.go index 762b406d1b..7d37124a6c 100644 --- a/internal/controllers/builder_test.go +++ b/internal/controllers/builder_test.go @@ -5,6 +5,7 @@ package controllers import ( "testing" + "time" cmv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" cmmetav1 "github.com/cert-manager/cert-manager/pkg/apis/meta/v1" @@ -3261,6 +3262,7 @@ prometheus_cr: OrganizationalUnits: []string{"opentelemetry-operator"}, }, CommonName: "test-ca-cert", + Duration: &metav1.Duration{Duration: 8760 * time.Hour}, IsCA: true, SecretName: "test-ca-cert", IssuerRef: cmmetav1.ObjectReference{ From 1186ee270f27f0c0b7f6c22caa9983b0755e7475 Mon Sep 17 00:00:00 2001 From: Satbir Chahal Date: Wed, 26 Nov 2025 14:26:54 -0800 Subject: [PATCH 3/6] set CA certificate renewBefore to 100 days for safe renewal window vs client/server certs --- internal/controllers/builder_test.go | 9 +++++---- internal/manifests/targetallocator/certificate.go | 5 +++++ internal/manifests/targetallocator/certificate_test.go | 3 +++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/internal/controllers/builder_test.go b/internal/controllers/builder_test.go index 7d37124a6c..707bc45d5a 100644 --- a/internal/controllers/builder_test.go +++ b/internal/controllers/builder_test.go @@ -3261,10 +3261,11 @@ prometheus_cr: Subject: &cmv1.X509Subject{ OrganizationalUnits: []string{"opentelemetry-operator"}, }, - CommonName: "test-ca-cert", - Duration: &metav1.Duration{Duration: 8760 * time.Hour}, - IsCA: true, - SecretName: "test-ca-cert", + CommonName: "test-ca-cert", + Duration: &metav1.Duration{Duration: 8760 * time.Hour}, + RenewBefore: &metav1.Duration{Duration: 2400 * time.Hour}, + IsCA: true, + SecretName: "test-ca-cert", IssuerRef: cmmetav1.ObjectReference{ Name: "test-self-signed-issuer", Kind: "Issuer", diff --git a/internal/manifests/targetallocator/certificate.go b/internal/manifests/targetallocator/certificate.go index 8eba3d92f2..a5b5189b53 100644 --- a/internal/manifests/targetallocator/certificate.go +++ b/internal/manifests/targetallocator/certificate.go @@ -33,6 +33,11 @@ func CACertificate(params Params) *cmv1.Certificate { // to prevent renewal race conditions where client and server certificates might be signed by different // CA versions during simultaneous renewal. This ensures the CA remains stable while dependent certificates renew. Duration: &metav1.Duration{Duration: 8760 * time.Hour}, // 1 year + // Set renewBefore to 100 days (longer than the 90-day client/server cert duration) to ensure: + // 1. CA renewal doesn't coincide with client/server renewal cycles (which occur every ~60 days at the 2/3 point of their 90-day lifetime) + // 2. The CA always has sufficient remaining validity (≥100 days) to safely issue client/server certificates with 90-day lifetimes + // 3. Client/server certificates can never outlive the CA certificate that signed them + RenewBefore: &metav1.Duration{Duration: 2400 * time.Hour}, // 100 days Subject: &cmv1.X509Subject{ OrganizationalUnits: []string{"opentelemetry-operator"}, }, diff --git a/internal/manifests/targetallocator/certificate_test.go b/internal/manifests/targetallocator/certificate_test.go index fa1638d5ed..ed8e99a394 100644 --- a/internal/manifests/targetallocator/certificate_test.go +++ b/internal/manifests/targetallocator/certificate_test.go @@ -87,6 +87,9 @@ func TestCACertificate(t *testing.T) { // Verify CA certificate has 1 year duration to prevent renewal race conditions assert.NotNil(t, caCert.Spec.Duration) assert.Equal(t, 8760*time.Hour, caCert.Spec.Duration.Duration) + // Verify CA certificate renewBefore is set to 100 days (longer than client/server cert duration) + assert.NotNil(t, caCert.Spec.RenewBefore) + assert.Equal(t, 2400*time.Hour, caCert.Spec.RenewBefore.Duration) }) } } From 3e33c644404616d338171e6fa8da83e3eb88207d Mon Sep 17 00:00:00 2001 From: Satbir Chahal Date: Sun, 30 Nov 2025 09:21:44 -0800 Subject: [PATCH 4/6] use constants (explicitly set) and move/tweak comments to those defs --- internal/controllers/builder_test.go | 5 ++- .../manifests/targetallocator/certificate.go | 35 ++++++++++++++----- .../targetallocator/certificate_test.go | 11 ++++-- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/internal/controllers/builder_test.go b/internal/controllers/builder_test.go index 707bc45d5a..677e1c0d63 100644 --- a/internal/controllers/builder_test.go +++ b/internal/controllers/builder_test.go @@ -5,7 +5,6 @@ package controllers import ( "testing" - "time" cmv1 "github.com/cert-manager/cert-manager/pkg/apis/certmanager/v1" cmmetav1 "github.com/cert-manager/cert-manager/pkg/apis/meta/v1" @@ -3262,8 +3261,8 @@ prometheus_cr: OrganizationalUnits: []string{"opentelemetry-operator"}, }, CommonName: "test-ca-cert", - Duration: &metav1.Duration{Duration: 8760 * time.Hour}, - RenewBefore: &metav1.Duration{Duration: 2400 * time.Hour}, + Duration: &metav1.Duration{Duration: targetallocator.CACertDuration}, + RenewBefore: &metav1.Duration{Duration: targetallocator.CACertRenewBefore}, IsCA: true, SecretName: "test-ca-cert", IssuerRef: cmmetav1.ObjectReference{ diff --git a/internal/manifests/targetallocator/certificate.go b/internal/manifests/targetallocator/certificate.go index a5b5189b53..1b16ced73a 100644 --- a/internal/manifests/targetallocator/certificate.go +++ b/internal/manifests/targetallocator/certificate.go @@ -15,6 +15,27 @@ import ( "github.com/open-telemetry/opentelemetry-operator/internal/naming" ) +const ( + // ClientCertDuration is the validity period for client and server certificates (90 days). + // cert-manager defaults to renewing at 2/3 of the duration (~60 days), ensuring certificates + // are refreshed well before expiration (we'll keep it at 90d explicitly). + ClientCertDuration = time.Hour * 24 * 90 + + // CACertRenewBefore defines when the CA certificate should begin renewal (181 days before expiry). + // Set to 2x ClientCertDuration + 1 day to ensure: + // 1. CA renewal doesn't coincide with client/server renewal cycles (which occur every 60 days: day 60, 120, 180, 240, 300, 360, 420, 480, 540...) + // 2. Without the +1 day offset, CA would renew at day 540 (when 180 days remain), colliding with the 9th client cert renewal + // 3. With +1 day, CA renews at day 539 (when 181 days remain), avoiding the race condition + // 4. The CA always has sufficient remaining validity (≥181 days) to safely issue 90-day client/server certificates + CACertRenewBefore = ClientCertDuration*2 + 24*time.Hour + + // CACertDuration is the validity period for the CA certificate (720 days = ~2 years). + // Set to 8x ClientCertDuration to prevent renewal race conditions where client and server + // certificates might be signed by different CA versions during simultaneous renewal. + // This ensures the CA remains stable through multiple client/server certificate renewal cycles. + CACertDuration = ClientCertDuration * 8 +) + // / CACertificate returns a CA Certificate for the given instance. func CACertificate(params Params) *cmv1.Certificate { name := naming.CACertificate(params.TargetAllocator.Name) @@ -29,15 +50,9 @@ func CACertificate(params Params) *cmv1.Certificate { Spec: cmv1.CertificateSpec{ IsCA: true, CommonName: naming.CACertificate(params.TargetAllocator.Name), - // Set CA certificate to 1 year (much longer than the default 90-day duration of client/server certs) - // to prevent renewal race conditions where client and server certificates might be signed by different - // CA versions during simultaneous renewal. This ensures the CA remains stable while dependent certificates renew. - Duration: &metav1.Duration{Duration: 8760 * time.Hour}, // 1 year - // Set renewBefore to 100 days (longer than the 90-day client/server cert duration) to ensure: - // 1. CA renewal doesn't coincide with client/server renewal cycles (which occur every ~60 days at the 2/3 point of their 90-day lifetime) - // 2. The CA always has sufficient remaining validity (≥100 days) to safely issue client/server certificates with 90-day lifetimes - // 3. Client/server certificates can never outlive the CA certificate that signed them - RenewBefore: &metav1.Duration{Duration: 2400 * time.Hour}, // 100 days + // Use longer duration and renewBefore to prevent renewal race conditions with client/server certs + Duration: &metav1.Duration{Duration: CACertDuration}, + RenewBefore: &metav1.Duration{Duration: CACertRenewBefore}, Subject: &cmv1.X509Subject{ OrganizationalUnits: []string{"opentelemetry-operator"}, }, @@ -62,6 +77,7 @@ func ServingCertificate(params Params) *cmv1.Certificate { Labels: labels, }, Spec: cmv1.CertificateSpec{ + Duration: &metav1.Duration{Duration: ClientCertDuration}, DNSNames: []string{ naming.TAService(params.TargetAllocator.Name), fmt.Sprintf("%s.%s.svc", naming.TAService(params.TargetAllocator.Name), params.TargetAllocator.Namespace), @@ -95,6 +111,7 @@ func ClientCertificate(params Params) *cmv1.Certificate { Labels: labels, }, Spec: cmv1.CertificateSpec{ + Duration: &metav1.Duration{Duration: ClientCertDuration}, DNSNames: []string{ naming.TAService(params.TargetAllocator.Name), fmt.Sprintf("%s.%s.svc", naming.TAService(params.TargetAllocator.Name), params.TargetAllocator.Namespace), diff --git a/internal/manifests/targetallocator/certificate_test.go b/internal/manifests/targetallocator/certificate_test.go index ed8e99a394..ea4ab3816b 100644 --- a/internal/manifests/targetallocator/certificate_test.go +++ b/internal/manifests/targetallocator/certificate_test.go @@ -5,7 +5,6 @@ package targetallocator import ( "testing" - "time" "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -86,10 +85,10 @@ func TestCACertificate(t *testing.T) { assert.Equal(t, tt.expectedLabels, caCert.Labels) // Verify CA certificate has 1 year duration to prevent renewal race conditions assert.NotNil(t, caCert.Spec.Duration) - assert.Equal(t, 8760*time.Hour, caCert.Spec.Duration.Duration) + assert.Equal(t, CACertDuration, caCert.Spec.Duration.Duration) // Verify CA certificate renewBefore is set to 100 days (longer than client/server cert duration) assert.NotNil(t, caCert.Spec.RenewBefore) - assert.Equal(t, 2400*time.Hour, caCert.Spec.RenewBefore.Duration) + assert.Equal(t, CACertRenewBefore, caCert.Spec.RenewBefore.Duration) }) } } @@ -151,6 +150,9 @@ func TestServingCertificate(t *testing.T) { assert.ElementsMatch(t, tt.expectedDNSNames, servingCert.Spec.DNSNames) assert.ElementsMatch(t, tt.expectedOrganizationUnit, servingCert.Spec.Subject.OrganizationalUnits) assert.Equal(t, tt.expectedLabels, servingCert.Labels) + // Verify serving certificate duration is set to 90 days + assert.NotNil(t, servingCert.Spec.Duration) + assert.Equal(t, ClientCertDuration, servingCert.Spec.Duration.Duration) }) } } @@ -212,6 +214,9 @@ func TestClientCertificate(t *testing.T) { assert.ElementsMatch(t, tt.expectedDNSNames, clientCert.Spec.DNSNames) assert.ElementsMatch(t, tt.expectedOrganizationUnit, clientCert.Spec.Subject.OrganizationalUnits) assert.Equal(t, tt.expectedLabels, clientCert.Labels) + // Verify client certificate duration is set to 90 days + assert.NotNil(t, clientCert.Spec.Duration) + assert.Equal(t, ClientCertDuration, clientCert.Spec.Duration.Duration) }) } } From 860e3aed48d19ca05a4db31c3ea09b6ed7be9400 Mon Sep 17 00:00:00 2001 From: Satbir Chahal Date: Sun, 30 Nov 2025 09:31:39 -0800 Subject: [PATCH 5/6] add period to pass make lint --- internal/manifests/targetallocator/certificate.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/manifests/targetallocator/certificate.go b/internal/manifests/targetallocator/certificate.go index 1b16ced73a..4677e25ce9 100644 --- a/internal/manifests/targetallocator/certificate.go +++ b/internal/manifests/targetallocator/certificate.go @@ -23,10 +23,10 @@ const ( // CACertRenewBefore defines when the CA certificate should begin renewal (181 days before expiry). // Set to 2x ClientCertDuration + 1 day to ensure: - // 1. CA renewal doesn't coincide with client/server renewal cycles (which occur every 60 days: day 60, 120, 180, 240, 300, 360, 420, 480, 540...) - // 2. Without the +1 day offset, CA would renew at day 540 (when 180 days remain), colliding with the 9th client cert renewal - // 3. With +1 day, CA renews at day 539 (when 181 days remain), avoiding the race condition - // 4. The CA always has sufficient remaining validity (≥181 days) to safely issue 90-day client/server certificates + // 1. CA renewal doesn't coincide with client/server renewal cycles (which occur every 60 days: day 60, 120, 180, 240, 300, 360, 420, 480, 540...). + // 2. Without the +1 day offset, CA would renew at day 540 (when 180 days remain), colliding with the 9th client cert renewal. + // 3. With +1 day, CA renews at day 539 (when 181 days remain), avoiding the race condition. + // 4. The CA always has sufficient remaining validity (≥181 days) to safely issue 90-day client/server certificates. CACertRenewBefore = ClientCertDuration*2 + 24*time.Hour // CACertDuration is the validity period for the CA certificate (720 days = ~2 years). From 27d735225dd27f30b7eb679b7204b4f066d280fc Mon Sep 17 00:00:00 2001 From: Satbir Chahal Date: Mon, 1 Dec 2025 09:11:27 -0800 Subject: [PATCH 6/6] update unit tests since we now explicitly set client/server cert duration --- internal/controllers/builder_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/controllers/builder_test.go b/internal/controllers/builder_test.go index 677e1c0d63..4886235940 100644 --- a/internal/controllers/builder_test.go +++ b/internal/controllers/builder_test.go @@ -3311,6 +3311,7 @@ prometheus_cr: Subject: &cmv1.X509Subject{ OrganizationalUnits: []string{"opentelemetry-operator"}, }, + Duration: &metav1.Duration{Duration: targetallocator.ClientCertDuration}, DNSNames: []string{ "test-targetallocator", "test-targetallocator.test.svc", @@ -3344,6 +3345,7 @@ prometheus_cr: Subject: &cmv1.X509Subject{ OrganizationalUnits: []string{"opentelemetry-operator"}, }, + Duration: &metav1.Duration{Duration: targetallocator.ClientCertDuration}, DNSNames: []string{ "test-targetallocator", "test-targetallocator.test.svc",