diff --git a/changelog/20251216_fix_tls_monitoring.md b/changelog/20251216_fix_tls_monitoring.md new file mode 100644 index 000000000..ec63eeb2e --- /dev/null +++ b/changelog/20251216_fix_tls_monitoring.md @@ -0,0 +1,6 @@ +--- +kind: fix +date: 2025-12-16 +--- + +* Fixed an issue where monitoring agents would fail after disabling TLS on a MongoDB deployment. diff --git a/controllers/om/deployment.go b/controllers/om/deployment.go index 3b75e19fe..f1aa4ae98 100644 --- a/controllers/om/deployment.go +++ b/controllers/om/deployment.go @@ -6,6 +6,7 @@ import ( "fmt" "math" "regexp" + "slices" "github.com/blang/semver" "github.com/spf13/cast" @@ -251,23 +252,19 @@ func (d Deployment) MergeShardedCluster(opts DeploymentShardedClusterMergeOption return shardsScheduledForRemoval, nil } -// AddMonitoringAndBackup adds monitoring and backup agents to each process -// The automation agent will update the agents versions to the latest version automatically +// ConfigureMonitoringAndBackup configures monitoring and backup agents for each process. +// This is called on every reconcile to ensure the monitoring/backup config matches the desired state. +// The automation agent will update the agents versions to the latest version automatically. // Note, that these two are deliberately combined as all clients (standalone, rs etc.) need both backup and monitoring -// together -func (d Deployment) AddMonitoringAndBackup(log *zap.SugaredLogger, tls bool, caFilepath string) { +// together. +func (d Deployment) ConfigureMonitoringAndBackup(log *zap.SugaredLogger, tls bool, caFilepath string) { if len(d.getProcesses()) == 0 { return } - d.AddMonitoring(log, tls, caFilepath) + d.ConfigureMonitoring(log, tls, caFilepath) d.addBackup(log) } -// DEPRECATED: this shouldn't be used as it may panic because of different underlying type; use GetReplicaSets instead -func (d Deployment) ReplicaSets() []ReplicaSet { - return d["replicaSets"].([]ReplicaSet) -} - func (d Deployment) GetReplicaSetByName(name string) ReplicaSet { for _, rs := range d.GetReplicaSets() { if rs.Name() == name { @@ -277,48 +274,40 @@ func (d Deployment) GetReplicaSetByName(name string) ReplicaSet { return nil } -// AddMonitoring adds monitoring agents for all processes in the deployment -func (d Deployment) AddMonitoring(log *zap.SugaredLogger, tls bool, caFilePath string) { +// ConfigureMonitoring configures monitoring agents for all processes in the deployment. +// This is called on every reconcile to ensure the monitoring config matches the desired state. +func (d Deployment) ConfigureMonitoring(log *zap.SugaredLogger, tls bool, caFilePath string) { if len(d.getProcesses()) == 0 { return } + monitoringVersions := d.getMonitoringVersions() for _, p := range d.getProcesses() { - found := false - var monitoringVersion map[string]interface{} - for _, m := range monitoringVersions { - monitoringVersion = m.(map[string]interface{}) - if monitoringVersion["hostname"] == p.HostName() { - found = true - break - } - } + hostname := p.HostName() + pemKeyFile := p.EnsureTLSConfig()["PEMKeyFile"] - if !found { - monitoringVersion = map[string]interface{}{ - "hostname": p.HostName(), + foundIdx := slices.IndexFunc(monitoringVersions, func(m interface{}) bool { + return m.(map[string]interface{})["hostname"] == hostname + }) + + if foundIdx == -1 { + mv := map[string]interface{}{ + "hostname": hostname, "name": MonitoringAgentDefaultVersion, } - log.Debugw("Added monitoring agent configuration", "host", p.HostName(), "tls", tls) - monitoringVersions = append(monitoringVersions, monitoringVersion) - } - - monitoringVersion["hostname"] = p.HostName() - - if tls { - additionalParams := map[string]string{ - "useSslForAllConnections": "true", - "sslTrustedServerCertificates": caFilePath, + if tls { + mv["additionalParams"] = NewTLSParams(caFilePath, pemKeyFile) } - - pemKeyFile := p.EnsureTLSConfig()["PEMKeyFile"] - if pemKeyFile != nil { - additionalParams["sslClientCertificate"] = pemKeyFile.(string) + log.Debugw("Added monitoring agent configuration", "host", hostname, "tls", tls) + monitoringVersions = append(monitoringVersions, mv) + } else { + mv := monitoringVersions[foundIdx].(map[string]interface{}) + if tls { + mv["additionalParams"] = NewTLSParams(caFilePath, pemKeyFile) + } else { + ClearTLSParamsFromMonitoringVersion(mv) } - - monitoringVersion["additionalParams"] = additionalParams } - } d.setMonitoringVersions(monitoringVersions) } diff --git a/controllers/om/deployment/testing_utils.go b/controllers/om/deployment/testing_utils.go index 79f0ebf82..0a5333ccc 100644 --- a/controllers/om/deployment/testing_utils.go +++ b/controllers/om/deployment/testing_utils.go @@ -37,7 +37,7 @@ func CreateFromReplicaSet(mongoDBImage string, forceEnterprise bool, rs *mdb.Mon lastConfig.ToMap(), zap.S(), ) - d.AddMonitoringAndBackup(zap.S(), rs.Spec.GetSecurity().IsTLSEnabled(), util.CAFilePathInContainer) + d.ConfigureMonitoringAndBackup(zap.S(), rs.Spec.GetSecurity().IsTLSEnabled(), util.CAFilePathInContainer) d.ConfigureTLS(rs.Spec.GetSecurity(), util.CAFilePathInContainer) return d } diff --git a/controllers/om/deployment_test.go b/controllers/om/deployment_test.go index 281731187..2dad50bdb 100644 --- a/controllers/om/deployment_test.go +++ b/controllers/om/deployment_test.go @@ -489,12 +489,12 @@ func TestConfiguringTlsProcessFromOpsManager(t *testing.T) { } } -func TestAddMonitoring(t *testing.T) { +func TestConfigureMonitoring(t *testing.T) { d := NewDeployment() rs0 := buildRsByProcesses("my-rs", createReplicaSetProcessesCount(3, "my-rs")) d.MergeReplicaSet(rs0, nil, nil, zap.S()) - d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer) + d.ConfigureMonitoring(zap.S(), false, util.CAFilePathInContainer) expectedMonitoringVersions := []interface{}{ map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion}, @@ -504,16 +504,16 @@ func TestAddMonitoring(t *testing.T) { assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions()) // adding again - nothing changes - d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer) + d.ConfigureMonitoring(zap.S(), false, util.CAFilePathInContainer) assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions()) } -func TestAddMonitoringTls(t *testing.T) { +func TestConfigureMonitoringTls(t *testing.T) { d := NewDeployment() rs0 := buildRsByProcesses("my-rs", createReplicaSetProcessesCount(3, "my-rs")) d.MergeReplicaSet(rs0, nil, nil, zap.S()) - d.AddMonitoring(zap.S(), true, util.CAFilePathInContainer) + d.ConfigureMonitoring(zap.S(), true, util.CAFilePathInContainer) expectedAdditionalParams := map[string]string{ "useSslForAllConnections": "true", @@ -528,10 +528,39 @@ func TestAddMonitoringTls(t *testing.T) { assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions()) // adding again - nothing changes - d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer) + d.ConfigureMonitoring(zap.S(), true, util.CAFilePathInContainer) assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions()) } +func TestConfigureMonitoringTLSDisable(t *testing.T) { + d := NewDeployment() + + rs0 := buildRsByProcesses("my-rs", createReplicaSetProcessesCount(3, "my-rs")) + d.MergeReplicaSet(rs0, nil, nil, zap.S()) + d.ConfigureMonitoring(zap.S(), true, util.CAFilePathInContainer) + + // verify TLS is present in additionalParams + expectedAdditionalParams := map[string]string{ + "useSslForAllConnections": "true", + "sslTrustedServerCertificates": util.CAFilePathInContainer, + } + expectedMonitoringVersionsWithTls := []interface{}{ + map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams}, + map[string]interface{}{"hostname": "my-rs-1.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams}, + map[string]interface{}{"hostname": "my-rs-2.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams}, + } + assert.Equal(t, expectedMonitoringVersionsWithTls, d.getMonitoringVersions()) + + // disabling TLS should clear additionalParams (CLOUDP-351614) + d.ConfigureMonitoring(zap.S(), false, util.CAFilePathInContainer) + expectedMonitoringVersionsWithoutTls := []interface{}{ + map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion}, + map[string]interface{}{"hostname": "my-rs-1.some.host", "name": MonitoringAgentDefaultVersion}, + map[string]interface{}{"hostname": "my-rs-2.some.host", "name": MonitoringAgentDefaultVersion}, + } + assert.Equal(t, expectedMonitoringVersionsWithoutTls, d.getMonitoringVersions()) +} + func TestAddBackup(t *testing.T) { d := NewDeployment() diff --git a/controllers/om/monitoring_tls.go b/controllers/om/monitoring_tls.go new file mode 100644 index 000000000..6ac18c915 --- /dev/null +++ b/controllers/om/monitoring_tls.go @@ -0,0 +1,41 @@ +package om + +// TLS param keys for monitoring additionalParams. +const ( + TLSParamUseSsl = "useSslForAllConnections" + TLSParamTrustedCert = "sslTrustedServerCertificates" + TLSParamClientCert = "sslClientCertificate" +) + +// NewTLSParams creates and returns a new map with TLS parameters. +func NewTLSParams(caFilePath string, pemKeyFile interface{}) map[string]string { + params := map[string]string{ + TLSParamUseSsl: "true", + TLSParamTrustedCert: caFilePath, + } + if pemKeyFile != nil && pemKeyFile.(string) != "" { + params[TLSParamClientCert] = pemKeyFile.(string) + } + return params +} + +func clearTLSParamsFromMap[V any](params map[string]V) { + delete(params, TLSParamUseSsl) + delete(params, TLSParamTrustedCert) + delete(params, TLSParamClientCert) +} + +// ClearTLSParams removes TLS-specific parameters from the given params map. +func ClearTLSParams(params map[string]string) { + clearTLSParamsFromMap(params) +} + +// ClearTLSParamsFromMonitoringVersion removes TLS-specific fields from the monitoring +// version's additionalParams. +func ClearTLSParamsFromMonitoringVersion(monitoringVersion map[string]interface{}) { + params, ok := monitoringVersion["additionalParams"].(map[string]interface{}) + if !ok { + return + } + clearTLSParamsFromMap(params) +} diff --git a/controllers/operator/appdbreplicaset_controller.go b/controllers/operator/appdbreplicaset_controller.go index 86f61b52b..1aedefb0d 100644 --- a/controllers/operator/appdbreplicaset_controller.go +++ b/controllers/operator/appdbreplicaset_controller.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "path" + "slices" "sort" "strconv" "strings" @@ -87,8 +88,8 @@ const ( // Used to convey to the operator to force reconfigure agent. At the moment // it is used for DR in case of Multi-Cluster AppDB when after a cluster outage // there is no primary in the AppDB deployment. - ForceReconfigureAnnotation = "mongodb.com/v1.forceReconfigure" - + ForceReconfigureAnnotation = "mongodb.com/v1.forceReconfigure" + trueString = "true" ForcedReconfigureAlreadyPerformedAnnotation = "mongodb.com/v1.forceReconfigurePerformed" ) @@ -717,7 +718,7 @@ func (r *ReconcileAppDbReplicaSet) ReconcileAppDB(ctx context.Context, opsManage opsManager.Annotations = map[string]string{} } - if val, ok := opsManager.Annotations[ForceReconfigureAnnotation]; ok && val == "true" { + if val, ok := opsManager.Annotations[ForceReconfigureAnnotation]; ok && val == trueString { annotationsToAdd := map[string]string{ForcedReconfigureAlreadyPerformedAnnotation: timeutil.Now()} err := annotations.SetAnnotations(ctx, opsManager, annotationsToAdd, r.client) @@ -1248,7 +1249,7 @@ func (r *ReconcileAppDbReplicaSet) buildAppDbAutomationConfig(ctx context.Contex // it checks this with the user provided annotation and if the operator has actually performed a force reconfigure already func shouldPerformForcedReconfigure(annotations map[string]string) bool { if val, ok := annotations[ForceReconfigureAnnotation]; ok { - if val == "true" { + if val == trueString { if _, ok := annotations[ForcedReconfigureAlreadyPerformedAnnotation]; !ok { return true } @@ -1427,36 +1428,36 @@ func addMonitoring(ac *automationconfig.AutomationConfig, log *zap.SugaredLogger if len(ac.Processes) == 0 { return } + monitoringVersions := ac.MonitoringVersions for _, p := range ac.Processes { - found := false - for _, m := range monitoringVersions { - if m.Hostname == p.HostName { - found = true - break - } - } - if !found { - monitoringVersion := automationconfig.MonitoringVersion{ - Hostname: p.HostName, + hostname := p.HostName + pemKeyFile := p.Args26.Get("net.tls.certificateKeyFile").String() + + foundIdx := slices.IndexFunc(monitoringVersions, func(m automationconfig.MonitoringVersion) bool { + return m.Hostname == hostname + }) + + if foundIdx == -1 { + mv := automationconfig.MonitoringVersion{ + Hostname: hostname, Name: om.MonitoringAgentDefaultVersion, } if tls { - additionalParams := map[string]string{ - "useSslForAllConnections": "true", - "sslTrustedServerCertificates": appdbCAFilePath, - } - pemKeyFile := p.Args26.Get("net.tls.certificateKeyFile") - if pemKeyFile != nil { - additionalParams["sslClientCertificate"] = pemKeyFile.String() - } - monitoringVersion.AdditionalParams = additionalParams + mv.AdditionalParams = om.NewTLSParams(appdbCAFilePath, pemKeyFile) + } + log.Debugw("Added monitoring agent configuration", "host", hostname, "tls", tls) + monitoringVersions = append(monitoringVersions, mv) + } else { + if tls { + monitoringVersions[foundIdx].AdditionalParams = om.NewTLSParams(appdbCAFilePath, pemKeyFile) + } else { + om.ClearTLSParams(monitoringVersions[foundIdx].AdditionalParams) } - log.Debugw("Added monitoring agent configuration", "host", p.HostName, "tls", tls) - monitoringVersions = append(monitoringVersions, monitoringVersion) } } ac.MonitoringVersions = monitoringVersions + } // registerAppDBHostsWithProject uses the Hosts API to add each process in the AppDB to the project diff --git a/controllers/operator/appdbreplicaset_controller_test.go b/controllers/operator/appdbreplicaset_controller_test.go index df6d6b1ed..15fab0da7 100644 --- a/controllers/operator/appdbreplicaset_controller_test.go +++ b/controllers/operator/appdbreplicaset_controller_test.go @@ -1457,3 +1457,77 @@ func createRunningAppDB(ctx context.Context, t *testing.T, startingMembers int, assert.Equal(t, ok, res) return reconciler } + +// TestClearTLSParams tests CLOUDP-351614 fix: +// When TLS is disabled on AppDB, TLS-specific params should be cleared from +// the monitoring config's additionalParams to prevent the monitoring agent +// from trying to use certificate files that no longer exist. +func TestClearTLSParams(t *testing.T) { + tests := []struct { + name string + input map[string]string + expectedOutput map[string]string + }{ + { + name: "nil map", + input: nil, + expectedOutput: nil, + }, + { + name: "empty map", + input: map[string]string{}, + expectedOutput: map[string]string{}, + }, + { + name: "only TLS params", + input: map[string]string{ + "useSslForAllConnections": "true", + "sslTrustedServerCertificates": "/some/path/ca.pem", + "sslClientCertificate": "/some/path/cert.pem", + }, + expectedOutput: map[string]string{}, + }, + { + name: "mixed params - TLS and non-TLS", + input: map[string]string{ + "useSslForAllConnections": "true", + "sslTrustedServerCertificates": "/some/path/ca.pem", + "sslClientCertificate": "/some/path/cert.pem", + "someOtherParam": "someValue", + "anotherParam": "anotherValue", + }, + expectedOutput: map[string]string{ + "someOtherParam": "someValue", + "anotherParam": "anotherValue", + }, + }, + { + name: "only non-TLS params", + input: map[string]string{ + "someOtherParam": "someValue", + "anotherParam": "anotherValue", + }, + expectedOutput: map[string]string{ + "someOtherParam": "someValue", + "anotherParam": "anotherValue", + }, + }, + { + name: "partial TLS params", + input: map[string]string{ + "useSslForAllConnections": "true", + "someOtherParam": "someValue", + }, + expectedOutput: map[string]string{ + "someOtherParam": "someValue", + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + om.ClearTLSParams(tt.input) + assert.Equal(t, tt.expectedOutput, tt.input) + }) + } +} diff --git a/controllers/operator/common_controller.go b/controllers/operator/common_controller.go index da6e00c1a..2af4690d1 100644 --- a/controllers/operator/common_controller.go +++ b/controllers/operator/common_controller.go @@ -1075,7 +1075,7 @@ func ReconcileReplicaSetAC(ctx context.Context, d om.Deployment, spec mdbv1.DbCo } d.MergeReplicaSet(rs, spec.GetAdditionalMongodConfig().ToMap(), lastMongodConfig, log) - d.AddMonitoringAndBackup(log, spec.GetSecurity().IsTLSEnabled(), caFilePath) + d.ConfigureMonitoringAndBackup(log, spec.GetSecurity().IsTLSEnabled(), caFilePath) d.ConfigureTLS(spec.GetSecurity(), caFilePath) d.ConfigureInternalClusterAuthentication(rs.GetProcessNames(), spec.GetSecurity().GetInternalClusterAuthenticationMode(), internalClusterPath) diff --git a/controllers/operator/mongodbmultireplicaset_controller_test.go b/controllers/operator/mongodbmultireplicaset_controller_test.go index 0f98b1522..13c9a5818 100644 --- a/controllers/operator/mongodbmultireplicaset_controller_test.go +++ b/controllers/operator/mongodbmultireplicaset_controller_test.go @@ -911,7 +911,7 @@ func TestScaling(t *testing.T) { dep, err := omConnectionFactory.GetConnection().ReadDeployment() assert.NoError(t, err) - replicaSets := dep.ReplicaSets() + replicaSets := dep.GetReplicaSets() assert.Len(t, replicaSets, 1) members := replicaSets[0].Members() @@ -932,7 +932,7 @@ func TestScaling(t *testing.T) { dep, err = omConnectionFactory.GetConnection().ReadDeployment() assert.NoError(t, err) - replicaSets = dep.ReplicaSets() + replicaSets = dep.GetReplicaSets() assert.Len(t, replicaSets, 1) members = replicaSets[0].Members() diff --git a/controllers/operator/mongodbshardedcluster_controller.go b/controllers/operator/mongodbshardedcluster_controller.go index 1cb9e6b31..66d954a28 100644 --- a/controllers/operator/mongodbshardedcluster_controller.go +++ b/controllers/operator/mongodbshardedcluster_controller.go @@ -2006,7 +2006,7 @@ func (r *ShardedClusterReconcileHelper) publishDeployment(ctx context.Context, c return err } - d.AddMonitoringAndBackup(log, sc.Spec.GetSecurity().IsTLSEnabled(), opts.caFilePath) + d.ConfigureMonitoringAndBackup(log, sc.Spec.GetSecurity().IsTLSEnabled(), opts.caFilePath) d.ConfigureTLS(sc.Spec.GetSecurity(), opts.caFilePath) setupInternalClusterAuth(d, sc.Name, sc.GetSecurity().GetInternalClusterAuthenticationMode(), diff --git a/controllers/operator/mongodbshardedcluster_controller_multi_test.go b/controllers/operator/mongodbshardedcluster_controller_multi_test.go index 20e5eeee9..4a5a89e14 100644 --- a/controllers/operator/mongodbshardedcluster_controller_multi_test.go +++ b/controllers/operator/mongodbshardedcluster_controller_multi_test.go @@ -1277,7 +1277,7 @@ func TestReconcileForComplexMultiClusterYaml(t *testing.T) { require.NoError(t, err) automationConfig, err := omConnectionFactory.GetConnection().ReadAutomationConfig() require.NoError(t, err) - normalizedActualReplicaSets, err := normalizeObjectToInterfaceMap(map[string]any{"replicaSets": automationConfig.Deployment.ReplicaSets()}) + normalizedActualReplicaSets, err := normalizeObjectToInterfaceMap(map[string]any{"replicaSets": automationConfig.Deployment.GetReplicaSets()}) require.NoError(t, err) if !assert.Equal(t, normalizedExpectedReplicaSets, normalizedActualReplicaSets) { visualDiff, err := getVisualJsonDiff(normalizedExpectedReplicaSets, normalizedActualReplicaSets) diff --git a/controllers/operator/mongodbshardedcluster_controller_test.go b/controllers/operator/mongodbshardedcluster_controller_test.go index c49a2852d..b6c3335b4 100644 --- a/controllers/operator/mongodbshardedcluster_controller_test.go +++ b/controllers/operator/mongodbshardedcluster_controller_test.go @@ -1676,7 +1676,7 @@ func createDeploymentFromShardedCluster(t *testing.T, updatable v1.CustomResourc Finalizing: false, }) assert.NoError(t, err) - d.AddMonitoringAndBackup(zap.S(), sh.Spec.GetSecurity().IsTLSEnabled(), util.CAFilePathInContainer) + d.ConfigureMonitoringAndBackup(zap.S(), sh.Spec.GetSecurity().IsTLSEnabled(), util.CAFilePathInContainer) return d } diff --git a/controllers/operator/mongodbstandalone_controller.go b/controllers/operator/mongodbstandalone_controller.go index 56b4531ca..27f641b68 100644 --- a/controllers/operator/mongodbstandalone_controller.go +++ b/controllers/operator/mongodbstandalone_controller.go @@ -365,7 +365,7 @@ func (r *ReconcileMongoDbStandalone) updateOmDeployment(ctx context.Context, con d.MergeStandalone(standaloneOmObject, s.Spec.AdditionalMongodConfig.ToMap(), lastStandaloneConfig.ToMap(), nil) // TODO change last argument in separate PR - d.AddMonitoringAndBackup(log, s.Spec.GetSecurity().IsTLSEnabled(), util.CAFilePathInContainer) + d.ConfigureMonitoringAndBackup(log, s.Spec.GetSecurity().IsTLSEnabled(), util.CAFilePathInContainer) d.ConfigureTLS(s.Spec.GetSecurity(), util.CAFilePathInContainer) return nil }, diff --git a/controllers/operator/mongodbstandalone_controller_test.go b/controllers/operator/mongodbstandalone_controller_test.go index da9edab70..757ac47aa 100644 --- a/controllers/operator/mongodbstandalone_controller_test.go +++ b/controllers/operator/mongodbstandalone_controller_test.go @@ -458,6 +458,6 @@ func createDeploymentFromStandalone(st *mdbv1.MongoDB) om.Deployment { } d.MergeStandalone(process, st.Spec.AdditionalMongodConfig.ToMap(), lastConfig.ToMap(), nil) - d.AddMonitoringAndBackup(zap.S(), st.Spec.GetSecurity().IsTLSEnabled(), util.CAFilePathInContainer) + d.ConfigureMonitoringAndBackup(zap.S(), st.Spec.GetSecurity().IsTLSEnabled(), util.CAFilePathInContainer) return d }