Skip to content

Commit 9df6415

Browse files
committed
CLOUDP-351614: Fix monitoring failure after disabling TLS on AppDB
When TLS is disabled on AppDB, the operator now correctly clears stale TLS parameters (additionalParams) from the monitoring configuration. Before this fix, stale TLS params like useSslForAllConnections and sslClientCertificate would remain in the monitoring config after TLS was disabled, causing monitoring agents to fail when trying to use certificate files that are no longer valid for authentication. Changes: - deployment.go: Clear additionalParams when TLS is disabled - appdbreplicaset_controller.go: Use correct TLS state for monitoring - Added e2e test (e2e_om_appdb_tls_disable) that verifies: 1. TLS params are present when TLS is enabled 2. TLS params are cleared after TLS is disabled 3. Monitoring continues to work after TLS disable
1 parent 2bf57af commit 9df6415

File tree

8 files changed

+284
-3
lines changed

8 files changed

+284
-3
lines changed

.evergreen-tasks.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -783,6 +783,11 @@ tasks:
783783
commands:
784784
- func: "e2e_test"
785785

786+
- name: e2e_om_appdb_tls_disable
787+
tags: [ "patch-run" ]
788+
commands:
789+
- func: "e2e_test"
790+
786791
- name: e2e_om_appdb_multi_change
787792
tags: [ "patch-run" ]
788793
commands:

.evergreen.yml

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -968,6 +968,7 @@ task_groups:
968968
- e2e_om_appdb_flags_and_config
969969
- e2e_om_appdb_upgrade
970970
- e2e_om_appdb_monitoring_tls
971+
- e2e_om_appdb_tls_disable
971972
- e2e_om_ops_manager_backup
972973
- e2e_om_ops_manager_backup_light
973974
- e2e_om_ops_manager_backup_liveness_probe
@@ -1022,6 +1023,7 @@ task_groups:
10221023
- e2e_om_appdb_flags_and_config
10231024
- e2e_om_appdb_upgrade
10241025
- e2e_om_appdb_monitoring_tls
1026+
- e2e_om_appdb_tls_disable
10251027
- e2e_om_ops_manager_backup
10261028
- e2e_om_ops_manager_backup_light
10271029
- e2e_om_ops_manager_backup_liveness_probe
@@ -1076,6 +1078,7 @@ task_groups:
10761078
- e2e_om_appdb_external_connectivity
10771079
- e2e_om_appdb_flags_and_config
10781080
- e2e_om_appdb_monitoring_tls
1081+
- e2e_om_appdb_tls_disable
10791082
- e2e_om_appdb_multi_change
10801083
- e2e_om_appdb_scale_up_down
10811084
- e2e_om_appdb_upgrade
@@ -1120,6 +1123,7 @@ task_groups:
11201123
- e2e_om_appdb_external_connectivity
11211124
- e2e_om_appdb_flags_and_config
11221125
- e2e_om_appdb_monitoring_tls
1126+
- e2e_om_appdb_tls_disable
11231127
- e2e_om_appdb_multi_change
11241128
- e2e_om_appdb_scale_up_down
11251129
- e2e_om_appdb_upgrade
@@ -1439,7 +1443,21 @@ buildvariants:
14391443
tags: [ "pr_patch", "staging", "e2e_test_suite", "static" ]
14401444
run_on:
14411445
- ubuntu2404-large
1442-
<<: *base_om8_dependency
1446+
depends_on:
1447+
- name: build_om_images
1448+
variant: build_om80_images
1449+
- name: build_operator_ubi
1450+
variant: init_test_run
1451+
- name: build_init_database_image_ubi
1452+
variant: init_test_run
1453+
- name: build_test_image
1454+
variant: init_test_run
1455+
- name: build_init_appdb_images_ubi
1456+
variant: init_test_run
1457+
- name: build_init_om_images_ubi
1458+
variant: init_test_run
1459+
- name: publish_helm_chart
1460+
variant: init_test_run
14431461
tasks:
14441462
- name: e2e_static_ops_manager_kind_only_task_group
14451463
- name: e2e_static_ops_manager_kind_6_0_only_task_group
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
kind: fix
3+
date: 2025-12-16
4+
---
5+
6+
* Fixed an issue where monitoring agents would fail after disabling TLS on a MongoDB deployment.

controllers/om/deployment.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,10 @@ func (d Deployment) AddMonitoring(log *zap.SugaredLogger, tls bool, caFilePath s
317317
}
318318

319319
monitoringVersion["additionalParams"] = additionalParams
320+
} else {
321+
// Clear TLS params when TLS is disabled to prevent monitoring from
322+
// trying to use certificate files that no longer exist
323+
delete(monitoringVersion, "additionalParams")
320324
}
321325

322326
}

controllers/om/deployment_test.go

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -528,10 +528,39 @@ func TestAddMonitoringTls(t *testing.T) {
528528
assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions())
529529

530530
// adding again - nothing changes
531-
d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer)
531+
d.AddMonitoring(zap.S(), true, util.CAFilePathInContainer)
532532
assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions())
533533
}
534534

535+
func TestAddMonitoringTLSDisable(t *testing.T) {
536+
d := NewDeployment()
537+
538+
rs0 := buildRsByProcesses("my-rs", createReplicaSetProcessesCount(3, "my-rs"))
539+
d.MergeReplicaSet(rs0, nil, nil, zap.S())
540+
d.AddMonitoring(zap.S(), true, util.CAFilePathInContainer)
541+
542+
// verify TLS is present in additionalParams
543+
expectedAdditionalParams := map[string]string{
544+
"useSslForAllConnections": "true",
545+
"sslTrustedServerCertificates": util.CAFilePathInContainer,
546+
}
547+
expectedMonitoringVersionsWithTls := []interface{}{
548+
map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams},
549+
map[string]interface{}{"hostname": "my-rs-1.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams},
550+
map[string]interface{}{"hostname": "my-rs-2.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams},
551+
}
552+
assert.Equal(t, expectedMonitoringVersionsWithTls, d.getMonitoringVersions())
553+
554+
// disabling TLS should clear additionalParams (CLOUDP-351614)
555+
d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer)
556+
expectedMonitoringVersionsWithoutTls := []interface{}{
557+
map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion},
558+
map[string]interface{}{"hostname": "my-rs-1.some.host", "name": MonitoringAgentDefaultVersion},
559+
map[string]interface{}{"hostname": "my-rs-2.some.host", "name": MonitoringAgentDefaultVersion},
560+
}
561+
assert.Equal(t, expectedMonitoringVersionsWithoutTls, d.getMonitoringVersions())
562+
}
563+
535564
func TestAddBackup(t *testing.T) {
536565
d := NewDeployment()
537566

controllers/operator/appdbreplicaset_controller.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1430,9 +1430,14 @@ func addMonitoring(ac *automationconfig.AutomationConfig, log *zap.SugaredLogger
14301430
monitoringVersions := ac.MonitoringVersions
14311431
for _, p := range ac.Processes {
14321432
found := false
1433-
for _, m := range monitoringVersions {
1433+
for i, m := range monitoringVersions {
14341434
if m.Hostname == p.HostName {
14351435
found = true
1436+
if !tls {
1437+
// Clear TLS params when TLS is disabled to prevent monitoring from
1438+
// trying to use certificate files that no longer exist
1439+
monitoringVersions[i].AdditionalParams = nil
1440+
}
14361441
break
14371442
}
14381443
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
apiVersion: mongodb.com/v1
2+
kind: MongoDBOpsManager
3+
metadata:
4+
name: om-tls-disable-test
5+
spec:
6+
replicas: 1
7+
version: 4.4.1
8+
adminCredentials: ops-manager-admin-secret
9+
security:
10+
tls:
11+
secretRef:
12+
name: certs-for-ops-manager
13+
ca: issuer-ca
14+
backup:
15+
enabled: false
16+
applicationDatabase:
17+
members: 3
18+
version: 5.0.14-ent
19+
security:
20+
certsSecretPrefix: appdb
21+
tls:
22+
ca: issuer-ca
23+
24+
# adding this just to avoid wizard when opening OM UI
25+
configuration:
26+
automation.versions.source: mongodb
27+
mms.adminEmailAddr: [email protected]
28+
mms.fromEmailAddr: [email protected]
29+
mms.ignoreInitialUiSetup: "true"
30+
mms.mail.hostname: email-smtp.us-east-1.amazonaws.com
31+
mms.mail.port: "465"
32+
mms.mail.ssl: "true"
33+
mms.mail.transport: smtp
34+
mms.minimumTLSVersion: TLSv1.2
35+
mms.replyToEmailAddr: [email protected]
Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
"""
2+
CLOUDP-351614: Test that TLS can be disabled on AppDB without breaking monitoring.
3+
4+
This is a dedicated test file to verify that when TLS is disabled on AppDB,
5+
the operator correctly clears stale TLS params from the monitoring configuration.
6+
"""
7+
from typing import Optional
8+
9+
from kubetester import create_or_update_secret, try_load
10+
from kubetester.certs import create_ops_manager_tls_certs
11+
from kubetester.kubetester import fixture as yaml_fixture
12+
from kubetester.opsmanager import MongoDBOpsManager
13+
from kubetester.phase import Phase
14+
from pytest import fixture, mark
15+
from tests.common.cert.cert_issuer import create_appdb_certs
16+
from tests.conftest import is_multi_cluster
17+
from tests.opsmanager.withMonitoredAppDB.conftest import enable_multi_cluster_deployment
18+
19+
20+
def get_monitoring_tls_params(ops_manager: MongoDBOpsManager) -> dict:
21+
"""
22+
Extract TLS-related additionalParams from monitoring config.
23+
24+
Returns a dict with the additionalParams for all monitoring agents,
25+
keyed by hostname. An empty dict means no TLS params are present.
26+
"""
27+
ac = ops_manager.get_automation_config_tester()
28+
monitoring_versions = ac.automation_config.get("monitoringVersions", [])
29+
30+
tls_params_by_host = {}
31+
for mv in monitoring_versions:
32+
hostname = mv.get("hostname", "unknown")
33+
additional_params = mv.get("additionalParams", {})
34+
if additional_params:
35+
tls_params_by_host[hostname] = additional_params
36+
37+
return tls_params_by_host
38+
39+
OM_NAME = "om-tls-disable-test"
40+
APPDB_NAME = f"{OM_NAME}-db"
41+
42+
43+
@fixture(scope="module")
44+
def ops_manager_certs(namespace: str, issuer: str):
45+
return create_ops_manager_tls_certs(issuer, namespace, OM_NAME)
46+
47+
48+
@fixture(scope="module")
49+
def appdb_certs(namespace: str, issuer: str):
50+
return create_appdb_certs(namespace, issuer, APPDB_NAME)
51+
52+
53+
@fixture(scope="module")
54+
@mark.usefixtures("appdb_certs", "ops_manager_certs", "issuer_ca_configmap")
55+
def ops_manager(
56+
namespace: str,
57+
issuer_ca_configmap: str,
58+
appdb_certs: str,
59+
ops_manager_certs: str,
60+
custom_version: Optional[str],
61+
custom_appdb_version: str,
62+
) -> MongoDBOpsManager:
63+
"""Create an Ops Manager with TLS-enabled AppDB for testing TLS disable."""
64+
om = MongoDBOpsManager.from_yaml(yaml_fixture("om_appdb_tls_disable.yaml"), namespace=namespace)
65+
om.set_version(custom_version)
66+
om.set_appdb_version(custom_appdb_version)
67+
68+
if try_load(om):
69+
return om
70+
71+
if is_multi_cluster():
72+
enable_multi_cluster_deployment(om)
73+
74+
om.update()
75+
return om
76+
77+
78+
@mark.e2e_om_appdb_tls_disable
79+
def test_om_created_with_tls(ops_manager: MongoDBOpsManager):
80+
"""Verify OM and AppDB are running with TLS enabled."""
81+
ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=900)
82+
ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600)
83+
84+
85+
@mark.e2e_om_appdb_tls_disable
86+
def test_appdb_monitoring_works_with_tls(ops_manager: MongoDBOpsManager):
87+
"""Verify monitoring works with TLS enabled before we disable it."""
88+
ops_manager.assert_appdb_monitoring_group_was_created()
89+
ops_manager.assert_monitoring_data_exists(timeout=600, all_hosts=False)
90+
91+
92+
@mark.e2e_om_appdb_tls_disable
93+
def test_monitoring_config_has_tls_params_when_tls_enabled(ops_manager: MongoDBOpsManager):
94+
"""
95+
CLOUDP-351614: Verify that monitoring config contains TLS params when TLS is enabled.
96+
97+
When TLS is enabled on AppDB, the monitoring agents should be configured with
98+
TLS parameters in additionalParams, including:
99+
- useSslForAllConnections: "true"
100+
- sslTrustedServerCertificates: path to CA file
101+
- sslClientCertificate: path to client certificate (for x509 auth)
102+
"""
103+
tls_params = get_monitoring_tls_params(ops_manager)
104+
105+
# Monitoring agents should have TLS params configured
106+
assert len(tls_params) > 0, "Expected TLS params in monitoring config when TLS is enabled"
107+
108+
# Verify TLS params contain expected keys
109+
for hostname, params in tls_params.items():
110+
assert params.get("useSslForAllConnections") == "true", (
111+
f"Expected useSslForAllConnections=true for {hostname}, got {params}"
112+
)
113+
assert "sslTrustedServerCertificates" in params, (
114+
f"Expected sslTrustedServerCertificates in params for {hostname}"
115+
)
116+
117+
118+
@mark.e2e_om_appdb_tls_disable
119+
def test_disable_tls_on_appdb(ops_manager: MongoDBOpsManager):
120+
"""
121+
CLOUDP-351614: Disable TLS on AppDB and verify the operator correctly handles
122+
the transition without leaving stale TLS params in monitoring config.
123+
124+
This test disables TLS by setting tls.enabled = False. The operator handles
125+
the TLS mode transition: requireTLS -> preferTLS -> allowTLS -> disabled.
126+
127+
Note: We keep the certsSecretPrefix in place because:
128+
1. The cert files are needed during the TLS transition
129+
2. Removing certsSecretPrefix while TLS mode is still transitioning causes failures
130+
3. The certs are harmless to keep after TLS is disabled (just unused)
131+
"""
132+
ops_manager.load()
133+
134+
# Disable TLS mode (keeping certs for the transition)
135+
# The operator will handle the TLS mode transition: requireTLS -> preferTLS -> allowTLS -> disabled
136+
ops_manager["spec"]["applicationDatabase"]["security"]["tls"]["enabled"] = False
137+
ops_manager.update()
138+
139+
# Wait for AppDB to reach Running state after TLS mode is fully disabled
140+
# Use a longer timeout as the transition goes through multiple TLS modes
141+
ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=1800)
142+
143+
144+
@mark.e2e_om_appdb_tls_disable
145+
def test_monitoring_config_tls_params_cleared_after_tls_disabled(ops_manager: MongoDBOpsManager):
146+
"""
147+
CLOUDP-351614: Verify that TLS params are CLEARED from monitoring config after TLS is disabled.
148+
149+
THIS IS THE KEY TEST FOR THE BUG FIX:
150+
Before the fix in CLOUDP-351614, the operator would leave stale TLS params
151+
(useSslForAllConnections, sslClientCertificate, etc.) in the monitoring config
152+
even after TLS was disabled. This caused monitoring agents to fail because they
153+
would try to use TLS certificate files that are no longer valid for authentication.
154+
155+
After the fix, the operator correctly clears these params by calling:
156+
delete(monitoringVersion, "additionalParams")
157+
"""
158+
tls_params = get_monitoring_tls_params(ops_manager)
159+
160+
# After TLS is disabled, monitoring config should NOT have TLS params
161+
# This assertion would have FAILED before the fix because stale TLS params remained
162+
assert len(tls_params) == 0, (
163+
f"CLOUDP-351614 BUG: TLS params should be cleared from monitoring config after "
164+
f"TLS is disabled, but found stale params: {tls_params}"
165+
)
166+
167+
168+
@mark.e2e_om_appdb_tls_disable
169+
def test_monitoring_works_after_tls_disable(ops_manager: MongoDBOpsManager):
170+
"""
171+
CLOUDP-351614: Verify monitoring data can still be collected after TLS disable.
172+
173+
After TLS is disabled, monitoring agents switch from x509 to SCRAM authentication.
174+
This test verifies that:
175+
1. The operator correctly cleared stale TLS params from monitoring config
176+
2. Monitoring agents can reconnect and collect data
177+
"""
178+
# Use a longer timeout as monitoring agents need time to reconnect with SCRAM auth
179+
ops_manager.assert_monitoring_data_exists(timeout=1200, all_hosts=False)

0 commit comments

Comments
 (0)