Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .evergreen-tasks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -783,6 +783,11 @@ tasks:
commands:
- func: "e2e_test"

- name: e2e_om_appdb_tls_disable
tags: [ "patch-run" ]
commands:
- func: "e2e_test"

- name: e2e_om_appdb_multi_change
tags: [ "patch-run" ]
commands:
Expand Down
20 changes: 19 additions & 1 deletion .evergreen.yml
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,7 @@ task_groups:
- e2e_om_appdb_flags_and_config
- e2e_om_appdb_upgrade
- e2e_om_appdb_monitoring_tls
- e2e_om_appdb_tls_disable
- e2e_om_ops_manager_backup
- e2e_om_ops_manager_backup_light
- e2e_om_ops_manager_backup_liveness_probe
Expand Down Expand Up @@ -1022,6 +1023,7 @@ task_groups:
- e2e_om_appdb_flags_and_config
- e2e_om_appdb_upgrade
- e2e_om_appdb_monitoring_tls
- e2e_om_appdb_tls_disable
- e2e_om_ops_manager_backup
- e2e_om_ops_manager_backup_light
- e2e_om_ops_manager_backup_liveness_probe
Expand Down Expand Up @@ -1076,6 +1078,7 @@ task_groups:
- e2e_om_appdb_external_connectivity
- e2e_om_appdb_flags_and_config
- e2e_om_appdb_monitoring_tls
- e2e_om_appdb_tls_disable
- e2e_om_appdb_multi_change
- e2e_om_appdb_scale_up_down
- e2e_om_appdb_upgrade
Expand Down Expand Up @@ -1120,6 +1123,7 @@ task_groups:
- e2e_om_appdb_external_connectivity
- e2e_om_appdb_flags_and_config
- e2e_om_appdb_monitoring_tls
- e2e_om_appdb_tls_disable
- e2e_om_appdb_multi_change
- e2e_om_appdb_scale_up_down
- e2e_om_appdb_upgrade
Expand Down Expand Up @@ -1439,7 +1443,21 @@ buildvariants:
tags: [ "pr_patch", "staging", "e2e_test_suite", "static" ]
run_on:
- ubuntu2404-large
<<: *base_om8_dependency
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

revert me once the image build has been fixed for the database image

depends_on:
- name: build_om_images
variant: build_om80_images
- name: build_operator_ubi
variant: init_test_run
- name: build_init_database_image_ubi
variant: init_test_run
- name: build_test_image
variant: init_test_run
- name: build_init_appdb_images_ubi
variant: init_test_run
- name: build_init_om_images_ubi
variant: init_test_run
- name: publish_helm_chart
variant: init_test_run
tasks:
- name: e2e_static_ops_manager_kind_only_task_group
- name: e2e_static_ops_manager_kind_6_0_only_task_group
Expand Down
6 changes: 6 additions & 0 deletions changelog/20251216_fix_tls_monitoring.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
kind: fix
date: 2025-12-16
---

* Fixed an issue where monitoring agents would fail after disabling TLS on a MongoDB deployment.
4 changes: 4 additions & 0 deletions controllers/om/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,10 @@ func (d Deployment) AddMonitoring(log *zap.SugaredLogger, tls bool, caFilePath s
}

monitoringVersion["additionalParams"] = additionalParams
} else {
// Clear TLS params when TLS is disabled to prevent monitoring from
// trying to use certificate files that no longer exist
delete(monitoringVersion, "additionalParams")
}

}
Expand Down
31 changes: 30 additions & 1 deletion controllers/om/deployment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -528,10 +528,39 @@ func TestAddMonitoringTls(t *testing.T) {
assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions())

// adding again - nothing changes
d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer)
d.AddMonitoring(zap.S(), true, util.CAFilePathInContainer)
assert.Equal(t, expectedMonitoringVersions, d.getMonitoringVersions())
}

func TestAddMonitoringTLSDisable(t *testing.T) {
d := NewDeployment()

rs0 := buildRsByProcesses("my-rs", createReplicaSetProcessesCount(3, "my-rs"))
d.MergeReplicaSet(rs0, nil, nil, zap.S())
d.AddMonitoring(zap.S(), true, util.CAFilePathInContainer)

// verify TLS is present in additionalParams
expectedAdditionalParams := map[string]string{
"useSslForAllConnections": "true",
"sslTrustedServerCertificates": util.CAFilePathInContainer,
}
expectedMonitoringVersionsWithTls := []interface{}{
map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams},
map[string]interface{}{"hostname": "my-rs-1.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams},
map[string]interface{}{"hostname": "my-rs-2.some.host", "name": MonitoringAgentDefaultVersion, "additionalParams": expectedAdditionalParams},
}
assert.Equal(t, expectedMonitoringVersionsWithTls, d.getMonitoringVersions())

// disabling TLS should clear additionalParams (CLOUDP-351614)
d.AddMonitoring(zap.S(), false, util.CAFilePathInContainer)
expectedMonitoringVersionsWithoutTls := []interface{}{
map[string]interface{}{"hostname": "my-rs-0.some.host", "name": MonitoringAgentDefaultVersion},
map[string]interface{}{"hostname": "my-rs-1.some.host", "name": MonitoringAgentDefaultVersion},
map[string]interface{}{"hostname": "my-rs-2.some.host", "name": MonitoringAgentDefaultVersion},
}
assert.Equal(t, expectedMonitoringVersionsWithoutTls, d.getMonitoringVersions())
}

func TestAddBackup(t *testing.T) {
d := NewDeployment()

Expand Down
7 changes: 6 additions & 1 deletion controllers/operator/appdbreplicaset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -1430,9 +1430,14 @@ func addMonitoring(ac *automationconfig.AutomationConfig, log *zap.SugaredLogger
monitoringVersions := ac.MonitoringVersions
for _, p := range ac.Processes {
found := false
for _, m := range monitoringVersions {
for i, m := range monitoringVersions {
if m.Hostname == p.HostName {
found = true
if !tls {
// Clear TLS params when TLS is disabled to prevent monitoring from
// trying to use certificate files that no longer exist
monitoringVersions[i].AdditionalParams = nil
}
break
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
apiVersion: mongodb.com/v1
kind: MongoDBOpsManager
metadata:
name: om-tls-disable-test
spec:
replicas: 1
version: 4.4.1
adminCredentials: ops-manager-admin-secret
security:
tls:
secretRef:
name: certs-for-ops-manager
ca: issuer-ca
backup:
enabled: false
applicationDatabase:
members: 3
version: 5.0.14-ent
security:
certsSecretPrefix: appdb
tls:
ca: issuer-ca

# adding this just to avoid wizard when opening OM UI
configuration:
automation.versions.source: mongodb
mms.adminEmailAddr: [email protected]
mms.fromEmailAddr: [email protected]
mms.ignoreInitialUiSetup: "true"
mms.mail.hostname: email-smtp.us-east-1.amazonaws.com
mms.mail.port: "465"
mms.mail.ssl: "true"
mms.mail.transport: smtp
mms.minimumTLSVersion: TLSv1.2
mms.replyToEmailAddr: [email protected]
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
"""
CLOUDP-351614: Test that TLS can be disabled on AppDB without breaking monitoring.

This is a dedicated test file to verify that when TLS is disabled on AppDB,
the operator correctly clears stale TLS params from the monitoring configuration.
"""
from typing import Optional

from kubetester import create_or_update_secret, try_load
from kubetester.certs import create_ops_manager_tls_certs
from kubetester.kubetester import fixture as yaml_fixture
from kubetester.opsmanager import MongoDBOpsManager
from kubetester.phase import Phase
from pytest import fixture, mark
from tests.common.cert.cert_issuer import create_appdb_certs
from tests.conftest import is_multi_cluster
from tests.opsmanager.withMonitoredAppDB.conftest import enable_multi_cluster_deployment


def get_monitoring_tls_params(ops_manager: MongoDBOpsManager) -> dict:
"""
Extract TLS-related additionalParams from monitoring config via OM API.

Returns a dict with the additionalParams for all monitoring agents,
keyed by hostname. An empty dict means no TLS params are present.

Note: We use get_om_tester() to access the full automation config from
Ops Manager API, which includes monitoringVersions. The K8s secret-based
get_automation_config_tester() only has the cluster config for agents.
"""
om_tester = ops_manager.get_om_tester(ops_manager.app_db_name())
ac = om_tester.get_automation_config_tester()
monitoring_versions = ac.automation_config.get("monitoringVersions", [])

tls_params_by_host = {}
for mv in monitoring_versions:
hostname = mv.get("hostname", "unknown")
additional_params = mv.get("additionalParams", {})
if additional_params:
tls_params_by_host[hostname] = additional_params

return tls_params_by_host

OM_NAME = "om-tls-disable-test"
APPDB_NAME = f"{OM_NAME}-db"


@fixture(scope="module")
def ops_manager_certs(namespace: str, issuer: str):
return create_ops_manager_tls_certs(issuer, namespace, OM_NAME)


@fixture(scope="module")
def appdb_certs(namespace: str, issuer: str):
return create_appdb_certs(namespace, issuer, APPDB_NAME)


@fixture(scope="module")
@mark.usefixtures("appdb_certs", "ops_manager_certs", "issuer_ca_configmap")
def ops_manager(
namespace: str,
issuer_ca_configmap: str,
appdb_certs: str,
ops_manager_certs: str,
custom_version: Optional[str],
custom_appdb_version: str,
) -> MongoDBOpsManager:
"""Create an Ops Manager with TLS-enabled AppDB for testing TLS disable."""
om = MongoDBOpsManager.from_yaml(yaml_fixture("om_appdb_tls_disable.yaml"), namespace=namespace)
om.set_version(custom_version)
om.set_appdb_version(custom_appdb_version)

if try_load(om):
return om

if is_multi_cluster():
enable_multi_cluster_deployment(om)

om.update()
return om


@mark.e2e_om_appdb_tls_disable
def test_om_created_with_tls(ops_manager: MongoDBOpsManager):
"""Verify OM and AppDB are running with TLS enabled."""
ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=900)
ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600)


@mark.e2e_om_appdb_tls_disable
def test_appdb_monitoring_works_with_tls(ops_manager: MongoDBOpsManager):
"""Verify monitoring works with TLS enabled before we disable it."""
ops_manager.assert_appdb_monitoring_group_was_created()
ops_manager.assert_monitoring_data_exists(timeout=600, all_hosts=False)


@mark.e2e_om_appdb_tls_disable
def test_monitoring_config_has_tls_params_when_tls_enabled(ops_manager: MongoDBOpsManager):
"""
CLOUDP-351614: Verify that monitoring config contains TLS params when TLS is enabled.

When TLS is enabled on AppDB, the monitoring agents should be configured with
TLS parameters in additionalParams, including:
- useSslForAllConnections: "true"
- sslTrustedServerCertificates: path to CA file
- sslClientCertificate: path to client certificate (for x509 auth)
"""
tls_params = get_monitoring_tls_params(ops_manager)

# Monitoring agents should have TLS params configured
assert len(tls_params) > 0, "Expected TLS params in monitoring config when TLS is enabled"

# Verify TLS params contain expected keys
for hostname, params in tls_params.items():
assert params.get("useSslForAllConnections") == "true", (
f"Expected useSslForAllConnections=true for {hostname}, got {params}"
)
assert "sslTrustedServerCertificates" in params, (
f"Expected sslTrustedServerCertificates in params for {hostname}"
)


@mark.e2e_om_appdb_tls_disable
def test_disable_tls_on_appdb(ops_manager: MongoDBOpsManager):
"""
CLOUDP-351614: Disable TLS on AppDB and verify the operator correctly handles
the transition without leaving stale TLS params in monitoring config.

This test disables TLS by setting tls.enabled = False. The operator handles
the TLS mode transition: requireTLS -> preferTLS -> allowTLS -> disabled.

Note: We keep the certsSecretPrefix in place because:
1. The cert files are needed during the TLS transition
2. Removing certsSecretPrefix while TLS mode is still transitioning causes failures
3. The certs are harmless to keep after TLS is disabled (just unused)
"""
ops_manager.load()

# Disable TLS mode (keeping certs for the transition)
# The operator will handle the TLS mode transition: requireTLS -> preferTLS -> allowTLS -> disabled
ops_manager["spec"]["applicationDatabase"]["security"]["tls"]["enabled"] = False
ops_manager.update()

# Wait for AppDB to reach Running state after TLS mode is fully disabled
# Use a longer timeout as the transition goes through multiple TLS modes
ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=1800)


@mark.e2e_om_appdb_tls_disable
def test_monitoring_config_tls_params_cleared_after_tls_disabled(ops_manager: MongoDBOpsManager):
"""
CLOUDP-351614: Verify that TLS params are CLEARED from monitoring config after TLS is disabled.

THIS IS THE KEY TEST FOR THE BUG FIX:
Before the fix in CLOUDP-351614, the operator would leave stale TLS params
(useSslForAllConnections, sslClientCertificate, etc.) in the monitoring config
even after TLS was disabled. This caused monitoring agents to fail because they
would try to use TLS certificate files that are no longer valid for authentication.

After the fix, the operator correctly clears these params by calling:
delete(monitoringVersion, "additionalParams")
"""
tls_params = get_monitoring_tls_params(ops_manager)

# After TLS is disabled, monitoring config should NOT have TLS params
# This assertion would have FAILED before the fix because stale TLS params remained
assert len(tls_params) == 0, (
f"CLOUDP-351614 BUG: TLS params should be cleared from monitoring config after "
f"TLS is disabled, but found stale params: {tls_params}"
)


@mark.e2e_om_appdb_tls_disable
def test_monitoring_works_after_tls_disable(ops_manager: MongoDBOpsManager):
"""
CLOUDP-351614: Verify monitoring data can still be collected after TLS disable.

After TLS is disabled, monitoring agents switch from x509 to SCRAM authentication.
This test verifies that:
1. The operator correctly cleared stale TLS params from monitoring config
2. Monitoring agents can reconnect and collect data
"""
# Use a longer timeout as monitoring agents need time to reconnect with SCRAM auth
ops_manager.assert_monitoring_data_exists(timeout=1200, all_hosts=False)