Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -340,11 +340,7 @@ mockgen: #HELP Generate mocks
# Consider using counterfeiter:generate directives to speed things up.
# See https://github.com/maxbrunsfeld/counterfeiter#step-2b---add-counterfeitergenerate-directives for more information.
# Set the "COUNTERFEITER_NO_GENERATE_WARNING" environment variable to suppress this message.
@set -e; \
overlay_file=$$(mktemp "$(CURDIR)/hack/overlays/goimports_overlay.XXXXXX.json"); \
trap 'rm -f "$$overlay_file"' EXIT; \
printf '{\n "Replace": {\n "%s/vendor/golang.org/x/tools/imports/vendorlesspath.go": "%s/hack/overlays/goimports_vendorlesspath.go"\n }\n}\n' "$(CURDIR)" "$(CURDIR)" > "$$overlay_file"; \
GO111MODULE=on GOWORK=off COUNTERFEITER_NO_GENERATE_WARNING=1 GOFLAGS="$$GOFLAGS -overlay=$$overlay_file" go generate ./pkg/...
GO111MODULE=on GOWORK=off COUNTERFEITER_NO_GENERATE_WARNING=1 go generate ./pkg/...

#SECTION Verification

Expand Down
14 changes: 0 additions & 14 deletions hack/overlays/goimports_vendorlesspath.go

This file was deleted.

84 changes: 72 additions & 12 deletions pkg/controller/operators/olm/apiservices.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,9 +213,23 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
}

// Check if deployment is being updated or rolling out
if deployment.Status.UnavailableReplicas > 0 ||
deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
// This includes several scenarios:
// 1. UnavailableReplicas > 0: Some replicas are not ready
// 2. UpdatedReplicas < Replicas: Rollout in progress
// 3. Generation != ObservedGeneration: Spec changed but not yet observed
// 4. AvailableReplicas < desired: Not all replicas are available yet
isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
deployment.Generation != deployment.Status.ObservedGeneration ||
deployment.Status.AvailableReplicas < *deployment.Spec.Replicas

if isRollingOut {
a.logger.Debugf("Deployment %s is rolling out or has unavailable replicas (unavailable=%d, updated=%d/%d, available=%d/%d, generation=%d/%d), likely due to pod disruption",
deploymentSpec.Name,
deployment.Status.UnavailableReplicas,
deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
deployment.Status.AvailableReplicas, *deployment.Spec.Replicas,
deployment.Status.ObservedGeneration, deployment.Generation)

// Check pod status to confirm disruption
selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
Expand All @@ -230,6 +244,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
continue
}

// For single-replica deployments during rollout, if no pods exist yet,
// this is likely the brief window where the old pod is gone and new pod
// hasn't been created yet. This is expected disruption during upgrade.
// According to the OpenShift contract: "A component must not report Available=False
// during the course of a normal upgrade."
if len(pods) == 0 && *deployment.Spec.Replicas == 1 && isRollingOut {
a.logger.Infof("Single-replica deployment %s is rolling out with no pods yet - expected disruption during upgrade, will not mark CSV as Failed", deploymentSpec.Name)
return true
}

// Track if we found any real failures or expected disruptions
foundExpectedDisruption := false
foundRealFailure := false

// Check if any pod is in expected disruption state
for _, pod := range pods {
// Check how long the pod has been in disrupted state
Expand All @@ -244,7 +272,8 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
// Pod is terminating (DeletionTimestamp is set)
if pod.DeletionTimestamp != nil {
a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
return true
foundExpectedDisruption = true
continue
}

// For pending pods, we need to distinguish between expected disruption
Expand All @@ -257,11 +286,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
// Check pod conditions for scheduling issues
for _, condition := range pod.Status.Conditions {
if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
// If pod has been unschedulable for a while, it's likely a real issue
// not a temporary disruption from cluster upgrade
if condition.Reason == "Unschedulable" {
isRealFailure = true
a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
// CRITICAL: In single-replica deployments during rollout, Unschedulable is EXPECTED
// due to PodAntiAffinity preventing new pod from scheduling while old pod is terminating.
// This is especially common in single-node clusters or control plane scenarios.
// Per OpenShift contract: "A component must not report Available=False during normal upgrade."
if *deployment.Spec.Replicas == 1 && isRollingOut {
a.logger.Infof("Pod %s is unschedulable during single-replica rollout - likely PodAntiAffinity conflict, treating as expected disruption", pod.Name)
isExpectedDisruption = true
} else {
// Multi-replica or non-rollout Unschedulable is a real issue
isRealFailure = true
foundRealFailure = true
a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
}
break
}
}
Expand All @@ -278,6 +316,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
// These are real failures, not temporary disruptions
isRealFailure = true
foundRealFailure = true
a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
}
}
Expand All @@ -292,6 +331,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
isExpectedDisruption = true
case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
isRealFailure = true
foundRealFailure = true
a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
}
}
Expand All @@ -302,17 +342,19 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
continue
}

// If it's in expected disruption state, return true
// If it's in expected disruption state, mark it
if isExpectedDisruption {
a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
return true
foundExpectedDisruption = true
continue
}

// If pending without clear container status, check if it's just being scheduled
// This could be normal pod creation during node drain
if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
return true
foundExpectedDisruption = true
continue
}
}

Expand All @@ -323,14 +365,32 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
switch reason {
case "ContainerCreating", "PodInitializing":
a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
return true
foundExpectedDisruption = true
case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
// Real failures - don't treat as disruption
a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
foundRealFailure = true
}
}
}
}

// After checking all pods, make a decision
// If we found expected disruption and no real failures, treat as disruption
if foundExpectedDisruption && !foundRealFailure {
a.logger.Infof("Deployment %s has pods in expected disruption state - will not mark CSV as Failed per Available contract", deploymentSpec.Name)
return true
}

// For single-replica deployments during rollout, if we found no real failures,
// treat as expected disruption to comply with the OpenShift contract:
// "A component must not report Available=False during the course of a normal upgrade."
// Single-replica deployments inherently have unavailability during rollout,
// but this is acceptable and should not trigger Available=False.
if !foundRealFailure && *deployment.Spec.Replicas == 1 && isRollingOut {
a.logger.Infof("Single-replica deployment %s is rolling out - treating as expected disruption per Available contract", deploymentSpec.Name)
return true
}
}
}

Expand Down
Loading