operator-framework · jianzhangbjz · Dec 11, 2025 · Dec 11, 2025
diff --git a/Makefile b/Makefile
@@ -340,11 +340,7 @@ mockgen: #HELP Generate mocks
 	# Consider using counterfeiter:generate directives to speed things up.
 	# See https://github.com/maxbrunsfeld/counterfeiter#step-2b---add-counterfeitergenerate-directives for more information.
 	# Set the "COUNTERFEITER_NO_GENERATE_WARNING" environment variable to suppress this message.
-	@set -e; \
-	overlay_file=$$(mktemp "$(CURDIR)/hack/overlays/goimports_overlay.XXXXXX.json"); \
-	trap 'rm -f "$$overlay_file"' EXIT; \
-	printf '{\n  "Replace": {\n    "%s/vendor/golang.org/x/tools/imports/vendorlesspath.go": "%s/hack/overlays/goimports_vendorlesspath.go"\n  }\n}\n' "$(CURDIR)" "$(CURDIR)" > "$$overlay_file"; \
-	GO111MODULE=on GOWORK=off COUNTERFEITER_NO_GENERATE_WARNING=1 GOFLAGS="$$GOFLAGS -overlay=$$overlay_file" go generate ./pkg/...
+	GO111MODULE=on GOWORK=off COUNTERFEITER_NO_GENERATE_WARNING=1 go generate ./pkg/...
 
 #SECTION Verification
 

diff --git a/hack/overlays/goimports_vendorlesspath.go b/hack/overlays/goimports_vendorlesspath.go
diff --git a/pkg/controller/operators/olm/apiservices.go b/pkg/controller/operators/olm/apiservices.go
@@ -213,9 +213,23 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 		}
 
 		// Check if deployment is being updated or rolling out
-		if deployment.Status.UnavailableReplicas > 0 ||
-			deployment.Status.UpdatedReplicas < deployment.Status.Replicas {
-			a.logger.Debugf("Deployment %s has unavailable replicas, likely due to pod disruption", deploymentSpec.Name)
+		// This includes several scenarios:
+		// 1. UnavailableReplicas > 0: Some replicas are not ready
+		// 2. UpdatedReplicas < Replicas: Rollout in progress
+		// 3. Generation != ObservedGeneration: Spec changed but not yet observed
+		// 4. AvailableReplicas < desired: Not all replicas are available yet
+		isRollingOut := deployment.Status.UnavailableReplicas > 0 ||
+			deployment.Status.UpdatedReplicas < deployment.Status.Replicas ||
+			deployment.Generation != deployment.Status.ObservedGeneration ||
+			deployment.Status.AvailableReplicas < *deployment.Spec.Replicas
+
+		if isRollingOut {
+			a.logger.Debugf("Deployment %s is rolling out or has unavailable replicas (unavailable=%d, updated=%d/%d, available=%d/%d, generation=%d/%d), likely due to pod disruption",
+				deploymentSpec.Name,
+				deployment.Status.UnavailableReplicas,
+				deployment.Status.UpdatedReplicas, deployment.Status.Replicas,
+				deployment.Status.AvailableReplicas, *deployment.Spec.Replicas,
+				deployment.Status.ObservedGeneration, deployment.Generation)
 
 			// Check pod status to confirm disruption
 			selector, err := metav1.LabelSelectorAsSelector(deployment.Spec.Selector)
@@ -230,6 +244,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				continue
 			}
 
+			// For single-replica deployments during rollout, if no pods exist yet,
+			// this is likely the brief window where the old pod is gone and new pod
+			// hasn't been created yet. This is expected disruption during upgrade.
+			// According to the OpenShift contract: "A component must not report Available=False
+			// during the course of a normal upgrade."
+			if len(pods) == 0 && *deployment.Spec.Replicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out with no pods yet - expected disruption during upgrade, will not mark CSV as Failed", deploymentSpec.Name)
+				return true
+			}
+
+			// Track if we found any real failures or expected disruptions
+			foundExpectedDisruption := false
+			foundRealFailure := false
+
 			// Check if any pod is in expected disruption state
 			for _, pod := range pods {
 				// Check how long the pod has been in disrupted state
@@ -244,7 +272,8 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 				// Pod is terminating (DeletionTimestamp is set)
 				if pod.DeletionTimestamp != nil {
 					a.logger.Debugf("Pod %s is terminating - expected disruption", pod.Name)
-					return true
+					foundExpectedDisruption = true
+					continue
 				}
 
 				// For pending pods, we need to distinguish between expected disruption
@@ -257,11 +286,20 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 					// Check pod conditions for scheduling issues
 					for _, condition := range pod.Status.Conditions {
 						if condition.Type == corev1.PodScheduled && condition.Status == corev1.ConditionFalse {
-							// If pod has been unschedulable for a while, it's likely a real issue
-							// not a temporary disruption from cluster upgrade
 							if condition.Reason == "Unschedulable" {
-								isRealFailure = true
-								a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+								// CRITICAL: In single-replica deployments during rollout, Unschedulable is EXPECTED
+								// due to PodAntiAffinity preventing new pod from scheduling while old pod is terminating.
+								// This is especially common in single-node clusters or control plane scenarios.
+								// Per OpenShift contract: "A component must not report Available=False during normal upgrade."
+								if *deployment.Spec.Replicas == 1 && isRollingOut {
+									a.logger.Infof("Pod %s is unschedulable during single-replica rollout - likely PodAntiAffinity conflict, treating as expected disruption", pod.Name)
+									isExpectedDisruption = true
+								} else {
+									// Multi-replica or non-rollout Unschedulable is a real issue
+									isRealFailure = true
+									foundRealFailure = true
+									a.logger.Debugf("Pod %s is unschedulable - not a temporary disruption", pod.Name)
+								}
 								break
 							}
 						}
@@ -278,6 +316,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 								// These are real failures, not temporary disruptions
 								isRealFailure = true
+								foundRealFailure = true
 								a.logger.Debugf("Pod %s has container error %s - real failure, not disruption", pod.Name, reason)
 							}
 						}
@@ -292,6 +331,7 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 								isExpectedDisruption = true
 							case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff", "CreateContainerConfigError", "InvalidImageName":
 								isRealFailure = true
+								foundRealFailure = true
 								a.logger.Debugf("Pod %s has init container error %s - real failure, not disruption", pod.Name, reason)
 							}
 						}
@@ -302,17 +342,19 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						continue
 					}
 
-					// If it's in expected disruption state, return true
+					// If it's in expected disruption state, mark it
 					if isExpectedDisruption {
 						a.logger.Debugf("Pod %s is in expected disruption state", pod.Name)
-						return true
+						foundExpectedDisruption = true
+						continue
 					}
 
 					// If pending without clear container status, check if it's just being scheduled
 					// This could be normal pod creation during node drain
 					if len(pod.Status.ContainerStatuses) == 0 && len(pod.Status.InitContainerStatuses) == 0 {
 						a.logger.Debugf("Pod %s is pending without container statuses - likely being scheduled", pod.Name)
-						return true
+						foundExpectedDisruption = true
+						continue
 					}
 				}
 
@@ -323,14 +365,32 @@ func (a *Operator) isAPIServiceBackendDisrupted(csv *v1alpha1.ClusterServiceVers
 						switch reason {
 						case "ContainerCreating", "PodInitializing":
 							a.logger.Debugf("Pod %s container is starting - expected disruption", pod.Name)
-							return true
+							foundExpectedDisruption = true
 						case "ImagePullBackOff", "ErrImagePull", "CrashLoopBackOff":
 							// Real failures - don't treat as disruption
 							a.logger.Debugf("Pod %s has container error %s - not treating as disruption", pod.Name, reason)
+							foundRealFailure = true
 						}
 					}
 				}
 			}
+
+			// After checking all pods, make a decision
+			// If we found expected disruption and no real failures, treat as disruption
+			if foundExpectedDisruption && !foundRealFailure {
+				a.logger.Infof("Deployment %s has pods in expected disruption state - will not mark CSV as Failed per Available contract", deploymentSpec.Name)
+				return true
+			}
+
+			// For single-replica deployments during rollout, if we found no real failures,
+			// treat as expected disruption to comply with the OpenShift contract:
+			// "A component must not report Available=False during the course of a normal upgrade."
+			// Single-replica deployments inherently have unavailability during rollout,
+			// but this is acceptable and should not trigger Available=False.
+			if !foundRealFailure && *deployment.Spec.Replicas == 1 && isRollingOut {
+				a.logger.Infof("Single-replica deployment %s is rolling out - treating as expected disruption per Available contract", deploymentSpec.Name)
+				return true
+			}
 		}
 	}