Skip to content

Commit e786eb5

Browse files
committed
refactor how the syncagent integrates with multicluster-runtime
Since the mcprovider now watches the APIExportEndpointSlice itself, it does not make sense anymore to start it dynamically on demand. There is no point in having the syncmanager watch an APIExport or EndpointSlice itself, since the mcprovider does not take any URLs as input values anymore, it relies solely on watching the EndpointSlice itself. To accomodate this, I moved the manager and its provider up into the main(). Since the syncmanager controller however not just simply starts an mcmanager/provider, but also deals with dynamically started controllers, I decided to move that behaviour into a new DynamicMultiClusterManager (DMCM). The DMCM is capable of keeping track of all engaged clusters and allows you to add and start controllers at any time. It will automatically pre-seed new controller instances with the known clusters. There is one downside: Right now, if the controller fails to start, the syncmanager controller will not notice, re-reconcile and restore it. There's no back channel for that error from the sync controller's goroutine yet. --- Since the syncagent by design dynamically fills APIExports with resource schemas, when the DMCM is started, there are most likely (at least in cold start situations) no resource schemas yet in the APIExport. Even though the syncmanager waits for the apiresourceschema controller to be done, it currently does not wait for the apiexport controller to be done (in adding that ResourceSchema to the APIExport) (there is no condition or anything to tell when that controller has finished). This means the syncmanager will start new sync controllers at the same time as the apiexport controller is still filling the APIExport. However when the resources are not present yet, these sync controllers fail to boot up, and since there is currently no channel to report this back to the syncmanager, they do not get quickly restarted. To handle this (it might have been simpler to wait for the APIExport to be ready, but alas... I realized my oversight too late), I added an explicit API discovery step, where the sync controllers are only started if and when the primary resource is actually available. --- One major side effect of this whole refactoring is of course: the agent does not longer support watching APIExports for their deprecated virtual workspace URLs. When this is merged, you must provide an APIExportEndpointSlice. On-behalf-of: @SAP [email protected]
1 parent 0dfa0e5 commit e786eb5

File tree

8 files changed

+478
-396
lines changed

8 files changed

+478
-396
lines changed

cmd/api-syncagent/main.go

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,12 @@ import (
3232
"github.com/kcp-dev/api-syncagent/internal/controller/apiexport"
3333
"github.com/kcp-dev/api-syncagent/internal/controller/apiresourceschema"
3434
"github.com/kcp-dev/api-syncagent/internal/controller/syncmanager"
35+
"github.com/kcp-dev/api-syncagent/internal/discovery"
36+
"github.com/kcp-dev/api-syncagent/internal/kcp"
3537
syncagentlog "github.com/kcp-dev/api-syncagent/internal/log"
3638
"github.com/kcp-dev/api-syncagent/internal/version"
3739
syncagentv1alpha1 "github.com/kcp-dev/api-syncagent/sdk/apis/syncagent/v1alpha1"
3840

39-
kcpapisv1alpha1 "github.com/kcp-dev/sdk/apis/apis/v1alpha1"
40-
4141
corev1 "k8s.io/api/core/v1"
4242
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
4343
"k8s.io/apimachinery/pkg/runtime"
@@ -147,6 +147,21 @@ func run(ctx context.Context, log *zap.SugaredLogger, opts *Options) error {
147147
if err := mgr.Add(endpointKcpCluster); err != nil {
148148
return fmt.Errorf("failed to add endpoint kcp cluster runnable: %w", err)
149149
}
150+
151+
endpointSliceCluster = endpointKcpCluster
152+
}
153+
154+
// Setup the magical dynamic multicluster manager. It's a dynamic version of the
155+
// regular mcmanager, capable of starting new controllers at any later time and
156+
// allowing them to be also stopped at any time. The syncmanager needs it to
157+
// start/stop sync controllers for each PublishedResource.
158+
dmcm, err := kcp.NewDynamicMultiClusterManager(endpoint.EndpointSlice.Config, endpoint.EndpointSlice.Name)
159+
if err != nil {
160+
return fmt.Errorf("failed to start dynamic multi cluster manager: %w", err)
161+
}
162+
163+
if err := mgr.Add(dmcm); err != nil {
164+
return fmt.Errorf("failed to add endpoint kcp cluster runnable: %w", err)
150165
}
151166

152167
startController := func(name string, creator func() error) error {
@@ -177,20 +192,11 @@ func run(ctx context.Context, log *zap.SugaredLogger, opts *Options) error {
177192
// This controller is called "sync" because it makes the most sense to the users, even though internally the relevant
178193
// controller is the syncmanager (which in turn would start/stop the sync controllers).
179194
if err := startController("sync", func() error {
180-
cluster := endpointKcpCluster
181-
if cluster == nil {
182-
cluster = managedKcpCluster
183-
}
184-
185-
var endpointSlice *kcpdevv1alpha1.APIExportEndpointSlice
186-
if endpoint.EndpointSlice != nil {
187-
endpointSlice = endpoint.EndpointSlice.APIExportEndpointSlice
188-
}
189-
190-
// It doesn't matter which rest config we specify, as the URL will be overwritten with the
191-
// virtual workspace URL anyway.
195+
// The syncmanager needs to be able to determine whether an API is already bound and available
196+
// before it can start any sync controllers. That discovery logic is encapsulated in the ResourceProber.
197+
prober := discovery.NewResourceProber(endpoint.EndpointSlice.Config, endpointSliceCluster.GetClient(), endpoint.EndpointSlice.Name)
192198

193-
return syncmanager.Add(ctx, mgr, cluster, kcpRestConfig, log, endpoint.APIExport.APIExport, endpointSlice, opts.PublishedResourceSelector, opts.Namespace, opts.AgentName)
199+
return syncmanager.Add(ctx, mgr, prober, dmcm, log, opts.PublishedResourceSelector, opts.Namespace, opts.AgentName)
194200
}); err != nil {
195201
return err
196202
}

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ require (
1616
github.com/go-logr/zapr v1.3.0
1717
github.com/google/cel-go v0.26.0
1818
github.com/google/go-cmp v0.7.0
19+
github.com/google/uuid v1.6.0
1920
github.com/kcp-dev/api-syncagent/sdk v0.0.0-00010101000000-000000000000
2021
github.com/kcp-dev/kcp v0.29.1-0.20251210093424-08fb9eb48494
2122
github.com/kcp-dev/logicalcluster/v3 v3.0.5
@@ -79,7 +80,6 @@ require (
7980
github.com/golang/protobuf v1.5.4 // indirect
8081
github.com/google/btree v1.1.3 // indirect
8182
github.com/google/gnostic-models v0.7.0 // indirect
82-
github.com/google/uuid v1.6.0 // indirect
8383
github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 // indirect
8484
github.com/grpc-ecosystem/grpc-gateway/v2 v2.26.3 // indirect
8585
github.com/hashicorp/golang-lru/v2 v2.0.7 // indirect

internal/controller/sync/controller.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -134,12 +134,12 @@ func Create(
134134
// The manager parameter is mostly unused and will be removed in future CR versions.
135135
c, err := mccontroller.NewUnmanaged(ControllerName, remoteManager, ctrlOptions)
136136
if err != nil {
137-
return nil, err
137+
return nil, fmt.Errorf("failed to instantiate new controller: %w", err)
138138
}
139139

140140
// watch the target resource in the virtual workspace
141141
if err := c.MultiClusterWatch(mcsource.TypedKind(remoteDummy, mchandler.TypedEnqueueRequestForObject[*unstructured.Unstructured]())); err != nil {
142-
return nil, err
142+
return nil, fmt.Errorf("failed to setup remote-side watch: %w", err)
143143
}
144144

145145
// watch the source resource in the local cluster, but enqueue the origin remote object
@@ -158,7 +158,7 @@ func Create(
158158
})
159159

160160
if err := c.Watch(source.TypedKind(localManager.GetCache(), localDummy, enqueueRemoteObjForLocalObj, nameFilter)); err != nil {
161-
return nil, err
161+
return nil, fmt.Errorf("failed to setup local-side watch: %w", err)
162162
}
163163

164164
log.Info("Done setting up unmanaged controller.")

0 commit comments

Comments
 (0)