From eaff058a53bb8cf8c9b3f9b7fa3b635fb293ce76 Mon Sep 17 00:00:00 2001 From: Flavio Crisciani Date: Sun, 30 Apr 2017 14:51:43 -0700 Subject: [PATCH] Fix race condition between swarm and libnetwork This commit in conjunction with a libnetwork side commit, cleans up the libnetwork SetClusterProvider logic interaction. The previous code was inducing libnetwork to spawn several go routines that were racing between each other during the agent init and close. A test got added to verify that back to back swarm init and leave are properly processed and not raise crashes Signed-off-by: Flavio Crisciani Upstream-commit: e2ec006797fa14f59bcf7b9c23505ccdf1d3ded3 Component: engine --- components/engine/cmd/dockerd/daemon.go | 6 +++- components/engine/daemon/cluster.go | 2 ++ components/engine/daemon/cluster/cluster.go | 35 ++++++++++++++----- .../engine/daemon/cluster/noderunner.go | 6 ++-- components/engine/daemon/cluster/swarm.go | 1 - components/engine/daemon/network.go | 10 ++++-- .../integration-cli/docker_cli_swarm_test.go | 21 +++++++++++ 7 files changed, 66 insertions(+), 15 deletions(-) diff --git a/components/engine/cmd/dockerd/daemon.go b/components/engine/cmd/dockerd/daemon.go index 2e3f11f84e..3b6d0f014c 100644 --- a/components/engine/cmd/dockerd/daemon.go +++ b/components/engine/cmd/dockerd/daemon.go @@ -300,6 +300,11 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) { if err != nil { logrus.Fatalf("Error creating cluster component: %v", err) } + d.SetCluster(c) + err = c.Start() + if err != nil { + logrus.Fatalf("Error starting cluster component: %v", err) + } // Restart all autostart containers which has a swarm endpoint // and is not yet running now that we have successfully @@ -316,7 +321,6 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) { cli.d = d - d.SetCluster(c) initRouter(api, d, c) cli.setupConfigReloadTrap() diff --git a/components/engine/daemon/cluster.go b/components/engine/daemon/cluster.go index b7970edbb5..d22970bcd7 100644 --- a/components/engine/daemon/cluster.go +++ b/components/engine/daemon/cluster.go @@ -2,12 +2,14 @@ package daemon import ( apitypes "github.com/docker/docker/api/types" + lncluster "github.com/docker/libnetwork/cluster" ) // Cluster is the interface for github.com/docker/docker/daemon/cluster.(*Cluster). type Cluster interface { ClusterStatus NetworkManager + SendClusterEvent(event lncluster.ConfigEventType) } // ClusterStatus interface provides information about the Swarm status of the Cluster diff --git a/components/engine/daemon/cluster/cluster.go b/components/engine/daemon/cluster/cluster.go index 6a00a4d7b8..aeefa360fd 100644 --- a/components/engine/daemon/cluster/cluster.go +++ b/components/engine/daemon/cluster/cluster.go @@ -51,6 +51,7 @@ import ( types "github.com/docker/docker/api/types/swarm" executorpkg "github.com/docker/docker/daemon/cluster/executor" "github.com/docker/docker/pkg/signal" + lncluster "github.com/docker/libnetwork/cluster" swarmapi "github.com/docker/swarmkit/api" swarmnode "github.com/docker/swarmkit/node" "github.com/pkg/errors" @@ -115,7 +116,7 @@ type Cluster struct { root string runtimeRoot string config Config - configEvent chan struct{} // todo: make this array and goroutine safe + configEvent chan lncluster.ConfigEventType // todo: make this array and goroutine safe attachers map[string]*attacher } @@ -147,22 +148,30 @@ func New(config Config) (*Cluster, error) { c := &Cluster{ root: root, config: config, - configEvent: make(chan struct{}, 10), + configEvent: make(chan lncluster.ConfigEventType, 10), runtimeRoot: config.RuntimeRoot, attachers: make(map[string]*attacher), } + return c, nil +} + +// Start the Cluster instance +// TODO The split between New and Start can be join again when the SendClusterEvent +// method is no longer required +func (c *Cluster) Start() error { + root := filepath.Join(c.config.Root, swarmDirName) nodeConfig, err := loadPersistentState(root) if err != nil { if os.IsNotExist(err) { - return c, nil + return nil } - return nil, err + return err } nr, err := c.newNodeRunner(*nodeConfig) if err != nil { - return nil, err + return err } c.nr = nr @@ -172,10 +181,10 @@ func New(config Config) (*Cluster, error) { case err := <-nr.Ready(): if err != nil { logrus.WithError(err).Error("swarm component could not be started") - return c, nil + return nil } } - return c, nil + return nil } func (c *Cluster) newNodeRunner(conf nodeStartConfig) (*nodeRunner, error) { @@ -308,7 +317,7 @@ func (c *Cluster) getRemoteAddressList() []string { // ListenClusterEvents returns a channel that receives messages on cluster // participation changes. // todo: make cancelable and accessible to multiple callers -func (c *Cluster) ListenClusterEvents() <-chan struct{} { +func (c *Cluster) ListenClusterEvents() <-chan lncluster.ConfigEventType { return c.configEvent } @@ -413,3 +422,13 @@ func (c *Cluster) lockedManagerAction(fn func(ctx context.Context, state nodeSta return fn(ctx, state) } + +// SendClusterEvent allows to send cluster events on the configEvent channel +// TODO This method should not be exposed. +// Currently it is used to notify the network controller that the keys are +// available +func (c *Cluster) SendClusterEvent(event lncluster.ConfigEventType) { + c.mu.RLock() + defer c.mu.RUnlock() + c.configEvent <- event +} diff --git a/components/engine/daemon/cluster/noderunner.go b/components/engine/daemon/cluster/noderunner.go index 13947fa7b0..2b18d99da1 100644 --- a/components/engine/daemon/cluster/noderunner.go +++ b/components/engine/daemon/cluster/noderunner.go @@ -11,6 +11,7 @@ import ( "github.com/Sirupsen/logrus" types "github.com/docker/docker/api/types/swarm" "github.com/docker/docker/daemon/cluster/executor/container" + lncluster "github.com/docker/libnetwork/cluster" swarmapi "github.com/docker/swarmkit/api" swarmnode "github.com/docker/swarmkit/node" "github.com/pkg/errors" @@ -162,7 +163,7 @@ func (n *nodeRunner) handleControlSocketChange(ctx context.Context, node *swarmn } n.grpcConn = conn n.mu.Unlock() - n.cluster.configEvent <- struct{}{} + n.cluster.SendClusterEvent(lncluster.EventSocketChange) } } @@ -175,7 +176,7 @@ func (n *nodeRunner) handleReadyEvent(ctx context.Context, node *swarmnode.Node, close(ready) case <-ctx.Done(): } - n.cluster.configEvent <- struct{}{} + n.cluster.SendClusterEvent(lncluster.EventNodeReady) } func (n *nodeRunner) handleNodeExit(node *swarmnode.Node) { @@ -217,6 +218,7 @@ func (n *nodeRunner) Stop() error { if err := n.swarmNode.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") { return err } + n.cluster.SendClusterEvent(lncluster.EventNodeLeave) <-n.done return nil } diff --git a/components/engine/daemon/cluster/swarm.go b/components/engine/daemon/cluster/swarm.go index 8db4f3621f..8a8d5bf808 100644 --- a/components/engine/daemon/cluster/swarm.go +++ b/components/engine/daemon/cluster/swarm.go @@ -388,7 +388,6 @@ func (c *Cluster) Leave(force bool) error { } } - c.configEvent <- struct{}{} // todo: cleanup optional? if err := clearPersistentState(c.root); err != nil { return err diff --git a/components/engine/daemon/network.go b/components/engine/daemon/network.go index df39c7c96e..781953686a 100644 --- a/components/engine/daemon/network.go +++ b/components/engine/daemon/network.go @@ -16,6 +16,7 @@ import ( "github.com/docker/docker/pkg/plugingetter" "github.com/docker/docker/runconfig" "github.com/docker/libnetwork" + lncluster "github.com/docker/libnetwork/cluster" "github.com/docker/libnetwork/driverapi" "github.com/docker/libnetwork/ipamapi" networktypes "github.com/docker/libnetwork/types" @@ -207,7 +208,6 @@ func (daemon *Daemon) setupIngress(create *clustertypes.NetworkCreateRequest, ip func (daemon *Daemon) releaseIngress(id string) { controller := daemon.netController - if err := controller.SandboxDestroy("ingress-sbox"); err != nil { logrus.Errorf("Failed to delete ingress sandbox: %v", err) } @@ -233,13 +233,17 @@ func (daemon *Daemon) releaseIngress(id string) { logrus.Errorf("Failed to delete ingress network %s: %v", n.ID(), err) return } - return } // SetNetworkBootstrapKeys sets the bootstrap keys. func (daemon *Daemon) SetNetworkBootstrapKeys(keys []*networktypes.EncryptionKey) error { - return daemon.netController.SetKeys(keys) + err := daemon.netController.SetKeys(keys) + if err == nil { + // Upon successful key setting dispatch the keys available event + daemon.cluster.SendClusterEvent(lncluster.EventNetworkKeysAvailable) + } + return err } // UpdateAttachment notifies the attacher about the attachment config. diff --git a/components/engine/integration-cli/docker_cli_swarm_test.go b/components/engine/integration-cli/docker_cli_swarm_test.go index 722dbabf4c..9e45167304 100644 --- a/components/engine/integration-cli/docker_cli_swarm_test.go +++ b/components/engine/integration-cli/docker_cli_swarm_test.go @@ -1980,3 +1980,24 @@ func (s *DockerSwarmSuite) TestSwarmInitUnspecifiedDataPathAddr(c *check.C) { c.Assert(err, checker.NotNil) c.Assert(out, checker.Contains, "data path address must be a non-zero IP") } + +func (s *DockerSwarmSuite) TestSwarmJoinLeave(c *check.C) { + d := s.AddDaemon(c, true, true) + + out, err := d.Cmd("swarm", "join-token", "-q", "worker") + c.Assert(err, checker.IsNil) + c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "") + + token := strings.TrimSpace(out) + + // Verify that back to back join/leave does not cause panics + d1 := s.AddDaemon(c, false, false) + for i := 0; i < 10; i++ { + out, err = d1.Cmd("swarm", "join", "--token", token, d.ListenAddr) + c.Assert(err, checker.IsNil) + c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "") + + _, err = d1.Cmd("swarm", "leave") + c.Assert(err, checker.IsNil) + } +}