From 1ea11295f340d5249972b756fc40455452edc97f Mon Sep 17 00:00:00 2001 From: Jana Radhakrishnan Date: Tue, 9 Aug 2016 13:37:11 -0700 Subject: [PATCH] Retry creating dynamic networks if not found In cases where there are failures in task start, swarmkit might be trying to restart the task on the same node, which might keep failing. This creates a race: when a failed task is being removed, it might remove the associated network while another task (for the same service, or for a different service connected to the same network) is proceeding to start its container on the assumption that the network is still present. Fix this by reacting to an `ErrNoSuchNetwork` error during container start by trying to recreate the managed networks. If they have been removed, they will be recreated. If they are already present, nothing bad will happen. Signed-off-by: Jana Radhakrishnan Upstream-commit: 117cef5e9766d6ba228770c225e816c6afd16ff8 Component: engine --- .../cluster/executor/container/controller.go | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/components/engine/daemon/cluster/executor/container/controller.go b/components/engine/daemon/cluster/executor/container/controller.go index ea4eab15c0..5f865aae4c 100644 --- a/components/engine/daemon/cluster/executor/container/controller.go +++ b/components/engine/daemon/cluster/executor/container/controller.go @@ -7,6 +7,7 @@ import ( executorpkg "github.com/docker/docker/daemon/cluster/executor" "github.com/docker/engine-api/types" "github.com/docker/engine-api/types/events" + "github.com/docker/libnetwork" "github.com/docker/swarmkit/agent/exec" "github.com/docker/swarmkit/api" "github.com/docker/swarmkit/log" @@ -163,8 +164,23 @@ func (r *controller) Start(ctx context.Context) error { return exec.ErrTaskStarted } - if err := r.adapter.start(ctx); err != nil { - return errors.Wrap(err, "starting container failed") + for { + if err := r.adapter.start(ctx); err != nil { + if _, ok := 
err.(libnetwork.ErrNoSuchNetwork); ok { + // Retry network creation again if we + // failed because some of the networks + // were not found. + if err := r.adapter.createNetworks(ctx); err != nil { + return err + } + + continue + } + + return errors.Wrap(err, "starting container failed") + } + + break } // no health check