From 1e791aef776d2364ed55b0e5bdc700535171a6e4 Mon Sep 17 00:00:00 2001 From: Derek McGowan Date: Tue, 4 Sep 2018 11:00:28 -0700 Subject: [PATCH] Fix supervisor healthcheck throttling Fix default case causing the throttling to not be used. Ensure that nil client condition is handled. Signed-off-by: Derek McGowan (cherry picked from commit c3e32938430e03a316311f9e4fbdb743e492a07e) Signed-off-by: Andrew Hsu Upstream-commit: f121eccf29576ce5d4b8256a71a9d32ee688ff7d Component: engine --- .../libcontainerd/supervisor/remote_daemon.go | 61 +++++++++++-------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/components/engine/libcontainerd/supervisor/remote_daemon.go b/components/engine/libcontainerd/supervisor/remote_daemon.go index b520d48671..182984aeca 100644 --- a/components/engine/libcontainerd/supervisor/remote_daemon.go +++ b/components/engine/libcontainerd/supervisor/remote_daemon.go @@ -245,20 +245,26 @@ func (r *remote) monitorDaemon(ctx context.Context) { }() for { - select { - case <-ctx.Done(): - r.logger.Info("stopping healthcheck following graceful shutdown") - if client != nil { - client.Close() + if delay != nil { + select { + case <-ctx.Done(): + r.logger.Info("stopping healthcheck following graceful shutdown") + if client != nil { + client.Close() + } + return + case <-delay: } - return - case <-delay: - default: } if r.daemonPid == -1 { if r.daemonWaitCh != nil { - <-r.daemonWaitCh + select { + case <-ctx.Done(): + r.logger.Info("stopping containerd startup following graceful shutdown") + return + case <-r.daemonWaitCh: + } } os.RemoveAll(r.GRPC.Address) @@ -276,26 +282,28 @@ func (r *remote) monitorDaemon(ctx context.Context) { } } - tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout) - _, err := client.IsServing(tctx) - cancel() - if err == nil { - if !started { - close(r.daemonStartCh) - started = true + if client != nil { + tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout) + _, err := client.IsServing(tctx) + cancel() + if err == nil { + if !started { + close(r.daemonStartCh) + started = true + } + + transientFailureCount = 0 + delay = time.After(500 * time.Millisecond) + continue } - transientFailureCount = 0 - delay = time.After(500 * time.Millisecond) - continue - } + r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding") - r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding") - - transientFailureCount++ - if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) { - delay = time.After(time.Duration(transientFailureCount) * 200 * time.Millisecond) - continue + transientFailureCount++ + if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) { + delay = time.After(time.Duration(transientFailureCount) * 200 * time.Millisecond) + continue + } } if system.IsProcessAlive(r.daemonPid) { @@ -304,6 +312,7 @@ func (r *remote) monitorDaemon(ctx context.Context) { } client.Close() + client = nil r.daemonPid = -1 delay = nil transientFailureCount = 0