This PR adds support for user-defined health-check probes for Docker
containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus
some corresponding "docker run" options. It can be used with a restart policy
to automatically restart a container if the check fails.
The `HEALTHCHECK` instruction has two forms:
* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)
The `HEALTHCHECK` instruction tells Docker how to test a container to check that
it is still working. This can detect cases such as a web server that is stuck in
an infinite loop and unable to handle new connections, even though the server
process is still running.
When a container has a healthcheck specified, it has a _health status_ in
addition to its normal status. This status is initially `starting`. Whenever a
health check passes, it becomes `healthy` (regardless of its previous state).
After a certain number of consecutive failures, it becomes `unhealthy`.
The options that can appear before `CMD` are:
* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)
The health check will first run **interval** seconds after the container is
started, and then again **interval** seconds after each previous check completes.
If a single run of the check takes longer than **timeout** seconds then the check
is considered to have failed.
It takes **retries** consecutive failures of the health check for the container
to be considered `unhealthy`.
There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
more than one then only the last `HEALTHCHECK` will take effect.
The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
see e.g. `ENTRYPOINT` for details).
The command's exit status indicates the health status of the container.
The possible values are:
- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly
If the probe returns 2 ("starting") when the container has already moved out of the
"starting" state then it is treated as "unhealthy" instead.
For example, to check every five minutes or so that a web-server is able to
serve the site's main page within three seconds:
    HEALTHCHECK --interval=5m --timeout=3s \
      CMD curl -f http://localhost/ || exit 1
To help debug failing probes, any output text (UTF-8 encoded) that the command writes
to stdout or stderr will be stored in the health status and can be queried with
`docker inspect`. Such output should be kept short (currently only the first
4096 bytes are stored).
When the health status of a container changes, a `health_status` event is
generated with the new status. The health status is also displayed in the
`docker ps` output.
Signed-off-by: Thomas Leonard <thomas.leonard@docker.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
Upstream-commit: b6c7becbfe1d76b1250f6d8e991e645e13808a9c
Component: engine
68 lines
2.3 KiB
Go
68 lines
2.3 KiB
Go
package daemon
|
|
|
|
import (
|
|
"fmt"
|
|
"net/http"
|
|
"time"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
"github.com/docker/docker/container"
|
|
"github.com/docker/docker/errors"
|
|
)
|
|
|
|
// ContainerStop looks for the given container and terminates it,
|
|
// waiting the given number of seconds before forcefully killing the
|
|
// container. If a negative number of seconds is given, ContainerStop
|
|
// will wait for a graceful termination. An error is returned if the
|
|
// container is not found, is already stopped, or if there is a
|
|
// problem stopping the container.
|
|
func (daemon *Daemon) ContainerStop(name string, seconds int) error {
|
|
container, err := daemon.GetContainer(name)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if !container.IsRunning() {
|
|
err := fmt.Errorf("Container %s is already stopped", name)
|
|
return errors.NewErrorWithStatusCode(err, http.StatusNotModified)
|
|
}
|
|
if err := daemon.containerStop(container, seconds); err != nil {
|
|
return fmt.Errorf("Cannot stop container %s: %v", name, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// containerStop halts a container by sending a stop signal, waiting for the given
|
|
// duration in seconds, and then calling SIGKILL and waiting for the
|
|
// process to exit. If a negative duration is given, Stop will wait
|
|
// for the initial signal forever. If the container is not running Stop returns
|
|
// immediately.
|
|
func (daemon *Daemon) containerStop(container *container.Container, seconds int) error {
|
|
if !container.IsRunning() {
|
|
return nil
|
|
}
|
|
|
|
daemon.stopHealthchecks(container)
|
|
|
|
stopSignal := container.StopSignal()
|
|
// 1. Send a stop signal
|
|
if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
|
|
logrus.Infof("Failed to send signal %d to the process, force killing", stopSignal)
|
|
if err := daemon.killPossiblyDeadProcess(container, 9); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// 2. Wait for the process to exit on its own
|
|
if _, err := container.WaitStop(time.Duration(seconds) * time.Second); err != nil {
|
|
logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal)
|
|
// 3. If it doesn't, then send SIGKILL
|
|
if err := daemon.Kill(container); err != nil {
|
|
container.WaitStop(-1 * time.Second)
|
|
logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it
|
|
}
|
|
}
|
|
|
|
daemon.LogContainerEvent(container, "stop")
|
|
return nil
|
|
}
|