diff --git a/.env.sample b/.env.sample index 6689006..2dce721 100644 --- a/.env.sample +++ b/.env.sample @@ -30,6 +30,9 @@ LIVE_DEBUGGING=false # server is remote # PROMETHEUS_REMOTE_WRITE_URL=https://prometheus.$DOMAIN/api/v1/write +# Monitor physical disk health +# COMPOSE_FILE="$COMPOSE_FILE:compose.smartctl.yml" + # Monitoring Server # ## Prometheus diff --git a/README.md b/README.md index 2dcf161..9fc9379 100644 --- a/README.md +++ b/README.md @@ -158,4 +158,10 @@ It is possible to enable the following alerts, by setting the corresponding env - node disk space: `ALERT_NODE_DISK_SPACE_ENABLED` - node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED` +## SMART monitoring +To be able to monitor hard drive health data, you need to configure +`smartd` to run on the host system, and also the +`collect-smartctl-json.sh` script provided here (via cronjob or as +a `smartd` hook). This is a limitation of Docker Swarm, which prevents +the `smartctl_exporter` from running in privileged mode. diff --git a/collect-smartctl-json.service b/collect-smartctl-json.service new file mode 100644 index 0000000..b4b8e7c --- /dev/null +++ b/collect-smartctl-json.service @@ -0,0 +1,6 @@ +[Unit] +Description=Collect SMART data + +[Service] +Type=oneshot +ExecStart=/usr/local/bin/collect-smartctl-json.sh diff --git a/collect-smartctl-json.sh b/collect-smartctl-json.sh new file mode 100755 index 0000000..ca5e55a --- /dev/null +++ b/collect-smartctl-json.sh @@ -0,0 +1,69 @@ +#! 
/bin/bash +# Adapted from https://github.com/prometheus-community/smartctl_exporter/blob/master/collect-smartctl-json.sh + +script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +# Data directory to dump smartctl output +# This directory will be created if it doesn't exist +data_dir="/var/lib/smartmontools/json" + +# The original script used --xall but that doesn't work +# This matches the command in readSMARTctl() +smartctl_args="--json --info --health --attributes --tolerance=verypermissive \ +--nocheck=standby --format=brief --log=error" + +# Ignore these devices +smartctl_ignore_dev_regex="^(/dev/bus)" + +# Determine the json query tool to use +if command -v jq >/dev/null; then + json_tool="jq" + json_args="--raw-output" +elif command -v yq >/dev/null; then + json_tool="yq" + json_args="--unwrapScalar" +else + echo -e "One of 'yq' or 'jq' is required. Please try again after \ +installing one of them" + exit 1 +fi + +if [[ ! "${UID}" -eq 0 ]] && ! command -v sudo >/dev/null; then + # Not root and sudo doesn't exist + echo "sudo does not exist. Please run this as root" + exit 1 +fi + +SUDO="sudo" +if [[ "${UID}" -eq 0 ]]; then + # Don't use sudo if root + SUDO="" +fi + +[[ ! -d "${data_dir}" ]] && mkdir --parents "${data_dir}" + +if [[ $# -ne 0 ]]; then + devices="${1}" +else + devices="$(smartctl --scan --json | "${json_tool}" "${json_args}" \ +".devices[].name | select(test(\"${smartctl_ignore_dev_regex}\") | not)")" + mapfile -t devices <<< "${devices[@]}" +fi + +for device in "${devices[@]}" + do + echo -n "Collecting data for '${device}'..." 
+ # shellcheck disable=SC2086 + data="$($SUDO smartctl ${smartctl_args} ${device})" + # Accommodate a smartmontools pre-7.3 bug + data=${data#" Pending defect count:"} + type="$(echo "${data}" | "${json_tool}" "${json_args}" '.device.type')" + family="$(echo "${data}" | "${json_tool}" "${json_args}" \ +'select(.model_family != null) | .model_family | sub(" |/" ; "_" ; "g") + | sub("\"|\\(|\\)" ; "" ; "g")')" + model="$(echo "${data}" | "${json_tool}" "${json_args}" \ +'.model_name | sub(" |/" ; "_" ; "g") | sub("\"|\\(|\\)" ; "" ; "g")')" + device_name="$(basename "${device}")" + echo -e "\tSaving to ${device_name}.json" + echo "${data}" > "${data_dir}/${device_name}.json" +done diff --git a/collect-smartctl-json.timer b/collect-smartctl-json.timer new file mode 100644 index 0000000..a56e8b3 --- /dev/null +++ b/collect-smartctl-json.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Collect SMART data + +[Timer] +OnCalendar=hourly +Persistent=true + +[Install] +WantedBy=timers.target diff --git a/compose.smartctl.yml b/compose.smartctl.yml new file mode 100644 index 0000000..a87d765 --- /dev/null +++ b/compose.smartctl.yml @@ -0,0 +1,18 @@ +--- +version: "3.8" +services: + smartctl: + image: "prometheuscommunity/smartctl-exporter:v0.14.0" + volumes: + - "/dev:/dev" + - "/var/lib/smartmontools/json:/debug" + command: + - "--smartctl.fake-data" + - "--smartctl.interval=1h" + networks: + - "proxy" + deploy: + labels: + - "prometheus.io/scrape=true" + - "prometheus.io/port=9633" + - "prometheus.io/path=/metrics"