feat: smart monitoring #25

Open
fauno wants to merge 3 commits from smartctl into alloy
6 changed files with 111 additions and 0 deletions

View File

@ -30,6 +30,9 @@ LIVE_DEBUGGING=false
# server is remote
# PROMETHEUS_REMOTE_WRITE_URL=https://prometheus.$DOMAIN/api/v1/write
# Monitor the health of physical disks (SMART)
# COMPOSE_FILE="$COMPOSE_FILE:compose.smartctl.yml"
# Monitoring Server
#
## Prometheus

View File

@ -158,4 +158,10 @@ It is possible to enable the following alerts, by setting the corresponding env
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
## smart monitoring
To monitor hard drive health data, you need to configure
`smartd` to run on the host system, and also run the
`collect-smartctl-json.sh` script provided here (via a cronjob or as
a `smartd` hook). This works around a limitation of Docker Swarm, which
prevents the `smartctl_exporter` from running in privileged mode.

View File

@ -0,0 +1,6 @@
# Service unit that runs the SMART JSON collection script.
[Unit]
Description=Collect SMART data
[Service]
# oneshot: the script does its work and exits; nothing stays resident.
Type=oneshot
# NOTE(review): assumes the script has been installed to /usr/local/bin
# on the host and is executable — confirm against the deployment steps.
ExecStart=/usr/local/bin/collect-smartctl-json.sh

69
collect-smartctl-json.sh Executable file
View File

@ -0,0 +1,69 @@
#!/bin/bash
#
# Dump the JSON output of `smartctl` for each disk into ${data_dir}, one
# "<device basename>.json" file per device.
#
# Usage: collect-smartctl-json.sh [DEVICE...]
#   DEVICE  one or more device paths (e.g. /dev/sda). When no device is
#           given, the list is discovered with `smartctl --scan`.
#
# Requires: smartctl, and one of jq / yq. Must run as root or with sudo
# available.
#
# Adapted from https://github.com/prometheus-community/smartctl_exporter/blob/master/collect-smartctl-json.sh

# Data directory to dump smartctl output
# This directory will be created if it doesn't exist
data_dir="/var/lib/smartmontools/json"

# The original script used --xall but that doesn't work
# This matches the command in readSMARTctl()
smartctl_args=(
  --json --info --health --attributes --tolerance=verypermissive
  --nocheck=standby --format=brief --log=error
)

# Ignore these devices
smartctl_ignore_dev_regex="^(/dev/bus)"

# Determine the json query tool to use
if command -v jq >/dev/null; then
  json_tool="jq"
  json_args="--raw-output"
elif command -v yq >/dev/null; then
  json_tool="yq"
  json_args="--unwrapScalar"
else
  echo "One of 'yq' or 'jq' is required. Please try again after installing one of them" >&2
  exit 1
fi

# Privilege escalation prefix: empty when already root, "sudo" otherwise.
sudo_cmd=()
if [[ "${UID}" -ne 0 ]]; then
  if ! command -v sudo >/dev/null; then
    # Not root and sudo doesn't exist
    echo "sudo does not exist. Please run this as root" >&2
    exit 1
  fi
  sudo_cmd=(sudo)
fi

if [[ ! -d "${data_dir}" ]]; then
  if ! mkdir --parents "${data_dir}"; then
    echo "Cannot create data directory '${data_dir}'" >&2
    exit 1
  fi
fi

devices=()
if [[ $# -ne 0 ]]; then
  # Every argument is a device (the first one alone keeps working).
  devices=("$@")
else
  if ! scan_json="$(smartctl --scan --json)"; then
    echo "'smartctl --scan' failed" >&2
    exit 1
  fi
  # Process substitution (not a here-string) so that an empty result
  # yields an empty array instead of a single empty element.
  mapfile -t devices < <(
    printf '%s\n' "${scan_json}" | "${json_tool}" "${json_args}" \
      ".devices[].name | select(test(\"${smartctl_ignore_dev_regex}\") | not)"
  )
fi

if [[ ${#devices[@]} -eq 0 ]]; then
  echo "No devices found, nothing to collect" >&2
  exit 0
fi

status=0
for device in "${devices[@]}"; do
  # Guard against blank entries
  [[ -n "${device}" ]] || continue

  echo -n "Collecting data for '${device}'..."
  # smartctl's exit status is a bitmask that is non-zero for many
  # non-fatal conditions, so only the presence of output is checked.
  data="$("${sudo_cmd[@]}" smartctl "${smartctl_args[@]}" "${device}")"
  # Accommodate a smartmontools pre-7.3 bug
  data=${data#" Pending defect count:"}

  if [[ -z "${data}" ]]; then
    echo
    echo "smartctl returned no data for '${device}', skipping" >&2
    status=1
    continue
  fi

  device_name="$(basename "${device}")"
  out_file="${data_dir}/${device_name}.json"
  echo -e "\tSaving to ${device_name}.json"
  # Write next to the target and rename, so a reader never sees a
  # partially written file.
  if ! { printf '%s\n' "${data}" > "${out_file}.tmp" \
    && mv -f -- "${out_file}.tmp" "${out_file}"; }; then
    echo "Failed to write '${out_file}'" >&2
    status=1
  fi
done

exit "${status}"

View File

@ -0,0 +1,9 @@
# Timer unit that schedules the SMART data collection.
[Unit]
Description=Collect SMART data
[Timer]
# No Unit= is set, so systemd activates the service whose name matches
# this timer's name. NOTE(review): the unit file names are not visible
# here — confirm the timer and the service share the same base name.
OnCalendar=hourly
# Run once at boot if a scheduled run was missed while the host was off.
Persistent=true
[Install]
WantedBy=timers.target

18
compose.smartctl.yml Normal file
View File

@ -0,0 +1,18 @@
---
# Stack fragment that adds the SMART exporter service.
# NOTE(review): the "proxy" network is not declared in this file; it is
# presumably provided by another compose file merged through
# COMPOSE_FILE — confirm.
version: "3.8"
services:
  smartctl:
    image: "prometheuscommunity/smartctl-exporter:v0.14.0"
    volumes:
      - "/dev:/dev"
      # Host directory filled by collect-smartctl-json.sh.
      - "/var/lib/smartmontools/json:/debug"
    command:
      # NOTE(review): presumably makes the exporter read the JSON dumps
      # from /debug instead of invoking smartctl itself — confirm against
      # the exporter documentation.
      - "--smartctl.fake-data"
      - "--smartctl.interval=1h"
    networks:
      - "proxy"
    deploy:
      labels:
        - "prometheus.io/scrape=true"
        - "prometheus.io/port=9633"
        - "prometheus.io/path=/metrics"