Compare commits
12 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
2972272303
|
|||
|
4c69cf97ab
|
|||
|
1110786179
|
|||
| 1d9eb10004 | |||
|
23acf56637
|
|||
|
03227f1907
|
|||
|
d085c66d68
|
|||
|
1970061ff8
|
|||
|
fa76179987
|
|||
|
64cb07a4a2
|
|||
|
e247677433
|
|||
|
f2310f2b86
|
19
.env.sample
19
.env.sample
@ -6,13 +6,23 @@ DOMAIN=monitoring-ng.example.com
|
||||
ENABLE_BACKUPS=true
|
||||
|
||||
SECRET_BASIC_AUTH_VERSION=v1
|
||||
# Enable Live Debugging
|
||||
LIVE_DEBUGGING=false
|
||||
# Enable this to send logs to a Loki server, adapt DOMAIN if server is
|
||||
# remote
|
||||
# LOKI_PUSH_URL=https://loki.$DOMAIN/loki/api/v1/push
|
||||
# Enable this on SystemD hosts to read logs
|
||||
# Enable on systemd hosts to read logs from the journal
|
||||
# JOURNALD=1
|
||||
# Enable this on syslogd hosts and configure the syslogd to send logs to
|
||||
# Alloy on port 514/tcp
|
||||
#
|
||||
# Enable on non-systemd hosts (Alpine, older Debian/Ubuntu) to tail
|
||||
# /var/log/*log files (syslog, auth.log, kern.log, etc.) that a local
|
||||
# syslogd writes. No syslogd reconfiguration needed.
|
||||
# SYSLOG_FILES=1
|
||||
#
|
||||
# Enable to receive syslog messages over the network on port 514/tcp.
|
||||
# Use for remote devices that push syslog to this host, or for a
|
||||
# local syslogd configured to forward over the network.
|
||||
# Not needed if you just want to read local log files — use SYSLOG_FILES instead.
|
||||
# SYSLOG=1
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.syslog.yml"
|
||||
|
||||
@ -20,6 +30,9 @@ SECRET_BASIC_AUTH_VERSION=v1
|
||||
# server is remote
|
||||
# PROMETHEUS_REMOTE_WRITE_URL=https://prometheus.$DOMAIN/api/v1/write
|
||||
|
||||
# Monitor physical disks health
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.smartctl.yml"
|
||||
|
||||
# Monitoring Server
|
||||
#
|
||||
## Prometheus
|
||||
|
||||
@ -158,4 +158,10 @@ It is possible to enable the following alerts, by setting the corresponding env
|
||||
- node disk space: `ALERT_NODE_DISK_SPACE_ENABLED`
|
||||
- node memory usage: `ALERT_NODE_MEMORY_USAGE_ENABLED`
|
||||
|
||||
## smart monitoring
|
||||
|
||||
To be able to monitor hard drive health data, you need to configure
|
||||
`smartd` to run on the host system, and also the
|
||||
`collect-smartctl-json.sh` script provided here (via cronjob or as
|
||||
a `smartd` hook). This is a limitation on Docker Swarm, which prevents
|
||||
the `smartctl_exporter` from running in privileged mode.
|
||||
|
||||
2
abra.sh
2
abra.sh
@ -11,7 +11,7 @@ export PROMETHEUS_YML_VERSION=v2
|
||||
export MATRIX_ALERTMANAGER_CONFIG_VERSION=e
|
||||
export MATRIX_ALERTMANAGER_ENTRYPOINT_VERSION=a
|
||||
export GRAFANA_ALERTS_NODE_VERSION=v1c
|
||||
export CONFIG_ALLOY_VERSION=v9
|
||||
export CONFIG_ALLOY_VERSION=v10
|
||||
|
||||
# creates a default prometheus scrape config for a given node
|
||||
add_node(){
|
||||
|
||||
6
collect-smartctl-json.service
Normal file
6
collect-smartctl-json.service
Normal file
@ -0,0 +1,6 @@
|
||||
[Unit]
|
||||
Description=Collect SMART data
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
ExecStart=/usr/local/bin/collect-smartctl-json.sh
|
||||
69
collect-smartctl-json.sh
Executable file
69
collect-smartctl-json.sh
Executable file
@ -0,0 +1,69 @@
|
||||
#! /bin/bash
|
||||
# Adapted from https://github.com/prometheus-community/smartctl_exporter/blob/master/collect-smartctl-json.sh
|
||||
|
||||
script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
|
||||
# Data directory to dump smartctl output
|
||||
# This directory will be created if it doesn't exist
|
||||
data_dir="/var/lib/smartmontools/json"
|
||||
|
||||
# The original script used --xall but that doesn't work
|
||||
# This matches the command in readSMARTctl()
|
||||
smartctl_args="--json --info --health --attributes --tolerance=verypermissive \
|
||||
--nocheck=standby --format=brief --log=error"
|
||||
|
||||
# Ignore these devices
|
||||
smartctl_ignore_dev_regex="^(/dev/bus)"
|
||||
|
||||
# Determine the json query tool to use
|
||||
if command -v jq >/dev/null; then
|
||||
json_tool="jq"
|
||||
json_args="--raw-output"
|
||||
elif command -v yq >/dev/null; then
|
||||
json_tool="yq"
|
||||
json_args="--unwrapScalar"
|
||||
else
|
||||
echo -e "One of 'yq' or 'jq' is required. Please try again after \
|
||||
installing one of them"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ ! "${UID}" -eq 0 ]] && ! command -v sudo >/dev/null; then
|
||||
# Not root and sudo doesn't exist
|
||||
echo "sudo does not exist. Please run this as root"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
SUDO="sudo"
|
||||
if [[ "${UID}" -eq 0 ]]; then
|
||||
# Don't use sudo if root
|
||||
SUDO=""
|
||||
fi
|
||||
|
||||
[[ ! -d "${data_dir}" ]] && mkdir --parents "${data_dir}"
|
||||
|
||||
if [[ $# -ne 0 ]]; then
|
||||
# Treat every command-line argument as a device to collect, not just the
# first one; keeping this an array matches the "${devices[@]}" loop below.
devices=("$@")
|
||||
else
|
||||
devices="$(smartctl --scan --json | "${json_tool}" "${json_args}" \
|
||||
".devices[].name | select(test(\"${smartctl_ignore_dev_regex}\") | not)")"
|
||||
mapfile -t devices <<< "${devices[@]}"
|
||||
fi
|
||||
|
||||
for device in "${devices[@]}"
|
||||
do
|
||||
echo -n "Collecting data for '${device}'..."
|
||||
# shellcheck disable=SC2086
|
||||
data="$($SUDO smartctl ${smartctl_args} ${device})"
|
||||
# Accommodate a smartmontools pre-7.3 bug
|
||||
data=${data#" Pending defect count:"}
|
||||
type="$(echo "${data}" | "${json_tool}" "${json_args}" '.device.type')"
|
||||
family="$(echo "${data}" | "${json_tool}" "${json_args}" \
|
||||
'select(.model_family != null) | .model_family | sub(" |/" ; "_" ; "g")
|
||||
| sub("\"|\\(|\\)" ; "" ; "g")')"
|
||||
model="$(echo "${data}" | "${json_tool}" "${json_args}" \
|
||||
'.model_name | sub(" |/" ; "_" ; "g") | sub("\"|\\(|\\)" ; "" ; "g")')"
|
||||
device_name="$(basename "${device}")"
|
||||
echo -e "\tSaving to ${device_name}.json"
|
||||
echo "${data}" > "${data_dir}/${device_name}.json"
|
||||
done
|
||||
9
collect-smartctl-json.timer
Normal file
9
collect-smartctl-json.timer
Normal file
@ -0,0 +1,9 @@
|
||||
[Unit]
|
||||
Description=Collect SMART data
|
||||
|
||||
[Timer]
|
||||
OnCalendar=hourly
|
||||
Persistent=true
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
18
compose.smartctl.yml
Normal file
18
compose.smartctl.yml
Normal file
@ -0,0 +1,18 @@
|
||||
---
|
||||
version: "3.8"
|
||||
services:
|
||||
smartctl:
|
||||
image: "prometheuscommunity/smartctl-exporter:v0.14.0"
|
||||
volumes:
|
||||
- "/dev:/dev"
|
||||
- "/var/lib/smartmontools/json:/debug"
|
||||
command:
|
||||
- "--smartctl.fake-data"
|
||||
- "--smartctl.interval=1h"
|
||||
networks:
|
||||
- "proxy"
|
||||
deploy:
|
||||
labels:
|
||||
- "prometheus.io/scrape=true"
|
||||
- "prometheus.io/port=9633"
|
||||
- "prometheus.io/path=/metrics"
|
||||
15
compose.yml
15
compose.yml
@ -10,17 +10,17 @@ services:
|
||||
target: /etc/alloy/config.alloy
|
||||
volumes:
|
||||
- /:/rootfs:ro
|
||||
- /var/run:/var/run:rw
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- /sys:/sys:ro
|
||||
- /var/lib/docker:/var/lib/docker:ro
|
||||
- /dev:/dev:ro
|
||||
- alloy-data:/var/lib/alloy/data
|
||||
command:
|
||||
- "run"
|
||||
- "--storage.path=/var/lib/alloy/data"
|
||||
- "--server.http.listen-addr=0.0.0.0:12345"
|
||||
- "/etc/alloy/config.alloy"
|
||||
networks:
|
||||
- proxy
|
||||
- internal
|
||||
secrets:
|
||||
- basic_auth
|
||||
@ -29,8 +29,15 @@ services:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "backupbot.backup=${ENABLE_BACKUPS:-true}"
|
||||
- "traefik.enable=false"
|
||||
- "coop-cloud.${STACK_NAME}.version=1.6.0+v1.8.1"
|
||||
- "traefik.enable=true"
|
||||
- "traefik.swarm.network=proxy"
|
||||
- "traefik.http.services.${STACK_NAME}-alloy.loadbalancer.server.port=12345"
|
||||
- "traefik.http.routers.${STACK_NAME}-alloy.rule=Host(`alloy.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-alloy.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-alloy.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-alloy.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-alloy.middlewares=basicauth@file"
|
||||
configs:
|
||||
config_alloy:
|
||||
template_driver: golang
|
||||
|
||||
@ -3,27 +3,48 @@ logging {
|
||||
format = "logfmt"
|
||||
}
|
||||
|
||||
livedebugging {
|
||||
enabled = {{ env "LIVE_DEBUGGING" }}
|
||||
}
|
||||
|
||||
discovery.docker "linux" {
|
||||
host = "unix:///var/run/docker.sock"
|
||||
}
|
||||
|
||||
{{ if ne (env "PROMETHEUS_REMOTE_WRITE_URL") "" }}
|
||||
prometheus.exporter.cadvisor "docker" {
|
||||
docker_only = true
|
||||
enabled_metrics = ["cpu", "cpuLoad", "disk", "diskIO", "memory", "network", "process"]
|
||||
}
|
||||
|
||||
prometheus.exporter.unix "default" {
|
||||
include_exporter_metrics = true
|
||||
rootfs_path = "/rootfs"
|
||||
procfs_path = "/rootfs/proc"
|
||||
sysfs_path = "/rootfs/sys"
|
||||
|
||||
disable_collectors = ["ipvs"]
|
||||
|
||||
filesystem {
|
||||
fs_types_exclude = "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
|
||||
mount_points_exclude = "^/(sys|proc|dev|host|etc)($|/)"
|
||||
mount_timeout = "5s"
|
||||
}
|
||||
|
||||
netclass { ignored_devices = "^(veth.*)$" }
|
||||
netdev { device_exclude = "^(veth.*)$" }
|
||||
}
|
||||
|
||||
prometheus.exporter.self "alloy" {}
|
||||
|
||||
prometheus.scrape "default" {
|
||||
scrape_interval = "120s"
|
||||
|
||||
targets = array.concat(
|
||||
[{
|
||||
job = "alloy",
|
||||
__address__ = "127.0.0.1:12345",
|
||||
}],
|
||||
prometheus.exporter.self.alloy.targets,
|
||||
prometheus.exporter.unix.default.targets,
|
||||
prometheus.exporter.cadvisor.docker.targets,
|
||||
discovery.docker.containers.targets,
|
||||
)
|
||||
|
||||
forward_to = [prometheus.remote_write.prometheus.receiver]
|
||||
@ -39,24 +60,176 @@ prometheus.remote_write "prometheus" {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
discovery.docker "containers" {
|
||||
host = "unix:///var/run/docker.sock"
|
||||
match_first_network = false
|
||||
}
|
||||
|
||||
// Scrape Prometheus metrics from other containers on this host.
|
||||
// Containers opt in via Docker labels:
|
||||
// prometheus.io/scrape=true required: enable scraping
|
||||
// prometheus.io/port=9090 optional: port exposing /metrics (defaults to first exposed port)
|
||||
// prometheus.io/path=/metrics optional: path to metrics endpoint (default: /metrics)
|
||||
// prometheus.io/auth=basic optional: use basic auth with the shared basic_auth secret
|
||||
discovery.dockerswarm "swarm" {
|
||||
host = "unix:///var/run/docker.sock"
|
||||
role = "services"
|
||||
}
|
||||
|
||||
discovery.relabel "metrics" {
|
||||
targets = discovery.dockerswarm.swarm.targets
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_dockerswarm_network_name"]
|
||||
regex = "proxy"
|
||||
action = "keep"
|
||||
}
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_dockerswarm_service_label_prometheus_io_scrape"]
|
||||
regex = "true"
|
||||
action = "keep"
|
||||
}
|
||||
|
||||
rule {
|
||||
source_labels = ["__address__", "__meta_dockerswarm_service_label_prometheus_io_port"]
|
||||
regex = `(.+):\d+;(\d+)`
|
||||
target_label = "__address__"
|
||||
replacement = "$1:$2"
|
||||
}
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_dockerswarm_service_label_prometheus_io_path"]
|
||||
regex = `(.+)`
|
||||
target_label = "__metrics_path__"
|
||||
}
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_dockerswarm_service_name"]
|
||||
target_label = "job"
|
||||
}
|
||||
}
|
||||
|
||||
discovery.relabel "metrics_noauth" {
|
||||
targets = discovery.relabel.metrics.output
|
||||
rule {
|
||||
source_labels = ["__meta_dockerswarm_service_label_prometheus_io_auth"]
|
||||
regex = "^$"
|
||||
action = "keep"
|
||||
}
|
||||
}
|
||||
|
||||
discovery.relabel "metrics_basicauth" {
|
||||
targets = discovery.relabel.metrics.output
|
||||
rule {
|
||||
source_labels = ["__meta_dockerswarm_service_label_prometheus_io_auth"]
|
||||
regex = "basic"
|
||||
action = "keep"
|
||||
}
|
||||
}
|
||||
|
||||
discovery.relabel "metrics_bearerauth" {
|
||||
targets = discovery.relabel.metrics.output
|
||||
rule {
|
||||
source_labels = ["__meta_dockerswarm_service_label_prometheus_io_auth"]
|
||||
regex = "bearer"
|
||||
action = "keep"
|
||||
}
|
||||
}
|
||||
|
||||
prometheus.scrape "containers" {
|
||||
scrape_interval = "120s"
|
||||
targets = discovery.relabel.metrics_noauth.output
|
||||
forward_to = [prometheus.remote_write.prometheus.receiver]
|
||||
}
|
||||
|
||||
prometheus.scrape "containers_basicauth" {
|
||||
scrape_interval = "120s"
|
||||
targets = discovery.relabel.metrics_basicauth.output
|
||||
forward_to = [prometheus.remote_write.prometheus.receiver]
|
||||
basic_auth {
|
||||
username = "admin"
|
||||
password = "{{ secret "basic_auth" }}"
|
||||
}
|
||||
}
|
||||
|
||||
prometheus.scrape "containers_bearerauth" {
|
||||
scrape_interval = "120s"
|
||||
targets = discovery.relabel.metrics_bearerauth.output
|
||||
forward_to = [prometheus.remote_write.prometheus.receiver]
|
||||
bearer_token = "{{ secret "basic_auth" }}"
|
||||
}
|
||||
{{ end }}
|
||||
|
||||
{{ if ne (env "LOKI_PUSH_URL") "" }}
|
||||
discovery.relabel "docker" {
|
||||
targets = discovery.docker.linux.targets
|
||||
|
||||
rule {
|
||||
source_labels = ["__meta_docker_container_name"]
|
||||
target_label = "container_name"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_docker_container_id"]
|
||||
target_label = "container_id"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_docker_container_label_com_docker_stack_namespace"]
|
||||
target_label = "stack_namespace"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_docker_container_label_com_docker_swarm_service_name"]
|
||||
target_label = "service_name"
|
||||
}
|
||||
rule {
|
||||
source_labels = ["__meta_docker_container_log_stream"]
|
||||
target_label = "stream"
|
||||
}
|
||||
}
|
||||
|
||||
loki.source.docker "docker" {
|
||||
host = "unix:///var/run/docker.sock"
|
||||
targets = discovery.docker.linux.targets
|
||||
targets = discovery.relabel.docker.output
|
||||
labels = {"app" = "docker"}
|
||||
forward_to = [loki.write.loki.receiver]
|
||||
}
|
||||
|
||||
// JOURNALD: reads the systemd journal binary log directly.
|
||||
// Use on systemd hosts (most modern Linux distros). Requires no syslogd.
|
||||
{{ if eq (env "JOURNALD") "1" }}
|
||||
loki.source.journal "journal" {
|
||||
path = "/var/log/journal"
|
||||
path = "/rootfs/var/log/journal"
|
||||
labels = { job = "{{ env "DOMAIN" }}" }
|
||||
forward_to = [loki.write.loki.receiver]
|
||||
}
|
||||
{{ end }}
|
||||
|
||||
// SYSLOG_FILES: tails all /var/log/*log files (syslog, auth.log, kern.log, etc.).
|
||||
// Use on non-systemd hosts where a syslogd writes to /var/log.
|
||||
{{ if eq (env "SYSLOG_FILES") "1" }}
|
||||
local.file_match "syslog_files" {
|
||||
path_targets = [{ __path__ = "/rootfs/var/log/*log" }]
|
||||
}
|
||||
|
||||
loki.source.file "syslog_files" {
|
||||
targets = local.file_match.syslog_files.targets
|
||||
forward_to = [loki.process.syslog_files.receiver]
|
||||
}
|
||||
|
||||
loki.process "syslog_files" {
|
||||
stage.static_labels {
|
||||
values = { job = "syslog" }
|
||||
}
|
||||
forward_to = [loki.write.loki.receiver]
|
||||
}
|
||||
{{ end }}
|
||||
|
||||
// SYSLOG: opens a network syslog listener on port 514.
|
||||
// Use when a remote device or a local syslogd configured to
|
||||
// forward over the network sends logs to this host.
|
||||
// Requires compose.syslog.yml to publish port 514 to the host.
|
||||
// This is NOT needed for reading local log files — use SYSLOG_FILES instead.
|
||||
{{ if eq (env "SYSLOG") "1" }}
|
||||
loki.relabel "syslog" {
|
||||
rule {
|
||||
@ -69,7 +242,7 @@ loki.relabel "syslog" {
|
||||
|
||||
loki.source.syslog "syslog" {
|
||||
listener {
|
||||
address = "[::1]:514"
|
||||
address = "[::]:514"
|
||||
label_structured_data = true
|
||||
labels = { component = "loki.source.syslog" }
|
||||
}
|
||||
@ -88,5 +261,6 @@ loki.write "loki" {
|
||||
password = "{{ secret "basic_auth" }}"
|
||||
}
|
||||
}
|
||||
external_labels = { hostname = "{{ env "DOMAIN" }}" }
|
||||
}
|
||||
{{ end }}
|
||||
|
||||
Reference in New Issue
Block a user