Initial packaging of a prometheus / loki / grafana monitoring system

This is a work in progress
This commit is contained in:
mirsal 2021-07-17 20:25:30 +00:00
parent 2cbc1a6de1
commit d5607c39cd
8 changed files with 327 additions and 24 deletions

View File

@ -1,6 +1,14 @@
TYPE=monitoring
DOMAIN=monitoring.example.com
## Domain aliases
#EXTRA_DOMAINS=', `www.monitoring.example.com`'
# Hostname Traefik routes to the grafana service; the compose file also uses it
# to build GF_SERVER_ROOT_URL.
GRAFANA_DOMAIN=grafana.example.com
# Hostname Traefik routes to the prometheus service (container port 9090).
PROMETHEUS_DOMAIN=prometheus.example.com
# Used by the compose file as the Traefik certificate resolver name.
LETS_ENCRYPT_ENV=production
# Optional Grafana settings. The compose file forwards the five variables below
# to the grafana container by name, so uncomment one and give it a value
# (NAME=value) to set it.
#GF_SMTP_HOST
#GF_SMTP_ENABLED
#GF_SMTP_FROM_ADDRESS
#GF_SMTP_SKIP_VERIFY
#GF_SECURITY_ALLOW_EMBEDDING
# NOTE(review): the compose file assigns the two variables below itself, so
# setting them here looks like it has no effect - confirm.
#GF_INSTALL_PLUGINS=grafana-piechart-panel
#GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}

View File

@ -3,14 +3,14 @@
A server and application monitoring stack based on Prometheus, Loki and Grafana.
<!-- metadata -->
* **Category**:
* **Status**:
* **Image**: [`monitoring`](https://hub.docker.com/r/monitoring/monitoring)
* **Healthcheck**:
* **Backups**:
* **Email**:
* **Tests**:
* **SSO**:
* **Category**: Monitoring
* **Status**: ❷💛
* **Images**: [`prom/prometheus`](https://hub.docker.com/r/prom/prometheus) [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana) [`grafana/loki`](https://hub.docker.com/r/grafana/loki)
* **Healthcheck**: ❶💚
* **Backups**:
* **Email**: ❶💚
* **Tests**:
* **SSO**: ❸🍎
<!-- endmetadata -->
## Basic usage
@ -19,7 +19,7 @@ A server and application monitoring stack based on Prometheus, Loki and Grafana.
2. Deploy [`coop-cloud/traefik`]
3. `abra app new monitoring --secrets` (optionally with `--pass` if you'd like
to save secrets in `pass`)
4. `abra app YOURAPPDOMAIN config` - be sure to change `DOMAIN` to something that resolves to
4. `abra app YOURAPPDOMAIN config` - be sure to change `GRAFANA_DOMAIN` and `PROMETHEUS_DOMAIN` to something that resolves to
your Docker swarm box
5. `abra app YOURAPPDOMAIN deploy`
6. Open the configured Grafana domain (`GRAFANA_DOMAIN`) in your browser to finish set-up

4
abra.sh Normal file
View File

@ -0,0 +1,4 @@
# Version suffixes for the stack's Docker configs. The compose file interpolates
# each value into the matching config name, e.g.
# ${STACK_NAME}_prometheus_yml_${PROMETHEUS_YML_VERSION}.
# NOTE(review): presumably a suffix has to be bumped whenever the corresponding
# file changes, so that a new config object is created - confirm.
export PROMETHEUS_YML_VERSION=v1
export PROMTAIL_YML_VERSION=v1
export LOKI_YML_VERSION=v1
export NODE_EXPORTER_ENTRYPOINT_VERSION=v1

View File

@ -2,30 +2,167 @@
version: "3.8"
services:
app:
image: nginx:1.19.2
node_exporter:
  # Host metrics exporter, scheduled once per swarm node (global mode).
  image: prom/node-exporter:v1.0.1
  user: root
  # The wrapper script writes the node metadata textfile, then starts
  # node_exporter with the arguments listed under `command`.
  entrypoint:
    - "/bin/sh"
    - "-e"
    - "/entrypoint.sh"
  command:
    - "--path.sysfs=/host/sys"
    - "--path.procfs=/host/proc"
    - "--path.rootfs=/rootfs"
    - "--collector.textfile.directory=/etc/node-exporter/"
    # `$$` is the compose escape for a literal `$` in the regex.
    - "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)"
    - "--no-collector.ipvs"
  environment:
    # Service template placeholder; read by the entrypoint script as $NODE_ID.
    - "NODE_ID={{.Node.ID}}"
  configs:
    - source: node_exporter_entrypoint_sh
      target: /entrypoint.sh
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
    # Host name file, read by the entrypoint script to label the node.
    - /etc/hostname:/etc/nodename
  networks:
    - proxy
    - exporters
  deploy:
    mode: global
    endpoint_mode: dnsrr
cadvisor:
  # Container metrics exporter, scheduled once per swarm node (global mode).
  # NOTE(review): `latest` is an unpinned tag; consider pinning a release.
  image: google/cadvisor:latest
  command: -logtostderr -docker_only
  volumes:
    # cAdvisor only needs to read host state, so every host path is mounted
    # read-only instead of handing the container write access to `/`,
    # `/var/run` and the Docker data directory.
    - /var/lib/docker/:/var/lib/docker:ro
    - /dev/disk/:/dev/disk:ro
    - /sys:/sys:ro
    - /var/run:/var/run:ro
    - /:/rootfs:ro
  networks:
    - exporters
  deploy:
    mode: global
    endpoint_mode: dnsrr
loki:
  # Log store; started with the config file mounted from the `loki_yml` config.
  image: grafana/loki:2.0.0
  command: "-config.file=/etc/loki/local-config.yaml"
  configs:
    - source: loki_yml
      target: /etc/loki/local-config.yaml
  volumes:
    - loki-data:/loki
  networks:
    - loki
    - api
  # NOTE(review): port 3100 is published directly on the host while loki.yml
  # sets `auth_enabled: false` - confirm this exposure is intended.
  ports:
    - target: 3100
      published: 3100
      protocol: tcp
      mode: host
  deploy:
    endpoint_mode: dnsrr
promtail:
  # Log shipper, scheduled once per swarm node (global mode); tails the host's
  # /var/log and pushes to Loki over the `loki` network.
  image: grafana/promtail:2.0.0
  command: -config.file=/etc/promtail/config.yml
  configs:
    # Mount the stack's promtail.yml at the path passed to -config.file.
    # Without this the declared `promtail_yml` config is never used and the
    # container runs with whatever config the image ships.
    - source: promtail_yml
      target: /etc/promtail/config.yml
  volumes:
    - /var/log:/var/log
  networks:
    - loki
  deploy:
    mode: global
    endpoint_mode: dnsrr
prometheus:
  # Metrics store and web UI, served by Traefik on ${PROMETHEUS_DOMAIN}.
  # NOTE(review): `latest` is an unpinned tag; consider pinning a release.
  image: prom/prometheus:latest
  volumes:
    - prometheus-data:/prometheus:rw
  configs:
    - source: prometheus_yml
      target: /etc/prometheus/prometheus.yml
  networks:
    # Reachable as `prometheus_api` on the api network (shared with grafana).
    api:
      aliases:
        - prometheus_api
    exporters: null
    proxy: null
  deploy:
    restart_policy:
      condition: on-failure
    labels:
      - "traefik.enable=true"
      # Only one Traefik service is declared for this container; the routers
      # below do not name a service explicitly and rely on that.
      - "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
      - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)"
      - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
      - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
      # The certificate resolver is an option of this router's `tls` section.
      - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
      # Plain-HTTP requests for the same host are redirected to HTTPS.
      - "traefik.http.middlewares.${STACK_NAME}-http-to-https.redirectscheme.scheme=https"
      - "traefik.http.middlewares.${STACK_NAME}-http-to-https.redirectscheme.permanent=true"
      - "traefik.http.routers.${STACK_NAME}-prometheus-redirect.rule=Host(`${PROMETHEUS_DOMAIN}`)"
      - "traefik.http.routers.${STACK_NAME}-prometheus-redirect.middlewares=${STACK_NAME}-http-to-https@docker"
  healthcheck:
    # curl is not shipped in the BusyBox-based prom/prometheus image, wget is;
    # /-/healthy is Prometheus' own health endpoint.
    test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"]
    interval: 30s
    timeout: 10s
    retries: 10
    start_period: 1m
grafana:
  # Dashboard UI, served by Traefik on ${GRAFANA_DOMAIN}.
  # NOTE(review): the image has no tag, so the default tag is pulled; consider
  # pinning a release.
  image: grafana/grafana
  volumes:
    - grafana-data:/var/lib/grafana:rw
  networks:
    - api
    - proxy
  environment:
    # Entries without a value are passed through from the deployment
    # environment when they are set there.
    - GF_SMTP_HOST
    - GF_SMTP_ENABLED
    - GF_SMTP_FROM_ADDRESS
    - GF_SMTP_SKIP_VERIFY
    - GF_SECURITY_ALLOW_EMBEDDING
    - GF_INSTALL_PLUGINS=grafana-piechart-panel
    - GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
  deploy:
    labels:
      - "traefik.enable=true"
      - "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
      - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
      - "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
      - "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
      # The certificate resolver is an option of this router's `tls` section.
      - "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
      # Plain-HTTP requests for the same host are redirected to HTTPS.
      - "traefik.http.middlewares.${STACK_NAME}-http-to-https.redirectscheme.scheme=https"
      - "traefik.http.middlewares.${STACK_NAME}-http-to-https.redirectscheme.permanent=true"
      - "traefik.http.routers.${STACK_NAME}-grafana-redirect.rule=Host(`${GRAFANA_DOMAIN}`)"
      - "traefik.http.routers.${STACK_NAME}-grafana-redirect.middlewares=${STACK_NAME}-http-to-https@docker"
# Docker configs built from files next to the compose file. Each name carries
# the stack name and a version suffix taken from the *_VERSION variables.
configs:
  prometheus_yml:
    file: prometheus.yml
    name: "${STACK_NAME}_prometheus_yml_${PROMETHEUS_YML_VERSION}"
  promtail_yml:
    file: promtail.yml
    name: "${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION}"
  loki_yml:
    file: loki.yml
    name: "${STACK_NAME}_loki_yml_${LOKI_YML_VERSION}"
  node_exporter_entrypoint_sh:
    file: node-exporter-entrypoint.sh
    name: "${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION}"
# Named volumes with default settings, one per stateful service.
volumes:
  prometheus-data: null
  grafana-data: null
  loki-data: null
networks:
  # Shared by prometheus, grafana and loki.
  api:
    driver: overlay
  # Joins prometheus to node_exporter and cadvisor.
  exporters:
    attachable: true
    driver: overlay
  # Joins promtail to loki.
  loki:
    attachable: true
    driver: overlay
  # Created outside this stack; prometheus, grafana and node_exporter attach to it.
  proxy:
    external: true

71
loki.yml Normal file
View File

@ -0,0 +1,71 @@
# Loki configuration, mounted into the loki service as
# /etc/loki/local-config.yaml.
auth_enabled: false

server:
  http_listen_port: 3100

# Distributor and ingester both keep their ring state in memberlist.
distributor:
  ring:
    kvstore:
      store: memberlist

ingester:
  lifecycler:
    ring:
      kvstore:
        store: memberlist
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 5m
  chunk_retain_period: 30s

memberlist:
  abort_if_cluster_join_fails: false
  # Gossip port; every Loki instance listed in join_members must be reachable
  # on it.
  bind_port: 7946
  # `loki` is the name of the Loki service in the compose file.
  join_members:
    - loki:7946
  max_join_backoff: 1m
  max_join_retries: 10
  min_join_backoff: 1s

schema_config:
  configs:
    - from: 2020-11-25
      store: boltdb-shipper
      object_store: aws
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/index
    cache_location: /loki/index_cache
    resync_interval: 5s
    shared_store: aws
  # NOTE(review): the connection settings below are empty while both
  # object_store and shared_store point at `aws`; presumably they must be
  # filled in before deploying - confirm.
  aws:
    endpoint: null
    region: null
    access_key_id: null
    secret_access_key: null
    bucketnames: null
    insecure: false
    sse_encryption: false
    http_config:
      idle_conn_timeout: 90s
      response_header_timeout: 0s
      insecure_skip_verify: false
    s3forcepathstyle: true

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

View File

@ -0,0 +1,9 @@
#!/bin/sh -e
# Entrypoint wrapper for node_exporter: writes a `node_meta` metric describing
# this node to a .prom file, then replaces itself with node_exporter.

# Node name, read from the file at /etc/nodename.
NODE_NAME="$(cat /etc/nodename)"

# Ensure the directory that receives the metadata file exists.
mkdir -p /etc/node-exporter

# Constant-value metric whose labels carry $NODE_ID and the node name.
echo "node_meta{node_id=\"$NODE_ID\", container_label_com_docker_swarm_node_id=\"$NODE_ID\", node_name=\"$NODE_NAME\"} 1" \
  > /etc/node-exporter/node-meta.prom

# Hand over to node_exporter, forwarding every argument given to this script.
exec /bin/node_exporter "$@"

56
prometheus.yml Normal file
View File

@ -0,0 +1,56 @@
global:
  # Scrape targets every 30 seconds. Default is every 1 minute.
  scrape_interval: 30s
  # Evaluate rules every 30 seconds. The default is every 1 minute.
  evaluation_interval: 30s
  # scrape_timeout is left at the global default (10s).
# Alertmanager configuration. No target is enabled: the only entry is the
# commented-out example below.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093

# Rule files are loaded once and evaluated periodically according to the
# global 'evaluation_interval'. None are enabled here.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
# Scrape jobs: Prometheus itself, plus node-exporter, cadvisor and traefik
# discovered through DNS A-record lookups.
# Each job name is added as a label `job=<job_name>` to the scraped timeseries.
scrape_configs:
  # Prometheus' own metrics; metrics_path defaults to '/metrics' and the
  # scheme to 'http'.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - localhost:9090

  # Equivalent to http://node_exporter:9100/metrics for each resolved address.
  - job_name: node-exporter
    scrape_interval: 10s
    metrics_path: "/metrics"
    dns_sd_configs:
      - names:
          - "tasks.node_exporter"
        type: "A"
        port: 9100

  - job_name: "cadvisor"
    scrape_interval: 30s
    metrics_path: "/metrics"
    dns_sd_configs:
      - names:
          - "tasks.cadvisor"
        type: "A"
        port: 8080

  # NOTE(review): assumes a service resolvable as `traefik_app` that serves
  # metrics on port 8082 - confirm against the traefik deployment.
  - job_name: "traefik"
    scrape_interval: 30s
    metrics_path: "/metrics"
    dns_sd_configs:
      - names:
          - "tasks.traefik_app"
        type: "A"
        port: 8082

18
promtail.yml Normal file
View File

@ -0,0 +1,18 @@
# Promtail configuration: tail files matching /var/log/*log and push them to Loki.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

# File in which promtail records its read positions.
# NOTE(review): /tmp is not backed by a volume in the compose file, so this
# file presumably does not survive container replacement - confirm.
positions:
  filename: /tmp/positions.yaml

clients:
  - url: "http://loki:3100/loki/api/v1/push"

scrape_configs:
  - job_name: system
    static_configs:
      - targets:
          - localhost
        labels:
          job: varlogs
          __path__: "/var/log/*log"