This commit is contained in:
decentral1se 2022-03-31 14:26:41 +02:00
commit 6886e0b1a1
Signed by: decentral1se
GPG Key ID: 03789458B3D0C410
11 changed files with 451 additions and 0 deletions

38
.env.sample Normal file
View File

@ -0,0 +1,38 @@
# Sample environment for this recipe. The *_VERSION values below are
# interpolated into the config and secret names declared in compose.yml.
TYPE=monitoring
STACK_NAME=gp_monitoring
LETS_ENCRYPT_ENV=production

# Grafana
GRAFANA_DOMAIN=g.monitor.autonomic.zone
GRAFANA_CUSTOM_INI_VERSION=v3
GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
SECRET_GRAFANA_OAUTH_CLIENT_SECRET_VERSION=v1

# Prometheus
PROMETHEUS_DOMAIN=p.monitor.autonomic.zone
PROMETHEUS_YML_VERSION=v10
PROMETHEUS_WEB_YML_VERSION=v2
SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION=v1
SECRET_PROMETHEUS_ADMIN_PASSWORD_HASHED_VERSION=v1

# Loki; the endpoint, region, access key id and bucket names are read by the
# aws storage section of loki.yml.tmpl.
LOKI_DOMAIN=l.monitor.autonomic.zone
LOKI_AWS_ENDPOINT=https://minio.autonomic.zone
LOKI_AWS_REGION=eu-west-1
LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
LOKI_BUCKET_NAMES=loki
LOKI_YML_VERSION=v7
SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1
SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION=v1

# Versions of the alertmanager, nginx and htpasswd config objects
ALERTMANAGER_CONFIG_VERSION=v2
NGINX_CONFIG_VERSION=v5
HTPASSWD_CONFIG_VERSION=v1

# Keycloak OpenID Connect endpoints, read by grafana_custom.ini
KEYCLOAK_AUTH_URL="https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/auth"
KEYCLOAK_API_URL="https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/userinfo"
KEYCLOAK_TOKEN_URL="https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/token"

# Alertmanager outgoing mail, read by alertmanager.yml.tmpl
ALERTMANAGER_SMTP_FROM=noreply@autonomic.zone
ALERTMANAGER_SMTP_HOST=mail.gandi.net:587
ALERTMANAGER_SMTP_TO=kaboom@autonomic.zone
SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION=v1

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
# Keep the .envrc at the repository root out of version control.
/.envrc

35
README.md Normal file
View File

@ -0,0 +1,35 @@
# monitoring-lite
A centralised grafana/prometheus/loki stack. This is an alternative approach to [`coop-cloud/monitoring`](https://git.coopcloud.tech/coop-cloud/monitoring) which does not include any of the services which actually gather metrics and/or logs. Instead, this is a useful recipe for folks who need to centralise their monitoring stack into a single grafana/prometheus/loki & several instances of node_exporter/cadvisor/promtail.
<!-- metadata -->
* **Category**: Apps
* **Status**: 2, beta
* **Image**: [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana), 4, upstream
* **Healthcheck**: 3
* **Backups**: 1
* **Email**: 3
* **Tests**: No
* **SSO**: 1
<!-- endmetadata -->
## Setup
This stack requires 3 domains, one each for grafana, prometheus & loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenID Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.
## Post-setup guide
- configure prometheus/loki/alertmanager as data sources in grafana under `Configuration > Data sources`
- for loki, you need to set a "Custom HTTP Header": `X-Scope-OrgID: fake`
- configure the SMTP mailer under `Alerting > Contact points`
- edit the default contact point, choose "Alertmanager" as type & `http://alertmanager:9093` as URL
- use the "Test" button to send a test mail. It should fire a request at the alertmanager & that should send a mail
- `abra app cp` your `scrape_configs: ...` into `/prometheus/scrape_configs` & log into your prometheus web UI to ensure they're working
- load your dashboards in manually under `Create > Dashboard`
- from your dashboard panels, choose `Edit > Alert` to create alerts based on those panels

13
alertmanager.yml.tmpl Normal file
View File

@ -0,0 +1,13 @@
# Alertmanager configuration template. The env and secret lookups are filled
# in when the config object is rendered, before Alertmanager parses the YAML.
global:
  # Outgoing mail settings. The sender address doubles as the SMTP login.
  smtp_from: {{ env "ALERTMANAGER_SMTP_FROM" }}
  smtp_smarthost: {{ env "ALERTMANAGER_SMTP_HOST" }}
  smtp_auth_username: {{ env "ALERTMANAGER_SMTP_FROM" }}
  # NOTE(review): rendered unquoted, so a password containing YAML specials
  # (for example " #" or ": ") would break parsing -- confirm the secret is
  # free of them.
  smtp_auth_password: {{ secret "alertmanager_smtp_password" }}

# Every alert is routed to the single mail receiver defined below.
route:
  receiver: "kaboom-mailer"

receivers:
  - name: "kaboom-mailer"
    email_configs:
      - to: {{ env "ALERTMANAGER_SMTP_TO" }}

196
compose.yml Normal file
View File

@ -0,0 +1,196 @@
---
version: "3.8"

services:
  # Grafana, published through Traefik on GRAFANA_DOMAIN.
  app:
    image: grafana/grafana:8.4.4
    volumes:
      - grafana-data:/var/lib/grafana:rw
    secrets:
      - grafana_admin_password
      - grafana_oauth_client_secret
    configs:
      - source: grafana_custom_ini
        target: /etc/grafana/grafana.ini
    networks:
      - proxy
      - internal
    environment:
      - GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
      # Passed through from the deploy environment; the grafana_custom_ini
      # template reads them with env lookups.
      - KEYCLOAK_API_URL
      - KEYCLOAK_AUTH_URL
      - KEYCLOAK_TOKEN_URL
    deploy:
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
        - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
        - "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
    healthcheck:
      test: "wget -q http://localhost:3000/ -O/dev/null"
      interval: 5s
      timeout: 10s
      retries: 3
      start_period: 10s

  # Prometheus, published through Traefik on PROMETHEUS_DOMAIN. The web config
  # (prometheus_web_yml) is loaded through --web.config.file.
  prometheus:
    image: prom/prometheus:v2.34.0
    secrets:
      - prometheus_admin_password
      - prometheus_admin_password_hashed
    volumes:
      - prometheus-data:/prometheus:rw
    configs:
      - source: prometheus_yml
        target: /etc/prometheus/prometheus.yml
      - source: prometheus_web_yml
        target: /etc/prometheus/prometheus_web.yml
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--web.config.file=/etc/prometheus/prometheus_web.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
      - "--web.console.templates=/usr/share/prometheus/consoles"
    networks:
      - proxy
      - internal
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        # Service name uses the same "-" separator as the router labels and
        # as the grafana/web services.
        - "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
        - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
        - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
        - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"

  # Alertmanager; attached to the internal network only, no Traefik labels.
  alertmanager:
    image: prom/alertmanager:v0.23.0
    volumes:
      # Mounted at the directory passed to --storage.path so Alertmanager
      # state lands in the named volume.
      - alertmanager-data:/alertmanager
    command:
      - "--config.file=/etc/alertmanager/config.yml"
      - "--storage.path=/alertmanager"
    networks:
      - internal
    secrets:
      - alertmanager_smtp_password
    configs:
      - source: alertmanager_config
        target: /etc/alertmanager/config.yml
    environment:
      - ALERTMANAGER_SMTP_FROM
      - ALERTMANAGER_SMTP_HOST
      - ALERTMANAGER_SMTP_TO

  # Nginx published through Traefik on LOKI_DOMAIN; it receives the
  # nginx_config and htpasswd_conf config objects.
  web:
    image: nginx:1.20.0
    networks:
      - proxy
      - internal
    environment:
      - LOKI_DOMAIN
      - STACK_NAME
    configs:
      - source: nginx_config
        target: /etc/nginx/nginx.conf
      - source: htpasswd_conf
        target: /etc/nginx/conf.d/loki.htpasswd
    secrets:
      - loki_admin_password_hashed
    deploy:
      restart_policy:
        condition: on-failure
      labels:
        - "traefik.enable=true"
        - "traefik.http.services.${STACK_NAME}-web.loadbalancer.server.port=80"
        - "traefik.http.routers.${STACK_NAME}-web.rule=Host(`${LOKI_DOMAIN}`)"
        - "traefik.http.routers.${STACK_NAME}-web.entrypoints=web-secure"
        # Explicit TLS flag, matching the grafana and prometheus routers.
        - "traefik.http.routers.${STACK_NAME}-web.tls=true"
        - "traefik.http.routers.${STACK_NAME}-web.tls.certresolver=${LETS_ENCRYPT_ENV}"

  # Loki; attached to the internal network only.
  loki:
    image: grafana/loki:2.0.0
    command: -config.file=/etc/loki/local-config.yaml
    networks:
      - internal
    configs:
      - source: loki_yml
        target: /etc/loki/local-config.yaml
    volumes:
      - loki-data:/loki
    secrets:
      - loki_aws_secret_access_key
    environment:
      - LOKI_ACCESS_KEY_ID
      - LOKI_AWS_ENDPOINT
      - LOKI_AWS_REGION
      - LOKI_BUCKET_NAMES
      - STACK_NAME
# Config objects, all rendered with the golang template driver. Each object
# name ends in a *_VERSION value taken from the environment.
configs:
  grafana_custom_ini:
    template_driver: golang
    name: ${STACK_NAME}_grafana_custom_ini_${GRAFANA_CUSTOM_INI_VERSION}
    file: grafana_custom.ini
  prometheus_yml:
    template_driver: golang
    name: ${STACK_NAME}_prometheus_yml_${PROMETHEUS_YML_VERSION}
    file: prometheus.yml.tmpl
  prometheus_web_yml:
    template_driver: golang
    name: ${STACK_NAME}_prometheus_web_yml_${PROMETHEUS_WEB_YML_VERSION}
    file: prometheus_web.yml.tmpl
  loki_yml:
    template_driver: golang
    name: ${STACK_NAME}_loki_yml_${LOKI_YML_VERSION}
    file: loki.yml.tmpl
  alertmanager_config:
    template_driver: golang
    name: ${STACK_NAME}_alertmanager_config_${ALERTMANAGER_CONFIG_VERSION}
    file: ./alertmanager.yml.tmpl
  nginx_config:
    template_driver: golang
    name: ${STACK_NAME}_nginx_config_${NGINX_CONFIG_VERSION}
    file: nginx.conf.tmpl
  htpasswd_conf:
    template_driver: golang
    name: ${STACK_NAME}_htpasswd_${HTPASSWD_CONFIG_VERSION}
    file: loki.htpasswd.tmpl
# Named volumes referenced by the services; all use default settings.
volumes:
  prometheus-data:
  grafana-data:
  loki-data:
  alertmanager-data:
networks:
  # Declared external: this stack attaches to it but does not create it.
  proxy:
    external: true
  # Stack-local network with default settings.
  internal:
# All secrets are external, so they must exist before the stack is deployed.
# Each name ends in a SECRET_*_VERSION value taken from the environment.
secrets:
  loki_aws_secret_access_key:
    external: true
    name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
  grafana_admin_password:
    external: true
    name: ${STACK_NAME}_grafana_admin_password_${SECRET_GRAFANA_ADMIN_PASSWORD_VERSION}
  grafana_oauth_client_secret:
    external: true
    name: ${STACK_NAME}_grafana_oauth_client_secret_${SECRET_GRAFANA_OAUTH_CLIENT_SECRET_VERSION}
  prometheus_admin_password_hashed:
    external: true
    name: ${STACK_NAME}_prometheus_admin_password_hashed_${SECRET_PROMETHEUS_ADMIN_PASSWORD_HASHED_VERSION}
  prometheus_admin_password:
    external: true
    name: ${STACK_NAME}_prometheus_admin_password_${SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION}
  alertmanager_smtp_password:
    external: true
    name: ${STACK_NAME}_alertmanager_smtp_password_${SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION}
  loki_admin_password_hashed:
    external: true
    name: ${STACK_NAME}_loki_admin_password_hashed_${SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION}

30
grafana_custom.ini Normal file
View File

@ -0,0 +1,30 @@
; Grafana settings; compose.yml mounts this file at /etc/grafana/grafana.ini.
; The secret and env lookups are resolved when the config object is rendered.

[analytics]
reporting_enabled = false

[snapshots]
external_enabled = false

[plugins]
enable_alpha = true

[users]
; NOTE(review): combined with allow_sign_up under auth.generic_oauth, this
; looks like every newly signed-up user gets the Admin role -- confirm that
; is intended.
auto_assign_org_role = Admin

; The login form and basic auth are switched off; generic OAuth is the
; sign-in method enabled in this file.
[auth]
disable_login_form = true

[auth.basic]
enabled = false

[auth.generic_oauth]
enabled = true
name = id.autonomic.zone
icon = signin
allow_sign_up = true
scopes = openid email profile
tls_skip_verify_insecure = false
client_id = grafana
client_secret = {{ secret "grafana_oauth_client_secret" }}
auth_url = {{ env "KEYCLOAK_AUTH_URL" }}
token_url = {{ env "KEYCLOAK_TOKEN_URL" }}
api_url = {{ env "KEYCLOAK_API_URL" }}

1
loki.htpasswd.tmpl Normal file
View File

@ -0,0 +1 @@
# Single basic-auth entry (user "loki"); nginx.conf.tmpl points
# auth_basic_user_file at the rendered copy of this file.
loki:{{ secret "loki_admin_password_hashed" }}

77
loki.yml.tmpl Normal file
View File

@ -0,0 +1,77 @@
# Loki configuration template. The env and secret lookups are filled in when
# the config object is rendered, before Loki parses the YAML.
auth_enabled: false

# Rule files live on local disk under /loki; alerts go to the alertmanager
# service on port 9093.
ruler:
  storage:
    type: local
    local:
      directory: /loki/rules
  rule_path: /loki/scratch
  alertmanager_url: http://alertmanager:9093
  enable_api: true
  enable_alertmanager_v2: true
  ring:
    kvstore:
      store: inmemory

server:
  http_listen_port: 3100

distributor:
  ring:
    kvstore:
      store: memberlist

ingester:
  lifecycler:
    ring:
      kvstore:
        store: memberlist
      replication_factor: 1
    final_sleep: 0s
  chunk_idle_period: 5m
  chunk_retain_period: 30s

memberlist:
  abort_if_cluster_join_fails: false
  bind_port: 7946
  join_members:
    # Stack-qualified name of the loki service, on the bind port above.
    - {{ env "STACK_NAME" }}_loki:7946
  max_join_backoff: 1m
  max_join_retries: 10
  min_join_backoff: 1s

schema_config:
  configs:
    - from: 2020-11-25
      store: boltdb-shipper
      object_store: aws
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/index
    cache_location: /loki/index_cache
    resync_interval: 5s
    shared_store: aws
  # Object store settings; endpoint, region, key id and bucket names come
  # from the environment and the secret key from a secret.
  aws:
    endpoint: {{ env "LOKI_AWS_ENDPOINT" }}
    region: {{ env "LOKI_AWS_REGION" }}
    access_key_id: {{ env "LOKI_ACCESS_KEY_ID" }}
    secret_access_key: {{ secret "loki_aws_secret_access_key" }}
    bucketnames: {{ env "LOKI_BUCKET_NAMES" }}
    insecure: false
    sse_encryption: false
    http_config:
      idle_conn_timeout: 90s
      response_header_timeout: 0s
      insecure_skip_verify: false
    s3forcepathstyle: true

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: 168h

43
nginx.conf.tmpl Normal file
View File

@ -0,0 +1,43 @@
# Reverse proxy that puts HTTP basic auth in front of Loki. The env lookups
# are filled in when the config object is rendered.
user www-data;

events {
    worker_connections 768;
}

http {
    include /etc/nginx/mime.types;

    # Value for the upstream Connection header: "upgrade" when the client
    # sent an Upgrade header, "close" otherwise.
    map $http_upgrade $connection_upgrade {
        default upgrade;
        ''      close;
    }

    server {
        listen 80;
        server_name {{ env "LOKI_DOMAIN" }};

        # Basic auth applies to every location unless switched off below.
        auth_basic "loki";
        auth_basic_user_file /etc/nginx/conf.d/loki.htpasswd;

        location / {
            proxy_read_timeout 1800s;
            proxy_connect_timeout 1600s;
            proxy_pass http://{{ env "STACK_NAME" }}_loki:3100;
            proxy_http_version 1.1;
            proxy_set_header Upgrade $http_upgrade;
            # Connection is set exactly once, from the map above. A second
            # hard-coded "Keep-Alive" value would contradict the mapped value.
            proxy_set_header Connection $connection_upgrade;
            proxy_set_header Proxy-Connection "Keep-Alive";
            proxy_redirect off;
        }

        # /ready is proxied without credentials.
        location /ready {
            proxy_pass http://{{ env "STACK_NAME" }}_loki:3100;
            proxy_http_version 1.1;
            proxy_set_header Connection "Keep-Alive";
            proxy_set_header Proxy-Connection "Keep-Alive";
            proxy_redirect off;
            auth_basic off;
        }
    }
}

15
prometheus.yml.tmpl Normal file
View File

@ -0,0 +1,15 @@
# Prometheus configuration: 30s scrape and rule evaluation intervals.
global:
  scrape_interval: 30s
  evaluation_interval: 30s

# Alerts are sent to the alertmanager service on port 9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Targets are discovered from *.yml files under /prometheus/scrape_configs.
scrape_configs:
  - job_name: "default"
    file_sd_configs:
      - files:
          - /prometheus/scrape_configs/*.yml

2
prometheus_web.yml.tmpl Normal file
View File

@ -0,0 +1,2 @@
# Prometheus web config: one basic-auth user, "admin", whose hashed password
# is read from a secret when the config object is rendered.
basic_auth_users:
  admin: {{ secret "prometheus_admin_password_hashed" }}