commit 6886e0b1a137a060c7d8591517e4afbd8576cb8a
Author: decentral1se
Date:   Thu Mar 31 14:26:41 2022 +0200

    init

diff --git a/.env.sample b/.env.sample
new file mode 100644
index 0000000..acff59a
--- /dev/null
+++ b/.env.sample
@@ -0,0 +1,46 @@
+# Stack identity and the traefik certificate resolver
+TYPE=monitoring
+STACK_NAME=gp_monitoring
+LETS_ENCRYPT_ENV=production
+
+# Grafana
+GRAFANA_DOMAIN=g.monitor.autonomic.zone
+GRAFANA_CUSTOM_INI_VERSION=v3
+GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
+SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
+SECRET_GRAFANA_OAUTH_CLIENT_SECRET_VERSION=v1
+
+# Prometheus
+PROMETHEUS_DOMAIN=p.monitor.autonomic.zone
+PROMETHEUS_YML_VERSION=v10
+PROMETHEUS_WEB_YML_VERSION=v2
+SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION=v1
+SECRET_PROMETHEUS_ADMIN_PASSWORD_HASHED_VERSION=v1
+
+# Loki and its S3-compatible object store
+LOKI_DOMAIN=l.monitor.autonomic.zone
+LOKI_AWS_ENDPOINT=https://minio.autonomic.zone
+LOKI_AWS_REGION=eu-west-1
+LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
+LOKI_BUCKET_NAMES=loki
+LOKI_YML_VERSION=v7
+SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1
+SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION=v1
+
+# Alertmanager
+ALERTMANAGER_CONFIG_VERSION=v2
+
+# Nginx proxy in front of Loki
+NGINX_CONFIG_VERSION=v5
+HTPASSWD_CONFIG_VERSION=v1
+
+# Keycloak OpenID Connect endpoints used for Grafana sign-in
+KEYCLOAK_AUTH_URL=https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/auth
+KEYCLOAK_API_URL=https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/userinfo
+KEYCLOAK_TOKEN_URL=https://id.autonomic.zone/auth/realms/autonomic/protocol/openid-connect/token
+
+# Alertmanager SMTP settings
+ALERTMANAGER_SMTP_FROM=noreply@autonomic.zone
+ALERTMANAGER_SMTP_HOST=mail.gandi.net:587
+ALERTMANAGER_SMTP_TO=kaboom@autonomic.zone
+SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION=v1
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..37b52cc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+/.envrc
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..46f2665
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# monitoring-lite
+
+A centralised grafana/prometheus/loki stack.
This is an alternative approach to [`coop-cloud/monitoring`](https://git.coopcloud.tech/coop-cloud/monitoring) which does not include any of the services which actually gather metrics and/or logs. Instead, this is a useful recipe for folks who need to centralise their monitoring stack into a single grafana/prometheus/loki & several instances of node_exporter/cadvisor/promtail.
+
+
+
+* **Category**: Apps
+* **Status**: 2, beta
+* **Image**: [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana), 4, upstream
+* **Healthcheck**: 3
+* **Backups**: 1
+* **Email**: 3
+* **Tests**: No
+* **SSO**: 1
+
+
+
+## Setup
+
+This stack requires 3 domains, one each for grafana, prometheus & loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenID Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.
+
+## Post-setup guide
+
+- configure prometheus/loki/alertmanager as data sources in grafana under `Configuration > Data sources`
+  - for loki, you need to set a "Custom HTTP Header": `X-Scope-OrgID: fake`
+
+- configure the SMTP mailer under `Alerting > Contact points`
+  - edit the default contact point, choose "Alertmanager" as type & `http://alertmanager:9093` as URL
+  - use the "Test" button to send a test mail.
It should fire a request at the alertmanager & that should send a mail
+
+- `abra app cp` your `scrape_configs: ...` into `/prometheus/scrape_configs` & log into your prometheus web UI to ensure they're working
+
+- load your dashboards in manually under `Create > Dashboard`
+
+- from your dashboard panels, choose `Edit > Alert` to create alerts based on those panels
diff --git a/alertmanager.yml.tmpl b/alertmanager.yml.tmpl
new file mode 100644
index 0000000..9f3d795
--- /dev/null
+++ b/alertmanager.yml.tmpl
@@ -0,0 +1,15 @@
+# Alertmanager config: every alert is mailed to a single receiver over SMTP.
+# env and secret lookups are filled in by the golang config template driver.
+global:
+  smtp_from: {{ env "ALERTMANAGER_SMTP_FROM" }}
+  smtp_smarthost: {{ env "ALERTMANAGER_SMTP_HOST" }}
+  smtp_auth_username: {{ env "ALERTMANAGER_SMTP_FROM" }}
+  smtp_auth_password: {{ secret "alertmanager_smtp_password" }}
+
+route:
+  receiver: "kaboom-mailer"
+
+receivers:
+  - name: "kaboom-mailer"
+    email_configs:
+      - to: {{ env "ALERTMANAGER_SMTP_TO" }}
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 0000000..6a80dbb
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,203 @@
+---
+version: "3.8"
+
+services:
+  # Grafana UI, published through traefik on GRAFANA_DOMAIN
+  app:
+    image: grafana/grafana:8.4.4
+    volumes:
+      - grafana-data:/var/lib/grafana:rw
+    secrets:
+      - grafana_admin_password
+      - grafana_oauth_client_secret
+    configs:
+      - source: grafana_custom_ini
+        target: /etc/grafana/grafana.ini
+    networks:
+      - proxy
+      - internal
+    environment:
+      - GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
+      - GF_SECURITY_ADMIN_PASSWORD__FILE=/run/secrets/grafana_admin_password
+      - KEYCLOAK_API_URL
+      - KEYCLOAK_AUTH_URL
+      - KEYCLOAK_TOKEN_URL
+    deploy:
+      labels:
+        - "traefik.enable=true"
+        - "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
+        - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
+        - "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
+        - "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
+        - "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
+    healthcheck:
+      test: "wget -q http://localhost:3000/ -O/dev/null"
+      interval: 5s
+      timeout: 10s
+      retries: 3
+      start_period: 10s
+
+  # Prometheus, published through traefik on PROMETHEUS_DOMAIN
+  prometheus:
+    image: prom/prometheus:v2.34.0
+    secrets:
+      - prometheus_admin_password
+      - prometheus_admin_password_hashed
+    volumes:
+      - prometheus-data:/prometheus:rw
+    configs:
+      - source: prometheus_yml
+        target: /etc/prometheus/prometheus.yml
+      - source: prometheus_web_yml
+        target: /etc/prometheus/prometheus_web.yml
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--web.config.file=/etc/prometheus/prometheus_web.yml"
+      - "--storage.tsdb.path=/prometheus"
+      - "--web.console.libraries=/usr/share/prometheus/console_libraries"
+      - "--web.console.templates=/usr/share/prometheus/consoles"
+    networks:
+      - proxy
+      - internal
+    deploy:
+      restart_policy:
+        condition: on-failure
+      labels:
+        - "traefik.enable=true"
+        - "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090"
+        - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)"
+        - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
+        - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
+        - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
+
+  # Alertmanager, attached to the internal network only
+  alertmanager:
+    image: prom/alertmanager:v0.23.0
+    volumes:
+      # Persist the directory given to --storage.path; the config file is
+      # delivered separately through the configs entry below
+      - alertmanager-data:/alertmanager
+    command:
+      - "--config.file=/etc/alertmanager/config.yml"
+      - "--storage.path=/alertmanager"
+    networks:
+      - internal
+    secrets:
+      - alertmanager_smtp_password
+    configs:
+      - source: alertmanager_config
+        target: /etc/alertmanager/config.yml
+    environment:
+      - ALERTMANAGER_SMTP_FROM
+      - ALERTMANAGER_SMTP_HOST
+      - ALERTMANAGER_SMTP_TO
+
+  # Nginx reverse proxy that fronts Loki on LOKI_DOMAIN
+  web:
+    image: nginx:1.20.0
+    networks:
+      - proxy
+      - internal
+    environment:
+      - LOKI_DOMAIN
+      - STACK_NAME
+    configs:
+      - source: nginx_config
+        target: /etc/nginx/nginx.conf
+      - source: htpasswd_conf
+        target: /etc/nginx/conf.d/loki.htpasswd
+    secrets:
+      - loki_admin_password_hashed
+    deploy:
+      restart_policy:
+        condition: on-failure
+      labels:
+        - "traefik.enable=true"
+        - "traefik.http.services.${STACK_NAME}-web.loadbalancer.server.port=80"
+        - "traefik.http.routers.${STACK_NAME}-web.rule=Host(`${LOKI_DOMAIN}`)"
+        - "traefik.http.routers.${STACK_NAME}-web.entrypoints=web-secure"
+        - "traefik.http.routers.${STACK_NAME}-web.tls.certresolver=${LETS_ENCRYPT_ENV}"
+
+  # Loki, attached to the internal network only (the web service proxies to it)
+  loki:
+    image: grafana/loki:2.0.0
+    command: -config.file=/etc/loki/local-config.yaml
+    networks:
+      - internal
+    configs:
+      - source: loki_yml
+        target: /etc/loki/local-config.yaml
+    volumes:
+      - loki-data:/loki
+    secrets:
+      - loki_aws_secret_access_key
+    environment:
+      - LOKI_ACCESS_KEY_ID
+      - LOKI_AWS_ENDPOINT
+      - LOKI_AWS_REGION
+      - LOKI_BUCKET_NAMES
+      - STACK_NAME
+
+configs:
+  grafana_custom_ini:
+    template_driver: golang
+    name: ${STACK_NAME}_grafana_custom_ini_${GRAFANA_CUSTOM_INI_VERSION}
+    file: grafana_custom.ini
+  prometheus_yml:
+    template_driver: golang
+    name: ${STACK_NAME}_prometheus_yml_${PROMETHEUS_YML_VERSION}
+    file: prometheus.yml.tmpl
+  prometheus_web_yml:
+    template_driver: golang
+    name: ${STACK_NAME}_prometheus_web_yml_${PROMETHEUS_WEB_YML_VERSION}
+    file: prometheus_web.yml.tmpl
+  loki_yml:
+    template_driver: golang
+    name: ${STACK_NAME}_loki_yml_${LOKI_YML_VERSION}
+    file: loki.yml.tmpl
+  alertmanager_config:
+    template_driver: golang
+    name: ${STACK_NAME}_alertmanager_config_${ALERTMANAGER_CONFIG_VERSION}
+    file: ./alertmanager.yml.tmpl
+  nginx_config:
+    template_driver: golang
+    name: ${STACK_NAME}_nginx_config_${NGINX_CONFIG_VERSION}
+    file: nginx.conf.tmpl
+  htpasswd_conf:
+    template_driver: golang
+    name: ${STACK_NAME}_htpasswd_${HTPASSWD_CONFIG_VERSION}
+    file: loki.htpasswd.tmpl
+
+volumes:
+  prometheus-data:
+  grafana-data:
+  loki-data:
+  alertmanager-data:
+
+networks:
+  proxy:
+    external: true
+  internal:
+
+secrets:
+  loki_aws_secret_access_key:
+    external: true
+    name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
+  grafana_admin_password:
+    external: true
+    name: ${STACK_NAME}_grafana_admin_password_${SECRET_GRAFANA_ADMIN_PASSWORD_VERSION}
+  grafana_oauth_client_secret:
+    external: true
+    name: ${STACK_NAME}_grafana_oauth_client_secret_${SECRET_GRAFANA_OAUTH_CLIENT_SECRET_VERSION}
+  prometheus_admin_password_hashed:
+    external: true
+    name: ${STACK_NAME}_prometheus_admin_password_hashed_${SECRET_PROMETHEUS_ADMIN_PASSWORD_HASHED_VERSION}
+  prometheus_admin_password:
+    external: true
+    name: ${STACK_NAME}_prometheus_admin_password_${SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION}
+  alertmanager_smtp_password:
+    external: true
+    name: ${STACK_NAME}_alertmanager_smtp_password_${SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION}
+  loki_admin_password_hashed:
+    external: true
+    name: ${STACK_NAME}_loki_admin_password_hashed_${SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION}
diff --git a/grafana_custom.ini b/grafana_custom.ini
new file mode 100644
index 0000000..cd1c546
--- /dev/null
+++ b/grafana_custom.ini
@@ -0,0 +1,31 @@
+[analytics]
+reporting_enabled = false
+
+[snapshots]
+external_enabled = false
+
+[users]
+; NOTE(review): auto-assigned users receive the Admin org role - confirm this is intended
+auto_assign_org_role = Admin
+
+[auth]
+disable_login_form = true
+
+[auth.generic_oauth]
+enabled = true
+scopes = openid email profile
+name = id.autonomic.zone
+icon = signin
+tls_skip_verify_insecure = false
+allow_sign_up = true
+client_id = grafana
+client_secret = {{ secret "grafana_oauth_client_secret" }}
+auth_url = {{ env "KEYCLOAK_AUTH_URL" }}
+token_url = {{ env "KEYCLOAK_TOKEN_URL" }}
+api_url = {{ env "KEYCLOAK_API_URL" }}
+
+[auth.basic]
+enabled = false
+
+[plugins]
+enable_alpha = true
diff --git a/loki.htpasswd.tmpl b/loki.htpasswd.tmpl
new file mode 100644
index 0000000..74f33cc
--- /dev/null
+++ b/loki.htpasswd.tmpl
@@ -0,0 +1 @@
+loki:{{ secret "loki_admin_password_hashed" }}
diff --git a/loki.yml.tmpl b/loki.yml.tmpl
new file mode 100644
index 0000000..1f5d1f9
--- /dev/null
+++ b/loki.yml.tmpl
@@ -0,0 +1,78 @@
+# Loki configuration: boltdb-shipper index with an S3-compatible (aws) object store.
+auth_enabled: false
+
+ruler:
+  storage:
+    type: local
+    local:
+      directory: /loki/rules
+  rule_path: /loki/scratch
+  alertmanager_url: http://alertmanager:9093
+  enable_api: true
+  enable_alertmanager_v2: true
+  ring:
+    kvstore:
+      store: inmemory
+
+server:
+  http_listen_port: 3100
+
+distributor:
+  ring:
+    kvstore:
+      store: memberlist
+
+ingester:
+  lifecycler:
+    ring:
+      kvstore:
+        store: memberlist
+      replication_factor: 1
+    final_sleep: 0s
+  chunk_idle_period: 5m
+  chunk_retain_period: 30s
+
+memberlist:
+  abort_if_cluster_join_fails: false
+  bind_port: 7946
+  join_members:
+    - {{ env "STACK_NAME" }}_loki:7946
+  max_join_backoff: 1m
+  max_join_retries: 10
+  min_join_backoff: 1s
+
+schema_config:
+  configs:
+    - from: 2020-11-25
+      store: boltdb-shipper
+      object_store: aws
+      schema: v11
+      index:
+        prefix: index_
+        period: 24h
+
+storage_config:
+  boltdb_shipper:
+    active_index_directory: /loki/index
+    cache_location: /loki/index_cache
+    resync_interval: 5s
+    shared_store: aws
+
+  aws:
+    endpoint: {{ env "LOKI_AWS_ENDPOINT" }}
+    region: {{ env "LOKI_AWS_REGION" }}
+    access_key_id: {{ env "LOKI_ACCESS_KEY_ID" }}
+    secret_access_key: {{ secret "loki_aws_secret_access_key" }}
+    bucketnames: {{ env "LOKI_BUCKET_NAMES" }}
+    insecure: false
+    sse_encryption: false
+    http_config:
+      idle_conn_timeout: 90s
+      response_header_timeout: 0s
+      insecure_skip_verify: false
+    s3forcepathstyle: true
+
+limits_config:
+  enforce_metric_name: false
+  reject_old_samples: true
+  reject_old_samples_max_age: 168h
diff --git a/nginx.conf.tmpl b/nginx.conf.tmpl
new file mode 100644
index 0000000..b8af0ba
--- /dev/null
+++ b/nginx.conf.tmpl
@@ -0,0 +1,46 @@
+# Reverse proxy that puts HTTP basic auth in front of Loki
+user www-data;
+
+events {
+  worker_connections 768;
+}
+
+http {
+  include /etc/nginx/mime.types;
+
+  # Upstream Connection value: "upgrade" by default, "close" when the client sent no Upgrade header
+  map $http_upgrade $connection_upgrade {
+    default upgrade;
+    '' close;
+  }
+
+  server {
+    listen 80;
+    server_name {{ env "LOKI_DOMAIN" }};
+
+    auth_basic "loki";
+    auth_basic_user_file /etc/nginx/conf.d/loki.htpasswd;
+
+    location / {
+      proxy_read_timeout 1800s;
+      proxy_connect_timeout 1600s;
+      proxy_pass http://{{ env "STACK_NAME" }}_loki:3100;
+      proxy_http_version 1.1;
+      proxy_set_header Upgrade $http_upgrade;
+      # Connection is set once, from the map above, so no conflicting second value is sent
+      proxy_set_header Connection $connection_upgrade;
+      proxy_set_header Proxy-Connection "Keep-Alive";
+      proxy_redirect off;
+    }
+
+    location /ready {
+      # Served without credentials: auth_basic is switched off for this location
+      proxy_pass http://{{ env "STACK_NAME" }}_loki:3100;
+      proxy_http_version 1.1;
+      proxy_set_header Connection "Keep-Alive";
+      proxy_set_header Proxy-Connection "Keep-Alive";
+      proxy_redirect off;
+      auth_basic "off";
+    }
+  }
+}
diff --git a/prometheus.yml.tmpl b/prometheus.yml.tmpl
new file mode 100644
index 0000000..03656f2
--- /dev/null
+++ b/prometheus.yml.tmpl
@@ -0,0 +1,16 @@
+global:
+  scrape_interval: 30s
+  evaluation_interval: 30s
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
+# Scrape targets are read from the files under /prometheus/scrape_configs
+scrape_configs:
+  - job_name: "default"
+    file_sd_configs:
+      - files:
+          - /prometheus/scrape_configs/*.yml
diff --git a/prometheus_web.yml.tmpl b/prometheus_web.yml.tmpl
new file mode 100644
index 0000000..38c005a
--- /dev/null
+++ b/prometheus_web.yml.tmpl
@@ -0,0 +1,2 @@
+basic_auth_users:
+  admin: {{ secret "prometheus_admin_password_hashed" }}