diff --git a/.env.sample b/.env.sample index 90a53c9..0669f33 100644 --- a/.env.sample +++ b/.env.sample @@ -11,10 +11,9 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml" # COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml" # LOKI_PUSH_URL=https://l.monitor.autonomic.zone/loki/api/v1/push -# Prometheus, Alertmanager +## Prometheus, Alertmanager # # COMPOSE_FILE="$COMPOSE_FILE:compose.prometheus.yml" -# PROMETHEUS_DOMAIN=prometheus.example.com # PROMETHEUS_YML_VERSION=v1 # SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION=v1 # ALERTMANAGER_CONFIG_VERSION=v1 @@ -23,12 +22,27 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml" # ALERTMANAGER_SMTP_TO=kaboom@autonomic.zone # SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION=v1 +## Loki +# Loki Server +# +# COMPOSE_FILE="$COMPOSE_FILE:compose.loki.yml" +# LOKI_STORAGE_FILESYSTEM=1 +# +## S3 Storage +# LOKI_STORAGE_S3=1 +# LOKI_AWS_ENDPOINT=https://minio.autonomic.zone +# LOKI_AWS_REGION=eu-west-1 +# LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule +# LOKI_BUCKET_NAMES=loki +# SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1 +# + + # Grafana # # COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml" -# GRAFANA_DOMAIN=grafana.example.com # GRAFANA_CUSTOM_INI_VERSION=v3 -# GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN} +# GF_SERVER_ROOT_URL=https://${DOMAIN} # SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1 # ## Single-Sign-On with OIDC @@ -48,15 +62,3 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml" # GF_SMTP_ENABLED=1 # GF_SMTP_FROM_ADDRESS=grafana@example.com # GF_SMTP_SKIP_VERIFY=1 - -# Loki Server -# -# COMPOSE_FILE="$COMPOSE_FILE:compose.loki.yml" -# LOKI_DOMAIN=loki.example.com -# LOKI_AWS_ENDPOINT=https://minio.autonomic.zone -# LOKI_AWS_REGION=eu-west-1 -# LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule -# LOKI_BUCKET_NAMES=loki -# LOKI_YML_VERSION=v7 -# SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1 -# SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION=v1 \ No newline at end of file diff --git a/README.md b/README.md 
index 46f2665..bf6039a 100644 --- a/README.md +++ b/README.md @@ -4,27 +4,47 @@ A centralised grafana/prometheus/loki stack. This an alternative approach to [`c -* **Category**: Apps -* **Status**: 2, beta -* **Image**: [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana), 4, upstream -* **Healthcheck**: 3 -* **Backups**: 1 -* **Email**: 3 -* **Tests**: No -* **SSO**: 1 +- **Category**: Apps +- **Status**: 2, beta +- **Image**: [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana), 4, upstream +- **Healthcheck**: 3 +- **Backups**: 1 +- **Email**: 3 +- **Tests**: No +- **SSO**: 1 ## Setup -This stack requires 3 domains, one for grafana, prometheus & loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicy accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenId Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assume that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite. +1. Configure Traefik to use BasicAuth + Generate a users list with an htpasswd-hashed password + +### + +1. Insert secrets for prometheus +1. add scrape config (see example) + and run abra app cp to copy it +1. 
grafana sso secret + +| | | | +| ------------- | ------------------ | --------------------------------- | +| Grafana | Email / SSO | monitoring.example.org | +| Prometheus | traefik basic-auth | prometheus.monitoring.example.org | +| loki | traefik basic-auth | loki.monitoring.example.org | +| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org | +| Node Exporter | traefik basic-auth | node.monitoring.example.org | + +This stack requires 3 domains, one each for grafana, prometheus and loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenId Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite. ## Post-setup guide - configure prometheus/loki/alertmanager as data sources in grafana under `Configuration > Data sources` + - for loki, you need to set a "Custom HTTP Header": `X-Scope-OrgID: fake` - configure the SMTP mailer under `Alerting > Contact points` + - edit the default contact point, choose "Alertmanager" as type & `http://alertmanager:9093` as URL - use the "Test" button to send a test mail. It should fire a request at the alertmanager & that should send a mail @@ -33,3 +53,5 @@ This stack requires 3 domains, one for grafana, prometheus & loki. 
This is due t - load your dashboards in manually under `Create > Dashboard` - from your dashboard panels, choose `Edit > Alert` to create alerts based on those panels + +THX to the previous work of @decentral1se @knooflok @3wc @cellarspoon @mirsal diff --git a/abra.sh b/abra.sh index 2d0a4fe..a2b38db 100644 --- a/abra.sh +++ b/abra.sh @@ -1,9 +1,8 @@ -export PROMTAIL_YML_VERSION=v1 export NODE_EXPORTER_ENTRYPOINT_VERSION=v1 -export NGINX_CONFIG_VERSION=v1 -export HTPASSWD_CONFIG_VERSION=v1 export GRAFANA_DATASOURCES_YML_VERSION=v1 export GRAFANA_DASHBOARDS_YML_VERSION=v1 export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v1 export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v1 export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v1 +export PROMTAIL_YML_VERSION=v1 +export LOKI_YML_VERSION=v9 \ No newline at end of file diff --git a/compose.grafana.yml b/compose.grafana.yml index b0a854f..138fa6e 100644 --- a/compose.grafana.yml +++ b/compose.grafana.yml @@ -40,7 +40,7 @@ services: labels: - "traefik.enable=true" - "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000" - - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${DOMAIN}`)" - "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}-grafana.tls=true" - "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}" diff --git a/compose.loki.yml b/compose.loki.yml index 42d2fe3..24221f5 100644 --- a/compose.loki.yml +++ b/compose.loki.yml @@ -11,14 +11,26 @@ services: target: /etc/loki/local-config.yaml volumes: - loki-data:/loki - secrets: - - loki_aws_secret_access_key + # secrets: + # - loki_aws_secret_access_key environment: - LOKI_ACCESS_KEY_ID - LOKI_AWS_ENDPOINT - LOKI_AWS_REGION - LOKI_BUCKET_NAMES - STACK_NAME + deploy: + restart_policy: + condition: on-failure + labels: + - "traefik.enable=true" + - 
"traefik.http.services.${STACK_NAME}-loki.loadbalancer.server.port=3100" # Loki's default HTTP port; confirm against loki.yml.tmpl + - "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure" + - "traefik.http.routers.${STACK_NAME}-loki.tls=true" + - "traefik.http.routers.${STACK_NAME}-loki.tls.certresolver=${LETS_ENCRYPT_ENV}" + - "traefik.http.routers.${STACK_NAME}-loki.middlewares=basicauth@file" + configs: loki_yml: @@ -30,10 +42,7 @@ configs: volumes: loki-data: -secrets: - loki_aws_secret_access_key: - external: true - name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION} - loki_admin_password_hashed: - external: true - name: ${STACK_NAME}_loki_admin_password_hashed_${SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION} +# secrets: +# loki_aws_secret_access_key: +# external: true +# name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION} \ No newline at end of file diff --git a/compose.metrics.yml b/compose.metrics.yml index 6d4fa93..9020987 100644 --- a/compose.metrics.yml +++ b/compose.metrics.yml @@ -39,7 +39,13 @@ services: cadvisor: image: gcr.io/cadvisor/cadvisor:v0.47.0 - command: -logtostderr -docker_only + command: + - "-logtostderr" + - "-docker_only" + - "--enable_metrics=cpu,cpuLoad,disk,memory,network" + # all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp. 
+ - "--housekeeping_interval=30s" + volumes: - /var/lib/docker/:/var/lib/docker:ro - /dev/disk/:/dev/disk:ro @@ -60,6 +66,12 @@ services: - "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true" - "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}" - "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file" + healthcheck: + test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1 + interval: 15s + timeout: 15s + retries: 5 + start_period: 30s configs: node_exporter_entrypoint_sh: diff --git a/compose.prometheus.yml b/compose.prometheus.yml index 1567160..9dd5402 100644 --- a/compose.prometheus.yml +++ b/compose.prometheus.yml @@ -24,7 +24,7 @@ services: labels: - "traefik.enable=true" - "traefik.http.services.${STACK_NAME}_prometheus.loadbalancer.server.port=9090" - - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)" - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure" - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true" - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}" diff --git a/loki.yml.tmpl b/loki.yml.tmpl index 1f5d1f9..360d988 100644 --- a/loki.yml.tmpl +++ b/loki.yml.tmpl @@ -40,6 +40,25 @@ memberlist: max_join_retries: 10 min_join_backoff: 1s +{{ if eq (env "LOKI_STORAGE_FILESYSTEM") "1" }} +schema_config: + configs: + - from: 2020-05-15 + store: boltdb + object_store: filesystem + schema: v11 + index: + prefix: index_ + period: 168h + +storage_config: + boltdb: + directory: /loki/index + + filesystem: + directory: /loki/chunks +{{ end }} +{{ if eq (env "LOKI_STORAGE_S3") "1" }} schema_config: configs: - from: 2020-11-25 @@ -70,6 +89,8 @@ storage_config: response_header_timeout: 0s insecure_skip_verify: false s3forcepathstyle: true +{{ end }} + limits_config: enforce_metric_name: false