wip loki stuff

This commit is contained in:
Philipp Rothmann 2023-02-12 19:06:30 +01:00
parent 7fa22a1350
commit 15cd881356
8 changed files with 105 additions and 40 deletions

View File

@ -11,10 +11,9 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
# LOKI_PUSH_URL=https://l.monitor.autonomic.zone/loki/api/v1/push
# Prometheus, Alertmanager
## Prometheus, Alertmanager
#
# COMPOSE_FILE="$COMPOSE_FILE:compose.prometheus.yml"
# PROMETHEUS_DOMAIN=prometheus.example.com
# PROMETHEUS_YML_VERSION=v1
# SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION=v1
# ALERTMANAGER_CONFIG_VERSION=v1
@ -23,12 +22,27 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"
# ALERTMANAGER_SMTP_TO=kaboom@autonomic.zone
# SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION=v1
## Loki
# Loki Server
#
# COMPOSE_FILE="$COMPOSE_FILE:compose.loki.yml"
# LOKI_STORAGE_FILESYSTEM=1
#
## S3 Storage
# LOKI_STORAGE_S3=1
# LOKI_AWS_ENDPOINT=https://minio.autonomic.zone
# LOKI_AWS_REGION=eu-west-1
# LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
# LOKI_BUCKET_NAMES=loki
# SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1
#
# Grafana
#
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
# GRAFANA_DOMAIN=grafana.example.com
# GRAFANA_CUSTOM_INI_VERSION=v3
# GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
# GF_SERVER_ROOT_URL=https://${DOMAIN}
# SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
#
## Single-Sign-On with OIDC
@ -48,15 +62,3 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"
# GF_SMTP_ENABLED=1
# GF_SMTP_FROM_ADDRESS=grafana@example.com
# GF_SMTP_SKIP_VERIFY=1
# Loki Server
#
# COMPOSE_FILE="$COMPOSE_FILE:compose.loki.yml"
# LOKI_DOMAIN=loki.example.com
# LOKI_AWS_ENDPOINT=https://minio.autonomic.zone
# LOKI_AWS_REGION=eu-west-1
# LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
# LOKI_BUCKET_NAMES=loki
# LOKI_YML_VERSION=v7
# SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1
# SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION=v1

View File

@ -4,27 +4,47 @@ A centralised grafana/prometheus/loki stack. This an alternative approach to [`c
<!-- metadata -->
* **Category**: Apps
* **Status**: 2, beta
* **Image**: [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana), 4, upstream
* **Healthcheck**: 3
* **Backups**: 1
* **Email**: 3
* **Tests**: No
* **SSO**: 1
- **Category**: Apps
- **Status**: 2, beta
- **Image**: [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana), 4, upstream
- **Healthcheck**: 3
- **Backups**: 1
- **Email**: 3
- **Tests**: No
- **SSO**: 1
<!-- endmetadata -->
## Setup
This stack requires 3 domains: one each for grafana, prometheus & loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenID Connect sign-in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.
1. Configure Traefik to use BasicAuth
Generate a users list with an htpasswd-hashed password
###
1. Insert secrets for prometheus
1. Add a scrape config (see example)
and run `abra app cp` to copy it
1. grafana sso secret
| | | |
| ------------- | ------------------ | --------------------------------- |
| Grafana | Email / SSO | monitoring.example.org |
| Prometheus | traefik basic-auth | prometheus.monitoring.example.org |
| loki | traefik basic-auth | loki.monitoring.example.org |
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
This stack requires 3 domains: one each for grafana, prometheus and loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenID Connect sign-in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.
## Post-setup guide
- configure prometheus/loki/alertmanager as data sources in grafana under `Configuration > Data sources`
- for loki, you need to set a "Custom HTTP Header": `X-Scope-OrgID: fake`
- configure the SMTP mailer under `Alerting > Contact points`
- edit the default contact point, choose "Alertmanager" as type & `http://alertmanager:9093` as URL
- use the "Test" button to send a test mail. It should fire a request at the alertmanager & that should send a mail
@ -33,3 +53,5 @@ This stack requires 3 domains, one for grafana, prometheus & loki. This is due t
- load your dashboards in manually under `Create > Dashboard`
- from your dashboard panels, choose `Edit > Alert` to create alerts based on those panels
Thanks to the previous work of @decentral1se @knoflook @3wc @cellarspoon @mirsal

View File

@ -1,9 +1,8 @@
export PROMTAIL_YML_VERSION=v1
export NODE_EXPORTER_ENTRYPOINT_VERSION=v1
export NGINX_CONFIG_VERSION=v1
export HTPASSWD_CONFIG_VERSION=v1
export GRAFANA_DATASOURCES_YML_VERSION=v1
export GRAFANA_DASHBOARDS_YML_VERSION=v1
export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v1
export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v1
export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v1
export PROMTAIL_YML_VERSION=v1
export LOKI_YML_VERSION=v9

View File

@ -40,7 +40,7 @@ services:
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"

View File

@ -11,14 +11,26 @@ services:
target: /etc/loki/local-config.yaml
volumes:
- loki-data:/loki
secrets:
- loki_aws_secret_access_key
# secrets:
# - loki_aws_secret_access_key
environment:
- LOKI_ACCESS_KEY_ID
- LOKI_AWS_ENDPOINT
- LOKI_AWS_REGION
- LOKI_BUCKET_NAMES
- STACK_NAME
deploy:
restart_policy:
condition: on-failure
labels:
# Traefik routing for the Loki service.
# Router/service names must be unique per Traefik instance: the
# "${STACK_NAME}-prometheus" names are already used by the Prometheus
# service of this stack, so Loki gets its own "-loki" names.
- "traefik.enable=true"
# 3100 is Loki's default http_listen_port (9090 is Prometheus).
# NOTE(review): confirm against the `server` section of the loki config.
- "traefik.http.services.${STACK_NAME}-loki.loadbalancer.server.port=3100"
- "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-loki.tls=true"
- "traefik.http.routers.${STACK_NAME}-loki.tls.certresolver=${LETS_ENCRYPT_ENV}"
# Loki has no built-in auth here; access is gated by Traefik's basic-auth middleware.
- "traefik.http.routers.${STACK_NAME}-loki.middlewares=basicauth@file"
configs:
loki_yml:
@ -30,10 +42,7 @@ configs:
volumes:
loki-data:
secrets:
loki_aws_secret_access_key:
external: true
name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
loki_admin_password_hashed:
external: true
name: ${STACK_NAME}_loki_admin_password_hashed_${SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION}
# secrets:
# loki_aws_secret_access_key:
# external: true
# name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}

View File

@ -39,7 +39,13 @@ services:
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.0
command: -logtostderr -docker_only
command:
- "-logtostderr"
- "-docker_only"
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
- "--housekeeping_interval=30s"
volumes:
- /var/lib/docker/:/var/lib/docker:ro
- /dev/disk/:/dev/disk:ro
@ -60,6 +66,12 @@ services:
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
healthcheck:
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
interval: 15s
timeout: 15s
retries: 5
start_period: 30s
configs:
node_exporter_entrypoint_sh:

View File

@ -24,7 +24,7 @@ services:
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}_prometheus.loadbalancer.server.port=9090"
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"

View File

@ -40,6 +40,25 @@ memberlist:
max_join_retries: 10
min_join_backoff: 1s
{{ if eq (env "LOKI_STORAGE_FILESYSTEM") "1" }}
schema_config:
configs:
- from: 2020-05-15
store: boltdb
object_store: filesystem
schema: v11
index:
prefix: index_
period: 168h
storage_config:
boltdb:
directory: /loki/index
filesystem:
directory: /loki/chunks
{{ end }}
{{ if eq (env "LOKI_STORAGE_S3") "1" }}
schema_config:
configs:
- from: 2020-11-25
@ -70,6 +89,8 @@ storage_config:
response_header_timeout: 0s
insecure_skip_verify: false
s3forcepathstyle: true
{{ end }}
limits_config:
enforce_metric_name: false