wip loki stuff

This commit is contained in:
Philipp Rothmann 2023-02-13 16:10:33 +01:00
parent 15cd881356
commit cc8a0b2905
11 changed files with 139 additions and 39 deletions

View File

@ -6,22 +6,21 @@ DOMAIN=monitoring.example.com
# Gathering Metrics (Node Exporter, Cadvisor)
COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"
SECRET_BASIC_AUTH_ADMIN_PASSWORD_VERSION=v1
# Gathering Logs (Promtail)
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
# LOKI_PUSH_URL=https://l.monitor.autonomic.zone/loki/api/v1/push
# LOKI_PUSH_URL=https://loki.monitoring.example.org/loki/api/v1/push
## Prometheus, Alertmanager
#
# COMPOSE_FILE="$COMPOSE_FILE:compose.prometheus.yml"
# PROMETHEUS_YML_VERSION=v1
# SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION=v1
# ALERTMANAGER_CONFIG_VERSION=v1
# ALERTMANAGER_SMTP_FROM=noreply@autonomic.zone
# ALERTMANAGER_SMTP_HOST=mail.gandi.net:587
# ALERTMANAGER_SMTP_TO=kaboom@autonomic.zone
# SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION=v1
## Loki
# Loki Server
#
@ -35,13 +34,11 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"
# LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
# LOKI_BUCKET_NAMES=loki
# SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1
#
# Grafana
## Grafana
#
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
# GRAFANA_CUSTOM_INI_VERSION=v3
# GF_SERVER_ROOT_URL=https://${DOMAIN}
# SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
#

View File

@ -15,10 +15,54 @@ A centralised grafana/prometheus/loki stack. This an alternative approach to [`c
<!-- endmetadata -->
## Setup
## Set up Metrics Gathering
Where gathering.org is the node you want to gather metrics from.
1. Configure DNS
* monitoring.gathering.org
* cadvisor.monitoring.gathering.org
* node.monitoring.gathering.org
1. Configure Traefik to use BasicAuth
Generate a userslist with an htpasswd-hashed password
* `abra app config traefik.gathering.org`
uncomment
```
# BASIC_AUTH
COMPOSE_FILE="$COMPOSE_FILE:compose.basicauth.yml"
BASIC_AUTH=1
SECRET_USERSFILE_VERSION=v1
```
* Generate a userslist with an htpasswd-hashed password
`abra app secret insert traefik.gathering.org userslist v1 'admin:hashed-secret'`
Make sure there is no whitespace anywhere in `admin:hashed-secret` (in particular around the colon); it seems to break things...
* `abra app deploy traefik` (might need to undeploy before)
1. `abra app new monitoring-ng`
1. `abra app config monitoring.gathering.org`
For gathering only, this is all that is required:
`COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"`
1. `abra app deploy monitoring.gathering.org`
1. check that endpoints are up and basic-auth works
* cadvisor.monitoring.gathering.org
* node.monitoring.gathering.org
## Setup Metrics Browser
1. Configure DNS
* monitoring.example.org
* loki.monitoring.example.org
* prometheus.monitoring.example.org
```
cp scrape-config.example.yml gathering.org.yml
# adjust domain
# mkdir scrape_configs
abra app cp monitoring.dev.local-it.cloud gathering.org.yml prometheus:/prometheus/scrape_configs/
```
* check that all configured targets are up:
https://prometheus.monitoring.example.org/targets
###
@ -35,6 +79,29 @@ A centralised grafana/prometheus/loki stack. This an alternative approach to [`c
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
### TODO
* compose.metrics.yml -> compose.yml
* [ ] Loki
* [x]
* [ ] s3 aws secret?
* [ ] Promtail
* [ ] Loki -> Grafana Datasource
* prometheus retention!
* traefik metrics
* [uptime-kuma](https://github.com/louislam/uptime-kuma/wiki/Prometheus-Integration), [dashboard](https://grafana.com/grafana/dashboards/14847-uptime-kuma/)
* authentik metrics?
* cool alerts
* note: all gathering nodes will have the same htpasswd basic-auth secret ...
-> this could be a use case to actually use docker swarm ...
could use swarm_service_discovery then in prometheus
-> multiple scrape_configs in prometheus
service
-> oauth / header? prometheus could do it, does promtail? does traefik?
This stack requires 3 domains, one each for grafana, prometheus and loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenID Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.
## Post-setup guide

View File

@ -4,5 +4,8 @@ export GRAFANA_DASHBOARDS_YML_VERSION=v1
export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v1
export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v1
export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v1
export GRAFANA_CUSTOM_INI_VERSION=v1
export PROMTAIL_YML_VERSION=v1
export LOKI_YML_VERSION=v9
export LOKI_YML_VERSION=v1
export PROMETHEUS_YML_VERSION=v1
export ALERTMANAGER_CONFIG_VERSION=v1

View File

@ -3,9 +3,10 @@ version: '3.8'
services:
loki:
image: grafana/loki:2.0.0
# entrypoint: 'tail -f /dev/null'
command: -config.file=/etc/loki/local-config.yaml
networks:
- internal
- proxy
configs:
- source: loki_yml
target: /etc/loki/local-config.yaml
@ -19,17 +20,19 @@ services:
- LOKI_AWS_REGION
- LOKI_BUCKET_NAMES
- STACK_NAME
- LOKI_STORAGE_FILESYSTEM
- LOKI_STORAGE_S3
deploy:
restart_policy:
condition: on-failure
labels:
- "traefik.enable=true"
- "traefik.http.services.${STACK_NAME}_prometheus.loadbalancer.server.port=9090"
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`loki.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-prometheus.middlewares=basicauth@file"
# Traefik routing for Loki: expose it at loki.${DOMAIN} over TLS behind the
# shared basic-auth middleware.
# The backend port is 3100, matching the URL the Grafana datasource in this
# stack uses to reach Loki (http://loki:3100); 9090 was carried over from the
# Prometheus labels.
- "traefik.http.services.${STACK_NAME}_loki.loadbalancer.server.port=3100"
- "traefik.http.routers.${STACK_NAME}-loki.rule=Host(`loki.${DOMAIN}`)"
- "traefik.http.routers.${STACK_NAME}-loki.entrypoints=web-secure"
- "traefik.http.routers.${STACK_NAME}-loki.tls=true"
- "traefik.http.routers.${STACK_NAME}-loki.tls.certresolver=${LETS_ENCRYPT_ENV}"
- "traefik.http.routers.${STACK_NAME}-loki.middlewares=basicauth@file"
configs:

View File

@ -44,7 +44,7 @@ services:
- "-docker_only"
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
- "--housekeeping_interval=30s"
- "--housekeeping_interval=60s"
volumes:
- /var/lib/docker/:/var/lib/docker:ro

View File

@ -4,7 +4,7 @@ services:
prometheus:
image: prom/prometheus:v2.34.0
secrets:
- prometheus_admin_password
- basic_auth_admin_password
volumes:
- prometheus-data:/prometheus:rw
configs:
@ -66,9 +66,6 @@ volumes:
alertmanager-data:
secrets:
prometheus_admin_password:
external: true
name: ${STACK_NAME}_prometheus_admin_password_${SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION}
alertmanager_smtp_password:
external: true
name: ${STACK_NAME}_alertmanager_smtp_password_${SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION}

View File

@ -2,7 +2,7 @@ version: "3.8"
services:
promtail:
image: grafana/promtail:2.0.0
image: grafana/promtail:2.7.3
volumes:
- /var/log:/var/log:ro
- /var/lib/docker/containers:/var/lib/docker/containers:ro
@ -13,17 +13,12 @@ services:
networks:
- internal
secrets:
- loki_admin_password
- basic_auth_admin_password
environment:
- LOKI_PUSH_URL
configs:
promtail_yml:
name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION}
file: promtail.yml.tmpl
template_driver: golang
secrets:
loki_admin_password:
external: true
name: ${STACK_NAME}_loki_admin_password_${SECRET_LOKI_ADMIN_PASSWORD_VERSION}
template_driver: golang

View File

@ -9,4 +9,9 @@ services:
networks:
proxy:
external: true
internal:
internal:
secrets:
basic_auth_admin_password:
external: true
name: ${STACK_NAME}_basic_auth_admin_password_${SECRET_BASIC_AUTH_ADMIN_PASSWORD_VERSION}

View File

@ -8,3 +8,10 @@ datasources:
url: http://prometheus:9090
isDefault: true
editable: false
- name: Loki
type: loki
access: proxy
orgId: 1
url: http://loki:3100
isDefault: false
editable: false

View File

@ -10,11 +10,11 @@ alerting:
scrape_configs:
- job_name: "default"
scrape_interval: 10s
scrape_interval: 30s
metrics_path: "/metrics"
file_sd_configs:
- files:
- /prometheus/scrape_configs/*.yml
basic_auth:
username: admin
password: {{ secret "prometheus_admin_password" }}
password: {{ secret "basic_auth_admin_password" }}

View File

@ -8,8 +8,8 @@ positions:
clients:
- url: {{ env "LOKI_PUSH_URL" }}
basic_auth:
username: loki
password: {{ secret "loki_admin_password" }}
username: admin
password: {{ secret "basic_auth_admin_password" }}
scrape_configs:
- job_name: system
@ -25,5 +25,31 @@ scrape_configs:
- targets:
- localhost
labels:
job: containers
job: containerlogs
__path__: /var/lib/docker/containers/*/*log
pipeline_stages:
- json:
expressions:
output: log
stream: stream
attrs:
- json:
expressions:
tag:
source: attrs
- regex:
expression: (?P<image_name>(?:[^|]*[^|])).(?P<container_name>(?:[^|]*[^|])).(?P<image_id>(?:[^|]*[^|])).(?P<container_id>(?:[^|]*[^|]))
source: tag
- timestamp:
format: RFC3339Nano
source: time
- labels:
tag:
stream:
image_name:
container_name:
image_id:
container_id:
- output:
source: output