wip loki stuff
This commit is contained in:
parent
7fa22a1350
commit
15cd881356
34
.env.sample
34
.env.sample
|
@ -11,10 +11,9 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"
|
|||
# COMPOSE_FILE="$COMPOSE_FILE:compose.promtail.yml"
|
||||
# LOKI_PUSH_URL=https://l.monitor.autonomic.zone/loki/api/v1/push
|
||||
|
||||
# Prometheus, Alertmanager
|
||||
## Prometheus, Alertmanager
|
||||
#
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.prometheus.yml"
|
||||
# PROMETHEUS_DOMAIN=prometheus.example.com
|
||||
# PROMETHEUS_YML_VERSION=v1
|
||||
# SECRET_PROMETHEUS_ADMIN_PASSWORD_VERSION=v1
|
||||
# ALERTMANAGER_CONFIG_VERSION=v1
|
||||
|
@ -23,12 +22,27 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"
|
|||
# ALERTMANAGER_SMTP_TO=kaboom@autonomic.zone
|
||||
# SECRET_ALERTMANAGER_SMTP_PASSWORD_VERSION=v1
|
||||
|
||||
## Loki
|
||||
# Loki Server
|
||||
#
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.loki.yml"
|
||||
# LOKI_STORAGE_FILESYSTEM=1
|
||||
#
|
||||
## S3 Storage
|
||||
# LOKI_STORAGE_S3=1
|
||||
# LOKI_AWS_ENDPOINT=https://minio.autonomic.zone
|
||||
# LOKI_AWS_REGION=eu-west-1
|
||||
# LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
|
||||
# LOKI_BUCKET_NAMES=loki
|
||||
# SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1
|
||||
#
|
||||
|
||||
|
||||
# Grafana
|
||||
#
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.grafana.yml"
|
||||
# GRAFANA_DOMAIN=grafana.example.com
|
||||
# GRAFANA_CUSTOM_INI_VERSION=v3
|
||||
# GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN}
|
||||
# GF_SERVER_ROOT_URL=https://${DOMAIN}
|
||||
# SECRET_GRAFANA_ADMIN_PASSWORD_VERSION=v1
|
||||
#
|
||||
## Single-Sign-On with OIDC
|
||||
|
@ -48,15 +62,3 @@ COMPOSE_FILE="$COMPOSE_FILE:compose.metrics.yml"
|
|||
# GF_SMTP_ENABLED=1
|
||||
# GF_SMTP_FROM_ADDRESS=grafana@example.com
|
||||
# GF_SMTP_SKIP_VERIFY=1
|
||||
|
||||
# Loki Server
|
||||
#
|
||||
# COMPOSE_FILE="$COMPOSE_FILE:compose.loki.yml"
|
||||
# LOKI_DOMAIN=loki.example.com
|
||||
# LOKI_AWS_ENDPOINT=https://minio.autonomic.zone
|
||||
# LOKI_AWS_REGION=eu-west-1
|
||||
# LOKI_ACCESS_KEY_ID=bush-debrief-approval-robust-scraggly-molecule
|
||||
# LOKI_BUCKET_NAMES=loki
|
||||
# LOKI_YML_VERSION=v7
|
||||
# SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION=v1
|
||||
# SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION=v1
|
40
README.md
40
README.md
|
@ -4,27 +4,47 @@ A centralised grafana/prometheus/loki stack. This an alternative approach to [`c
|
|||
|
||||
<!-- metadata -->
|
||||
|
||||
* **Category**: Apps
|
||||
* **Status**: 2, beta
|
||||
* **Image**: [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana), 4, upstream
|
||||
* **Healthcheck**: 3
|
||||
* **Backups**: 1
|
||||
* **Email**: 3
|
||||
* **Tests**: No
|
||||
* **SSO**: 1
|
||||
- **Category**: Apps
|
||||
- **Status**: 2, beta
|
||||
- **Image**: [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana), 4, upstream
|
||||
- **Healthcheck**: 3
|
||||
- **Backups**: 1
|
||||
- **Email**: 3
|
||||
- **Tests**: No
|
||||
- **SSO**: 1
|
||||
|
||||
<!-- endmetadata -->
|
||||
|
||||
## Setup
|
||||
|
||||
This stack requires 3 domains, one each for grafana, prometheus & loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenID Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.
|
||||
1. Configure Traefik to use BasicAuth
|
||||
Generate a users list with `htpasswd`-hashed passwords
|
||||
|
||||
###
|
||||
|
||||
1. Insert secrets for prometheus
|
||||
1. add scrape config (see example)
|
||||
and run `abra app cp` to copy it
|
||||
1. grafana sso secret
|
||||
|
||||
| | | |
|
||||
| ------------- | ------------------ | --------------------------------- |
|
||||
| Grafana | Email / SSO | monitoring.example.org |
|
||||
| Prometheus | traefik basic-auth | prometheus.monitoring.example.org |
|
||||
| loki | traefik basic-auth | loki.monitoring.example.org |
|
||||
| Cadvisor | traefik basic-auth | cadvisor.monitoring.example.org |
|
||||
| Node Exporter | traefik basic-auth | node.monitoring.example.org |
|
||||
|
||||
This stack requires 3 domains, one each for grafana, prometheus, and loki. This is due to the need for the gathering tools, such as node_exporter, to have a publicly accessible URL for making connections. We make use of the internal prometheus HTTP basic auth & wire up an Nginx proxy with HTTP basic auth for loki. Grafana uses Keycloak OpenID Connect sign in. The alertmanager setup remains internal and is only connected with grafana. It also assumes that you are deploying the [`coop-cloud/gathering`](https://git.coopcloud.tech/knoflook/gathering/) recipe on the machines that you want to gather metrics & logs from. Each instance of the gathering recipe will report back and/or be scraped by your central install of monitoring-lite.
|
||||
|
||||
## Post-setup guide
|
||||
|
||||
- configure prometheus/loki/alertmanager as data sources in grafana under `Configuration > Data sources`
|
||||
|
||||
- for loki, you need to set a "Custom HTTP Header": `X-Scope-OrgID: fake`
|
||||
|
||||
- configure the SMTP mailer under `Alerting > Contact points`
|
||||
|
||||
- edit the default contact point, choose "Alertmanager" as type & `http://alertmanager:9093` as URL
|
||||
- use the "Test" button to send a test mail. It should fire a request at the alertmanager, which should then send the mail
|
||||
|
||||
|
@ -33,3 +53,5 @@ This stack requires 3 domains, one for grafana, prometheus & loki. This is due t
|
|||
- load your dashboards in manually under `Create > Dashboard`
|
||||
|
||||
- from your dashboard panels, choose `Edit > Alert` to create alerts based on those panels
|
||||
|
||||
Thanks to the previous work of @decentral1se @knoflook @3wc @cellarspoon @mirsal
|
||||
|
|
5
abra.sh
5
abra.sh
|
@ -1,9 +1,8 @@
|
|||
export PROMTAIL_YML_VERSION=v1
|
||||
export NODE_EXPORTER_ENTRYPOINT_VERSION=v1
|
||||
export NGINX_CONFIG_VERSION=v1
|
||||
export HTPASSWD_CONFIG_VERSION=v1
|
||||
export GRAFANA_DATASOURCES_YML_VERSION=v1
|
||||
export GRAFANA_DASHBOARDS_YML_VERSION=v1
|
||||
export GRAFANA_SWARM_DASHBOARD_JSON_VERSION=v1
|
||||
export GRAFANA_STACKS_DASHBOARD_JSON_VERSION=v1
|
||||
export GRAFANA_TRAEFIK_DASHBOARD_JSON_VERSION=v1
|
||||
export PROMTAIL_YML_VERSION=v1
|
||||
export LOKI_YML_VERSION=v9
|
|
@ -40,7 +40,7 @@ services:
|
|||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
|
|
|
@ -11,14 +11,26 @@ services:
|
|||
target: /etc/loki/local-config.yaml
|
||||
volumes:
|
||||
- loki-data:/loki
|
||||
secrets:
|
||||
- loki_aws_secret_access_key
|
||||
# secrets:
|
||||
# - loki_aws_secret_access_key
|
||||
environment:
|
||||
- LOKI_ACCESS_KEY_ID
|
||||
- LOKI_AWS_ENDPOINT
|
||||
- LOKI_AWS_REGION
|
||||
- LOKI_BUCKET_NAMES
|
||||
- STACK_NAME
|
||||
deploy:
|
||||
restart_policy:
|
||||
condition: on-failure
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.services.${STACK_NAME}_prometheus.loadbalancer.server.port=9090"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`loki.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.middlewares=basicauth@file"
|
||||
|
||||
|
||||
configs:
|
||||
loki_yml:
|
||||
|
@ -30,10 +42,7 @@ configs:
|
|||
volumes:
|
||||
loki-data:
|
||||
|
||||
secrets:
|
||||
loki_aws_secret_access_key:
|
||||
external: true
|
||||
name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
|
||||
loki_admin_password_hashed:
|
||||
external: true
|
||||
name: ${STACK_NAME}_loki_admin_password_hashed_${SECRET_LOKI_ADMIN_PASSWORD_HASHED_VERSION}
|
||||
# secrets:
|
||||
# loki_aws_secret_access_key:
|
||||
# external: true
|
||||
# name: ${STACK_NAME}_loki_aws_secret_access_key_${SECRET_LOKI_AWS_SECRET_ACCESS_KEY_VERSION}
|
|
@ -39,7 +39,13 @@ services:
|
|||
|
||||
cadvisor:
|
||||
image: gcr.io/cadvisor/cadvisor:v0.47.0
|
||||
command: -logtostderr -docker_only
|
||||
command:
|
||||
- "-logtostderr"
|
||||
- "-docker_only"
|
||||
- "--enable_metrics=cpu,cpuLoad,disk,memory,network"
|
||||
# all possible metrics: advtcp,app,cpu,cpuLoad,cpu_topology,cpuset,disk,diskIO,hugetlb,memory,memory_numa,network,oom_event,percpu,perf_event,process,referenced_memory,resctrl,sched,tcp,udp.
|
||||
- "--housekeeping_interval=30s"
|
||||
|
||||
volumes:
|
||||
- /var/lib/docker/:/var/lib/docker:ro
|
||||
- /dev/disk/:/dev/disk:ro
|
||||
|
@ -60,6 +66,12 @@ services:
|
|||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-cadvisor.middlewares=basicauth@file"
|
||||
healthcheck:
|
||||
test: wget --quiet --tries=1 --spider http://localhost:8080/healthz || exit 1
|
||||
interval: 15s
|
||||
timeout: 15s
|
||||
retries: 5
|
||||
start_period: 30s
|
||||
|
||||
configs:
|
||||
node_exporter_entrypoint_sh:
|
||||
|
|
|
@ -24,7 +24,7 @@ services:
|
|||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.services.${STACK_NAME}_prometheus.loadbalancer.server.port=9090"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`prometheus.${DOMAIN}`)"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
|
|
|
@ -40,6 +40,25 @@ memberlist:
|
|||
max_join_retries: 10
|
||||
min_join_backoff: 1s
|
||||
|
||||
{{ if eq (env "LOKI_STORAGE_FILESYSTEM") "1" }}
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-05-15
|
||||
store: boltdb
|
||||
object_store: filesystem
|
||||
schema: v11
|
||||
index:
|
||||
prefix: index_
|
||||
period: 168h
|
||||
|
||||
storage_config:
|
||||
boltdb:
|
||||
directory: /loki/index
|
||||
|
||||
filesystem:
|
||||
directory: /loki/chunks
|
||||
{{ end }}
|
||||
{{ if eq (env "LOKI_STORAGE_S3") "1" }}
|
||||
schema_config:
|
||||
configs:
|
||||
- from: 2020-11-25
|
||||
|
@ -70,6 +89,8 @@ storage_config:
|
|||
response_header_timeout: 0s
|
||||
insecure_skip_verify: false
|
||||
s3forcepathstyle: true
|
||||
{{ end }}
|
||||
|
||||
|
||||
limits_config:
|
||||
enforce_metric_name: false
|
||||
|
|
Loading…
Reference in New Issue