From d5607c39cd04da21bb96dfd2123696da45ecf633 Mon Sep 17 00:00:00 2001 From: mirsal Date: Sat, 17 Jul 2021 20:25:30 +0000 Subject: [PATCH] Initial packaging of a prometheus / loki / grafana monitoring system This is a work in progress --- .env.sample | 14 +++- README.md | 18 ++-- abra.sh | 4 + compose.yml | 161 +++++++++++++++++++++++++++++++++--- loki.yml | 71 ++++++++++++++++ node-exporter-entrypoint.sh | 9 ++ prometheus.yml | 56 +++++++++++++ promtail.yml | 18 ++++ 8 files changed, 327 insertions(+), 24 deletions(-) create mode 100644 abra.sh create mode 100644 loki.yml create mode 100644 node-exporter-entrypoint.sh create mode 100644 prometheus.yml create mode 100644 promtail.yml diff --git a/.env.sample b/.env.sample index eba2f1e..7250be9 100644 --- a/.env.sample +++ b/.env.sample @@ -1,6 +1,14 @@ TYPE=monitoring -DOMAIN=monitoring.example.com -## Domain aliases -#EXTRA_DOMAINS=', `www.monitoring.example.com`' +GRAFANA_DOMAIN=grafana.example.com +PROMETHEUS_DOMAIN=prometheus.example.com + LETS_ENCRYPT_ENV=production + +#GF_SMTP_HOST +#GF_SMTP_ENABLED +#GF_SMTP_FROM_ADDRESS +#GF_SMTP_SKIP_VERIFY +#GF_SECURITY_ALLOW_EMBEDDING +#GF_INSTALL_PLUGINS=grafana-piechart-panel +#GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN} diff --git a/README.md b/README.md index 38d6c5f..6a99910 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,14 @@ A server and application monitoring stack based on Prometheus, Loki and Grafana. 
-* **Category**: -* **Status**: -* **Image**: [`monitoring`](https://hub.docker.com/r/monitoring/monitoring) -* **Healthcheck**: -* **Backups**: -* **Email**: -* **Tests**: -* **SSO**: +* **Category**: Monitoring +* **Status**: ❷💛 +* **Images**: [`prom/prometheus`](https://hub.docker.com/r/prom/prometheus) [`grafana/grafana`](https://hub.docker.com/r/grafana/grafana) [`grafana/loki`](https://hub.docker.com/r/grafana/loki) +* **Healthcheck**: ❶💚 +* **Backups**: ❌ +* **Email**: ❶💚 +* **Tests**: ❌ +* **SSO**: ❸🍎 ## Basic usage @@ -19,7 +19,7 @@ A server and application monitoring stack based on Prometheus, Loki and Grafana. 2. Deploy [`coop-cloud/traefik`] 3. `abra app new monitoring --secrets` (optionally with `--pass` if you'd like to save secrets in `pass`) -4. `abra app YOURAPPDOMAIN config` - be sure to change `DOMAIN` to something that resolves to +4. `abra app YOURAPPDOMAIN config` - be sure to change `GRAFANA_DOMAIN` and `PROMETHEUS_DOMAIN` to something that resolves to your Docker swarm box 5. `abra app YOURAPPDOMAIN deploy` 6. 
Open the configured domain in your browser to finish set-up diff --git a/abra.sh b/abra.sh new file mode 100644 index 0000000..227b7a9 --- /dev/null +++ b/abra.sh @@ -0,0 +1,4 @@ +export PROMETHEUS_YML_VERSION=v1 +export PROMTAIL_YML_VERSION=v1 +export LOKI_YML_VERSION=v1 +export NODE_EXPORTER_ENTRYPOINT_VERSION=v1 diff --git a/compose.yml b/compose.yml index 662a96c..1526199 100644 --- a/compose.yml +++ b/compose.yml @@ -2,30 +2,170 @@ version: "3.8" services: - app: - image: nginx:1.19.2 + node_exporter: + image: prom/node-exporter:v1.0.1 + user: root + environment: + - NODE_ID={{.Node.ID}} + volumes: + - /proc:/host/proc:ro + - /sys:/host/sys:ro + - /:/rootfs:ro + - /etc/hostname:/etc/nodename + command: + - '--path.sysfs=/host/sys' + - '--path.procfs=/host/proc' + - '--path.rootfs=/rootfs' + - '--collector.textfile.directory=/etc/node-exporter/' + - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)' + - '--no-collector.ipvs' + configs: + - source: node_exporter_entrypoint_sh + target: /entrypoint.sh + entrypoint: ['/bin/sh', '-e', '/entrypoint.sh'] networks: - - proxy + - exporters + deploy: + mode: global + endpoint_mode: dnsrr + + cadvisor: + image: google/cadvisor:latest + command: -logtostderr -docker_only + volumes: + - /var/lib/docker/:/var/lib/docker + - /dev/disk/:/dev/disk + - /sys:/sys + - /var/run:/var/run + - /:/rootfs + networks: + - exporters + deploy: + mode: global + endpoint_mode: dnsrr + + loki: + image: grafana/loki:2.0.0 + command: -config.file=/etc/loki/local-config.yaml + networks: + - loki + - api + deploy: + endpoint_mode: dnsrr + ports: + - target: 3100 + published: 3100 + protocol: tcp + mode: host + configs: + - source: loki_yml + target: /etc/loki/local-config.yaml + volumes: + - loki-data:/loki + + promtail: + image: grafana/promtail:2.0.0 + volumes: + - /var/log:/var/log + command: -config.file=/etc/promtail/config.yml + configs: + - source: promtail_yml + target: /etc/promtail/config.yml + networks: + - loki + deploy: + mode: global + endpoint_mode: dnsrr + + prometheus: +
image: prom/prometheus:latest + volumes: + - prometheus-data:/prometheus:rw + configs: + - source: prometheus_yml + target: /etc/prometheus/prometheus.yml + networks: + api: + aliases: + - prometheus_api + exporters: ~ + proxy: ~ deploy: restart_policy: condition: on-failure labels: - "traefik.enable=true" - - "traefik.http.services.${STACK_NAME}.loadbalancer.server.port=80" - - "traefik.http.routers.${STACK_NAME}.rule=Host(`${DOMAIN}`${EXTRA_DOMAINS})" - - "traefik.http.routers.${STACK_NAME}.entrypoints=web-secure" - - "traefik.http.routers.${STACK_NAME}.tls.certresolver=${LETS_ENCRYPT_ENV}" - ## Redirect from EXTRA_DOMAINS to DOMAIN - #- "traefik.http.routers.${STACK_NAME}.middlewares=${STACK_NAME}-redirect" - #- "traefik.http.middlewares.${STACK_NAME}-redirect.headers.SSLForceHost=true" - #- "traefik.http.middlewares.${STACK_NAME}-redirect.headers.SSLHost=${DOMAIN}" + - "traefik.http.services.${STACK_NAME}-prometheus.loadbalancer.server.port=9090" + - "traefik.http.routers.${STACK_NAME}-prometheus.rule=Host(`${PROMETHEUS_DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure" + - "traefik.http.routers.${STACK_NAME}-prometheus.tls=true" + - "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}" + - "traefik.http.middlewares.${STACK_NAME}-http-to-https.redirectscheme.scheme=https" + - "traefik.http.middlewares.${STACK_NAME}-http-to-https.redirectscheme.permanent=true" + - "traefik.http.routers.${STACK_NAME}-prometheus-redirect.rule=Host(`${PROMETHEUS_DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-prometheus-redirect.middlewares=${STACK_NAME}-http-to-https@docker" healthcheck: - test: ["CMD", "curl", "-f", "http://localhost"] + test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"] interval: 30s timeout: 10s retries: 10 start_period: 1m + grafana: + image: grafana/grafana + volumes: + - grafana-data:/var/lib/grafana:rw + networks: + - api + - proxy + environment: + - GF_SMTP_HOST + - GF_SMTP_ENABLED + -
GF_SMTP_FROM_ADDRESS + - GF_SMTP_SKIP_VERIFY + - GF_SECURITY_ALLOW_EMBEDDING + - GF_INSTALL_PLUGINS=grafana-piechart-panel + - GF_SERVER_ROOT_URL=https://${GRAFANA_DOMAIN} + deploy: + labels: + - "traefik.enable=true" + - "traefik.http.services.${STACK_NAME}-grafana.loadbalancer.server.port=3000" + - "traefik.http.routers.${STACK_NAME}-grafana.rule=Host(`${GRAFANA_DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-grafana.entrypoints=web-secure" + - "traefik.http.routers.${STACK_NAME}-grafana.tls=true" + - "traefik.http.routers.${STACK_NAME}-grafana.tls.certresolver=${LETS_ENCRYPT_ENV}" + - "traefik.http.middlewares.${STACK_NAME}-http-to-https.redirectscheme.scheme=https" + - "traefik.http.middlewares.${STACK_NAME}-http-to-https.redirectscheme.permanent=true" + - "traefik.http.routers.${STACK_NAME}-grafana-redirect.rule=Host(`${GRAFANA_DOMAIN}`)" + - "traefik.http.routers.${STACK_NAME}-grafana-redirect.middlewares=${STACK_NAME}-http-to-https@docker" + +configs: + prometheus_yml: + name: ${STACK_NAME}_prometheus_yml_${PROMETHEUS_YML_VERSION} + file: prometheus.yml + promtail_yml: + name: ${STACK_NAME}_promtail_yml_${PROMTAIL_YML_VERSION} + file: promtail.yml + loki_yml: + name: ${STACK_NAME}_loki_yml_${LOKI_YML_VERSION} + file: loki.yml + node_exporter_entrypoint_sh: + name: ${STACK_NAME}_node_exporter_entrypoint_${NODE_EXPORTER_ENTRYPOINT_VERSION} + file: node-exporter-entrypoint.sh + +volumes: + prometheus-data: + grafana-data: + loki-data: + networks: + api: + driver: overlay + exporters: + driver: overlay + attachable: true + loki: + driver: overlay + attachable: true proxy: external: true diff --git a/loki.yml b/loki.yml new file mode 100644 index 0000000..c4a2b21 --- /dev/null +++ b/loki.yml @@ -0,0 +1,71 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +distributor: + ring: + kvstore: + store: memberlist + +ingester: + lifecycler: + ring: + kvstore: + store: memberlist + replication_factor: 1 + final_sleep: 0s + chunk_idle_period: 5m +
chunk_retain_period: 30s + +memberlist: + abort_if_cluster_join_fails: false + + # Expose this port on all distributor, ingester + # and querier replicas. + bind_port: 7946 + + # You can use a headless k8s service for all distributor, + # ingester and querier components. + join_members: + - loki:7946 + + max_join_backoff: 1m + max_join_retries: 10 + min_join_backoff: 1s + +schema_config: + configs: + - from: 2020-11-25 + store: boltdb-shipper + object_store: aws + schema: v11 + index: + prefix: index_ + period: 24h + +storage_config: + boltdb_shipper: + active_index_directory: /loki/index + cache_location: /loki/index_cache + resync_interval: 5s + shared_store: aws + + aws: + endpoint: + region: + access_key_id: + secret_access_key: + bucketnames: + insecure: false + sse_encryption: false + http_config: + idle_conn_timeout: 90s + response_header_timeout: 0s + insecure_skip_verify: false + s3forcepathstyle: true + +limits_config: + enforce_metric_name: false + reject_old_samples: true + reject_old_samples_max_age: 168h diff --git a/node-exporter-entrypoint.sh b/node-exporter-entrypoint.sh new file mode 100644 index 0000000..72e2f0b --- /dev/null +++ b/node-exporter-entrypoint.sh @@ -0,0 +1,9 @@ +#!/bin/sh -e + +NODE_NAME=$(cat /etc/nodename) +mkdir -p /etc/node-exporter +echo "node_meta{node_id=\"$NODE_ID\", container_label_com_docker_swarm_node_id=\"$NODE_ID\", node_name=\"$NODE_NAME\"} 1" > /etc/node-exporter/node-meta.prom + +set -- /bin/node_exporter "$@" + +exec "$@" diff --git a/prometheus.yml b/prometheus.yml new file mode 100644 index 0000000..ad4e77d --- /dev/null +++ b/prometheus.yml @@ -0,0 +1,56 @@ +global: + scrape_interval: 30s # Set the scrape interval to every 30 seconds. Default is every 1 minute. + evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). 
+ +# Alertmanager configuration +alerting: + alertmanagers: + - static_configs: + - targets: + # - alertmanager:9093 + +# Load rules once and periodically evaluate them according to the global 'evaluation_interval'. +rule_files: + # - "first_rules.yml" + # - "second_rules.yml" + +# Scrape configurations: Prometheus itself, followed by the node-exporter, +# cAdvisor and Traefik jobs, whose targets are found via DNS A-record lookups. +scrape_configs: + # The job name is added as a label `job=` to any timeseries scraped from this config. + - job_name: 'prometheus' + # metrics_path defaults to '/metrics' + # scheme defaults to 'http'. + static_configs: + - targets: + - localhost:9090 + + # http://node_exporter:9100/metrics + - job_name: node-exporter + scrape_interval: 10s + metrics_path: "/metrics" + dns_sd_configs: + - names: + - 'tasks.node_exporter' + type: 'A' + port: 9100 + + + - job_name: 'cadvisor' + scrape_interval: 30s + metrics_path: '/metrics' + dns_sd_configs: + - names: + - 'tasks.cadvisor' + type: 'A' + port: 8080 + + - job_name: 'traefik' + scrape_interval: 30s + metrics_path: '/metrics' + dns_sd_configs: + - names: + - 'tasks.traefik_app' + type: 'A' + port: 8082 diff --git a/promtail.yml b/promtail.yml new file mode 100644 index 0000000..ed06e8c --- /dev/null +++ b/promtail.yml @@ -0,0 +1,18 @@ +server: + http_listen_port: 9080 + grpc_listen_port: 0 + +positions: + filename: /tmp/positions.yaml + +clients: + - url: http://loki:3100/loki/api/v1/push + +scrape_configs: +- job_name: system + static_configs: + - targets: + - localhost + labels: + job: varlogs + __path__: /var/log/*log