Compare commits
1 Commits
fix/Backup
...
cleanup-do
| Author | SHA1 | Date | |
|---|---|---|---|
| 4e038327b5 |
35
README.md
35
README.md
@ -18,32 +18,18 @@ It's based heavily on the [monitoring-lite](https://git.coopcloud.tech/coop-clou
|
||||
|
||||
<!-- endmetadata -->
|
||||
|
||||
## Setup a Metrics Gathering
|
||||
## Setup Metrics Gathering
|
||||
|
||||
Where gathering.org is the node you want to gather metrics from.
|
||||
|
||||
1. Configure DNS
|
||||
- monitoring.gathering.org
|
||||
- cadvisor.monitoring.gathering.org
|
||||
- node.monitoring.gathering.org
|
||||
1. Configure Traefik to use BasicAuth
|
||||
* `abra app config traefik.gathering.org`
|
||||
uncomment
|
||||
```
|
||||
# BASIC_AUTH
|
||||
COMPOSE_FILE="$COMPOSE_FILE:compose.basicauth.yml"
|
||||
BASIC_AUTH=1
|
||||
SECRET_USERSFILE_VERSION=v1
|
||||
```
|
||||
- Generate userslist with httpasswd hashed password
|
||||
`abra app secret insert traefik.gathering.org usersfile v1 'admin:<hashed-secret>'`
|
||||
make sure there is no whitespace in between `admin:<hashed-secret>`, it seems to break stuff...
|
||||
- `abra app deploy -f traefik`
|
||||
1. `abra app new monitoring-ng`
|
||||
1. `abra app config monitoring.gathering.org`
|
||||
for gathering only the main `compose.yml` is needed, nothing more.
|
||||
1. `abra app deploy monitoring.gathering.org`
|
||||
1. check that endpoints are up and basic-auth works
|
||||
2. [Configure Traefik to use BasicAuth](https://git.coopcloud.tech/coop-cloud/traefik#configuring-wildcard-ssl-using-dns)
|
||||
3. `abra app new monitoring-ng`
|
||||
4. `abra app config monitoring.gathering.org` (for gathering only the main `compose.yml` is needed, nothing more.)
|
||||
5. `abra app deploy monitoring.gathering.org`
|
||||
6. check that endpoints are up and basic-auth works
|
||||
- cadvisor.monitoring.gathering.org
|
||||
- node.monitoring.gathering.org
|
||||
|
||||
@ -56,16 +42,13 @@ In case you have no traefik running on the machine, you can expose the ports dir
|
||||
|
||||
## Setup Metrics Browser
|
||||
|
||||
This builds upon [Setup Metrics Gathering](#setup-metrics-grathering) so make sure you did that first.
|
||||
|
||||
1. Configure DNS
|
||||
- monitoring.example.org
|
||||
- prometheus.monitoring.example.org
|
||||
- loki.monitoring.example.org
|
||||
2. Setup monitoring stack
|
||||
- `abra app new monitoring-ng`
|
||||
- `abra app config monitoring.example.org`
|
||||
Uncomment all the stuff
|
||||
- `abra app secret insert monitoring.example.org basic_auth v1 <secret>`
|
||||
- `abra app config monitoring.example.org` Uncomment prometheus, loki and grafana
|
||||
- `abra app secret insert monitoring.example.org basic_auth v1 <password>`
|
||||
this needs the plaintext traefik basic-auth secret, not the hashed one!
|
||||
- `abra app secret ls monitoring.example.org`
|
||||
- `abra app deploy monitoring.example.org`
|
||||
|
||||
@ -30,6 +30,7 @@ services:
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.entrypoints=web-secure"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls=true"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.tls.certresolver=${LETS_ENCRYPT_ENV}"
|
||||
- "traefik.http.routers.${STACK_NAME}-prometheus.middlewares=basicauth@file"
|
||||
|
||||
configs:
|
||||
prometheus_yml:
|
||||
|
||||
@ -68,76 +68,50 @@
|
||||
"condition": "B",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "PBFA97CFB590B2093",
|
||||
"model": {
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "builder",
|
||||
"expr": "rate(backup[24h])",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A",
|
||||
"relativeTimeRange": {
|
||||
"from": 28800,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "PBFA97CFB590B2093",
|
||||
"model": {
|
||||
"disableTextWrap": false,
|
||||
"editorMode": "code",
|
||||
"expr": "rate(backup[25h]) == 0",
|
||||
"fullMetaSearch": false,
|
||||
"includeNullMetadata": true,
|
||||
"instant": true,
|
||||
"intervalMs": 1000,
|
||||
"legendFormat": "__auto",
|
||||
"maxDataPoints": 43200,
|
||||
"range": false,
|
||||
"refId": "A",
|
||||
"useBackend": false
|
||||
}
|
||||
"useBackend": false
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"relativeTimeRange": { "from": 600, "to": 0 },
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": { "params": [0, 0], "type": "within_range" },
|
||||
"operator": { "type": "and" },
|
||||
"query": { "params": ["C"] },
|
||||
"reducer": { "params": [], "type": "last" },
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": { "type": "__expr__", "uid": "__expr__" },
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "B",
|
||||
"relativeTimeRange": {
|
||||
"from": 28800,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
-1,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "B",
|
||||
"type": "threshold"
|
||||
}
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"for": "5m",
|
||||
"annotations": {},
|
||||
"labels": {},
|
||||
"isPaused": false
|
||||
},
|
||||
{{ end }}
|
||||
@ -214,9 +188,9 @@
|
||||
],
|
||||
"noDataState": "NoData",
|
||||
"execErrState": "Error",
|
||||
"for": "60m",
|
||||
"for": "20m",
|
||||
"annotations": {
|
||||
"summary": "Backup did not finish within 60 minutes"
|
||||
"summary": "Backup did not finish within 20 minutes"
|
||||
},
|
||||
"labels": {},
|
||||
"isPaused": false
|
||||
|
||||
Reference in New Issue
Block a user