diff options
field | value | date |
---|---|---|
author | Elizabeth Hunt <me@liz.coffee> | 2025-08-18 12:38:15 -0700 |
committer | Elizabeth Hunt <me@liz.coffee> | 2025-08-18 12:38:15 -0700 |
commit | a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8 (patch) | |
tree | dd28429a223bfc9264593b3f26eca28fca507fb9 /playbooks/roles/mon | |
parent | 91027c036cf7c15db76fe7b8317c754ab13d63d9 (diff) | |
download | infra-a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8.tar.gz infra-a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8.zip |
Deploy monitoring stack
Diffstat (limited to 'playbooks/roles/mon')
-rw-r--r-- | playbooks/roles/mon/templates/stacks/docker-compose.yml | 19 | ||||
-rw-r--r-- | playbooks/roles/mon/templates/volumes/gatus/config/config.yml | 106 |
2 files changed, 117 insertions, 8 deletions
diff --git a/playbooks/roles/mon/templates/stacks/docker-compose.yml b/playbooks/roles/mon/templates/stacks/docker-compose.yml index 98332cc..31a5932 100644 --- a/playbooks/roles/mon/templates/stacks/docker-compose.yml +++ b/playbooks/roles/mon/templates/stacks/docker-compose.yml @@ -8,6 +8,7 @@ services: - TZ={{ timezone }} - DEPLOYMENT_TIME={{ deployment_time }} networks: + - monint - proxy - metrics deploy: @@ -29,6 +30,23 @@ services: - traefik.http.routers.mon.entrypoints=websecure - traefik.http.services.mon.loadbalancer.server.port=8080 + uptime: + image: oci.liz.coffee/emprespresso/uptime:release + networks: + - monint + environment: + - TZ={{ timezone }} + - DEPLOYMENT_TIME={{ deployment_time }} + deploy: + mode: replicated + replicas: 1 + update_config: + parallelism: 1 + order: start-first + failure_action: rollback + labels: + - traefik.enable=false + prometheus: image: prom/prometheus:latest volumes: @@ -56,6 +74,7 @@ services: - traefik.http.services.prometheus.loadbalancer.server.port=9090 networks: + monint: proxy: external: true metrics: diff --git a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml index be11873..fb4e914 100644 --- a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml +++ b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml @@ -1,4 +1,14 @@ metrics: true +ui: + title: "liz.coffee health dashboard." 
+ header: "liz.coffee" + description: "⋆⭒˚.⋆ 🐧 <3 ⋆⭒˚.⋆" + logo: "https://src.liz.coffee/cgit.png" + default-sort-by: "group" +alerting: + ntfy: + topic: "{{ ntfy_topic }}" + priority: 1 endpoints: - name: "HealthCheck" @@ -7,12 +17,11 @@ endpoints: conditions: - "[STATUS] == 200" - "[BODY] == pat(*OK*)" - - - name: "Expiration For {{ domain }}" - url: "https://{{ domain }}" - interval: 30m - conditions: - - "[CERTIFICATE_EXPIRATION] > 240h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "GlobalHealthCheck" - name: "LDAPS" url: "tls://{{ idm_domain }}:3636" @@ -22,6 +31,52 @@ endpoints: conditions: - "[CONNECTED] == true" - "[CERTIFICATE_EXPIRATION] > 48h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "LDAPS" + +{% for test in email_tests %} +{% set from_account = (email_accounts | selectattr("email", "equalto", test.from) | list).0 %} +{% set to_account = (email_accounts | selectattr("email", "equalto", test.to) | list).0 %} + - name: "{{ test.name }}" + url: "http://uptime:9000/email" + method: "POST" + interval: 5m + client: + timeout: 120s + group: "mail" + body: | + { + "from": { + "email": "{{ from_account.email }}", + "username": "{{ from_account.username }}", + "password": "{{ from_account.password }}", + "send_host": "{{ from_account.host }}", + "send_port": 465 + }, + "to": { + "email": "{{ to_account.email }}", + "username": "{{ to_account.username }}", + "password": "{{ to_account.password }}", + "read_host": "{{ to_account.host }}", + "read_port": 993 + }, + "readRetry": { + "interval": 5, + "attempts": 24 + } + } + conditions: + - "[STATUS] == 200" + - "[BODY] == pat(*ok*)" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "mail {{ test.name }}" +{% endfor %} {% for port in [465,993] %} - name: "mail on port {{ port }}" @@ -33,6 +88,11 @@ endpoints: conditions: - "[CONNECTED] == true" - "[CERTIFICATE_EXPIRATION] > 48h" + alerts: + - 
type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "mail on port {{ port }}" {% endfor %} {% for user, m in mesh.items() %} @@ -43,22 +103,42 @@ endpoints: interval: 1m conditions: - "[STATUS] == 200" + - "[CERTIFICATE_EXPIRATION] > 240h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "healthcheck {{ user }} pub {{ healthcheck }} 200" {% endfor %} {% for healthcheck in m.private_healthchecks %} - name: "{{ healthcheck }} priv healthcheck {{ user }}" url: "{{ healthcheck }}" + client: + dns-resolver: "tcp://{{ m.gateway }}:53" group: "{{ user }}_priv" interval: 1m conditions: - "[STATUS] == 200" + - "[CERTIFICATE_EXPIRATION] > 240h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "{{ healthcheck }} priv healthcheck {{ user }}" + - name: "{{ healthcheck }} pub healthcheck {{ user }} 403" group: "{{ user }}_priv" url: "{{ healthcheck }}" client: - dns-resolver: "tcp://1.1.1.1:53" + dns-resolver: "tcp://{{ public_resolver }}:53" interval: 1m conditions: - "[STATUS] == 403" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "{{ healthcheck }} pub healthcheck {{ user }} 403" {% endfor %} {% for record in m.private_records %} - name: "DNS Check [{{ record.name }}_{{ record.type }}]" @@ -71,11 +151,21 @@ endpoints: conditions: - "[BODY] == {{ record.ip }}" - "[DNS_RCODE] == NOERROR" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "DNS {{ record.name }}_{{ record.type }}" - - name: "PING {{ record.name }}" + - name: "PING {{ record.name }}_{{ record.type }}" group: "{{ user }}_dns_private" url: "icmp://{{ record.name }}" conditions: - "[CONNECTED] == true" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "PING {{ record.name }}" {% endfor %} {% endfor %} |