summary | refs | log | tree | commit | diff
path: root/playbooks/roles/mon/templates
diff options
context:
space:
mode:
author Elizabeth Hunt <me@liz.coffee> 2025-08-18 12:38:15 -0700
committer Elizabeth Hunt <me@liz.coffee> 2025-08-18 12:38:15 -0700
commit a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8 (patch)
tree dd28429a223bfc9264593b3f26eca28fca507fb9 /playbooks/roles/mon/templates
parent 91027c036cf7c15db76fe7b8317c754ab13d63d9 (diff)
download infra-a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8.tar.gz
infra-a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8.zip
Deploy monitoring stack
Diffstat (limited to 'playbooks/roles/mon/templates')
-rw-r--r-- playbooks/roles/mon/templates/stacks/docker-compose.yml | 19
-rw-r--r-- playbooks/roles/mon/templates/volumes/gatus/config/config.yml | 106
2 files changed, 117 insertions, 8 deletions
diff --git a/playbooks/roles/mon/templates/stacks/docker-compose.yml b/playbooks/roles/mon/templates/stacks/docker-compose.yml
index 98332cc..31a5932 100644
--- a/playbooks/roles/mon/templates/stacks/docker-compose.yml
+++ b/playbooks/roles/mon/templates/stacks/docker-compose.yml
@@ -8,6 +8,7 @@ services:
- TZ={{ timezone }}
- DEPLOYMENT_TIME={{ deployment_time }}
networks:
+ - monint
- proxy
- metrics
deploy:
@@ -29,6 +30,23 @@ services:
- traefik.http.routers.mon.entrypoints=websecure
- traefik.http.services.mon.loadbalancer.server.port=8080
+ uptime:
+ image: oci.liz.coffee/emprespresso/uptime:release
+ networks:
+ - monint
+ environment:
+ - TZ={{ timezone }}
+ - DEPLOYMENT_TIME={{ deployment_time }}
+ deploy:
+ mode: replicated
+ replicas: 1
+ update_config:
+ parallelism: 1
+ order: start-first
+ failure_action: rollback
+ labels:
+ - traefik.enable=false
+
prometheus:
image: prom/prometheus:latest
volumes:
@@ -56,6 +74,7 @@ services:
- traefik.http.services.prometheus.loadbalancer.server.port=9090
networks:
+ monint:
proxy:
external: true
metrics:
diff --git a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
index be11873..fb4e914 100644
--- a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
+++ b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
@@ -1,4 +1,14 @@
metrics: true
+ui:
+ title: "liz.coffee health dashboard."
+ header: "liz.coffee"
+ description: "⋆⭒˚.⋆ 🐧 <3 ⋆⭒˚.⋆"
+ logo: "https://src.liz.coffee/cgit.png"
+ default-sort-by: "group"
+alerting:
+ ntfy:
+ topic: "{{ ntfy_topic }}"
+ priority: 1
endpoints:
- name: "HealthCheck"
@@ -7,12 +17,11 @@ endpoints:
conditions:
- "[STATUS] == 200"
- "[BODY] == pat(*OK*)"
-
- - name: "Expiration For {{ domain }}"
- url: "https://{{ domain }}"
- interval: 30m
- conditions:
- - "[CERTIFICATE_EXPIRATION] > 240h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "GlobalHealthCheck"
- name: "LDAPS"
url: "tls://{{ idm_domain }}:3636"
@@ -22,6 +31,52 @@ endpoints:
conditions:
- "[CONNECTED] == true"
- "[CERTIFICATE_EXPIRATION] > 48h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "LDAPS"
+
+{% for test in email_tests %}
+{% set from_account = (email_accounts | selectattr("email", "equalto", test.from) | list).0 %}
+{% set to_account = (email_accounts | selectattr("email", "equalto", test.to) | list).0 %}
+ - name: "{{ test.name }}"
+ url: "http://uptime:9000/email"
+ method: "POST"
+ interval: 5m
+ client:
+ timeout: 120s
+ group: "mail"
+ body: |
+ {
+ "from": {
+ "email": "{{ from_account.email }}",
+ "username": "{{ from_account.username }}",
+ "password": "{{ from_account.password }}",
+ "send_host": "{{ from_account.host }}",
+ "send_port": 465
+ },
+ "to": {
+ "email": "{{ to_account.email }}",
+ "username": "{{ to_account.username }}",
+ "password": "{{ to_account.password }}",
+ "read_host": "{{ to_account.host }}",
+ "read_port": 993
+ },
+ "readRetry": {
+ "interval": 5,
+ "attempts": 24
+ }
+ }
+ conditions:
+ - "[STATUS] == 200"
+ - "[BODY] == pat(*ok*)"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "mail {{ test.name }}"
+{% endfor %}
{% for port in [465,993] %}
- name: "mail on port {{ port }}"
@@ -33,6 +88,11 @@ endpoints:
conditions:
- "[CONNECTED] == true"
- "[CERTIFICATE_EXPIRATION] > 48h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "mail on port {{ port }}"
{% endfor %}
{% for user, m in mesh.items() %}
@@ -43,22 +103,42 @@ endpoints:
interval: 1m
conditions:
- "[STATUS] == 200"
+ - "[CERTIFICATE_EXPIRATION] > 240h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "healthcheck {{ user }} pub {{ healthcheck }} 200"
{% endfor %}
{% for healthcheck in m.private_healthchecks %}
- name: "{{ healthcheck }} priv healthcheck {{ user }}"
url: "{{ healthcheck }}"
+ client:
+ dns-resolver: "tcp://{{ m.gateway }}:53"
group: "{{ user }}_priv"
interval: 1m
conditions:
- "[STATUS] == 200"
+ - "[CERTIFICATE_EXPIRATION] > 240h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "{{ healthcheck }} priv healthcheck {{ user }}"
+
- name: "{{ healthcheck }} pub healthcheck {{ user }} 403"
group: "{{ user }}_priv"
url: "{{ healthcheck }}"
client:
- dns-resolver: "tcp://1.1.1.1:53"
+ dns-resolver: "tcp://{{ public_resolver }}:53"
interval: 1m
conditions:
- "[STATUS] == 403"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "{{ healthcheck }} pub healthcheck {{ user }} 403"
{% endfor %}
{% for record in m.private_records %}
- name: "DNS Check [{{ record.name }}_{{ record.type }}]"
@@ -71,11 +151,21 @@ endpoints:
conditions:
- "[BODY] == {{ record.ip }}"
- "[DNS_RCODE] == NOERROR"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "DNS {{ record.name }}_{{ record.type }}"
- - name: "PING {{ record.name }}"
+ - name: "PING {{ record.name }}_{{ record.type }}"
group: "{{ user }}_dns_private"
url: "icmp://{{ record.name }}"
conditions:
- "[CONNECTED] == true"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "PING {{ record.name }}"
{% endfor %}
{% endfor %}