summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Elizabeth Hunt <me@liz.coffee> 2025-08-18 12:38:15 -0700
committer Elizabeth Hunt <me@liz.coffee> 2025-08-18 12:38:15 -0700
commit a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8 (patch)
tree dd28429a223bfc9264593b3f26eca28fca507fb9
parent 91027c036cf7c15db76fe7b8317c754ab13d63d9 (diff)
download infra-a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8.tar.gz
download infra-a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8.zip
Deploy monitoring stack
-rw-r--r-- group_vars/all.yml 3
-rw-r--r-- group_vars/mon.yml 34
-rw-r--r-- playbooks/roles/coffee/templates/stacks/docker-compose.yml 2
-rw-r--r-- playbooks/roles/mon/templates/stacks/docker-compose.yml 19
-rw-r--r-- playbooks/roles/mon/templates/volumes/gatus/config/config.yml 106
-rw-r--r-- playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg 2
-rw-r--r-- secrets.txt 10
7 files changed, 166 insertions, 10 deletions
diff --git a/group_vars/all.yml b/group_vars/all.yml
index 05ce106..e5c4611 100644
--- a/group_vars/all.yml
+++ b/group_vars/all.yml
@@ -103,6 +103,9 @@ mesh:
ip: "10.128.0.103"
- type: "A"
+ name: "{{ domain }}"
+ ip: "{{ loadbalancer_ip }}"
+ - type: "A"
name: "oci.{{ domain }}"
ip: "{{ loadbalancer_ip }}"
- type: "A"
diff --git a/group_vars/mon.yml b/group_vars/mon.yml
index 1d0944e..2262bf3 100644
--- a/group_vars/mon.yml
+++ b/group_vars/mon.yml
@@ -3,4 +3,38 @@
mon_domain: mon.liz.coffee
mon_base: "{{ swarm_base }}/mon"
+public_resolver: "1.1.1.1"
+
prometheus_domain: prometheus.liz.coffee
+
+email_accounts:
+ - username: "{{ mmtc1_username }}"
+ email: "{{ mmtc1_username }}@mistymountainstherapy.com"
+ password: "{{ mmtc1_password }}"
+ host: "mail.mistymountainstherapy.com"
+ - username: "{{ mmtc2_username }}"
+ email: "{{ mmtc2_username }}@mistymountainstherapy.com"
+ password: "{{ mmtc2_password }}"
+ host: "mail.mistymountainstherapy.com"
+ - username: "{{ lizc1_username }}"
+ email: "{{ lizc1_username }}@{{ domain }}"
+ password: "{{ lizc1_password }}"
+ host: "{{ mail_domain }}"
+ - username: "{{ lizc2_username }}"
+ email: "{{ lizc2_username }}@{{ domain }}"
+ password: "{{ lizc2_password }}"
+ host: "{{ mail_domain }}"
+
+email_tests:
+ - name: "mmt_to_mmt"
+ from: "{{ mmtc1_username }}@mistymountainstherapy.com"
+ to: "{{ mmtc2_username }}@mistymountainstherapy.com"
+ - name: "mmt_to_liz"
+ from: "{{ mmtc1_username }}@mistymountainstherapy.com"
+ to: "{{ lizc2_username }}@{{ domain }}"
+ - name: "liz_to_mmt"
+ from: "{{ lizc1_username }}@{{ domain }}"
+ to: "{{ mmtc2_username }}@mistymountainstherapy.com"
+ - name: "liz_to_liz"
+ from: "{{ lizc2_username }}@{{ domain }}"
+ to: "{{ lizc1_username }}@{{ domain }}"
diff --git a/playbooks/roles/coffee/templates/stacks/docker-compose.yml b/playbooks/roles/coffee/templates/stacks/docker-compose.yml
index f3f33bd..1e7afb9 100644
--- a/playbooks/roles/coffee/templates/stacks/docker-compose.yml
+++ b/playbooks/roles/coffee/templates/stacks/docker-compose.yml
@@ -9,7 +9,7 @@ services:
networks:
- proxy
healthcheck:
- test: ["CMD-SHELL", "curl", "--fail", "http://localhost:8080"]
+ test: ["CMD-SHELL", "curl --fail http://localhost:8080"]
timeout: 15s
interval: 30s
retries: 3
diff --git a/playbooks/roles/mon/templates/stacks/docker-compose.yml b/playbooks/roles/mon/templates/stacks/docker-compose.yml
index 98332cc..31a5932 100644
--- a/playbooks/roles/mon/templates/stacks/docker-compose.yml
+++ b/playbooks/roles/mon/templates/stacks/docker-compose.yml
@@ -8,6 +8,7 @@ services:
- TZ={{ timezone }}
- DEPLOYMENT_TIME={{ deployment_time }}
networks:
+ - monint
- proxy
- metrics
deploy:
@@ -29,6 +30,23 @@ services:
- traefik.http.routers.mon.entrypoints=websecure
- traefik.http.services.mon.loadbalancer.server.port=8080
+ uptime:
+ image: oci.liz.coffee/emprespresso/uptime:release
+ networks:
+ - monint
+ environment:
+ - TZ={{ timezone }}
+ - DEPLOYMENT_TIME={{ deployment_time }}
+ deploy:
+ mode: replicated
+ replicas: 1
+ update_config:
+ parallelism: 1
+ order: start-first
+ failure_action: rollback
+ labels:
+ - traefik.enable=false
+
prometheus:
image: prom/prometheus:latest
volumes:
@@ -56,6 +74,7 @@ services:
- traefik.http.services.prometheus.loadbalancer.server.port=9090
networks:
+ monint:
proxy:
external: true
metrics:
diff --git a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
index be11873..fb4e914 100644
--- a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
+++ b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
@@ -1,4 +1,14 @@
metrics: true
+ui:
+ title: "liz.coffee health dashboard."
+ header: "liz.coffee"
+ description: "⋆⭒˚.⋆ 🐧 <3 ⋆⭒˚.⋆"
+ logo: "https://src.liz.coffee/cgit.png"
+ default-sort-by: "group"
+alerting:
+ ntfy:
+ topic: "{{ ntfy_topic }}"
+ priority: 1
endpoints:
- name: "HealthCheck"
@@ -7,12 +17,11 @@ endpoints:
conditions:
- "[STATUS] == 200"
- "[BODY] == pat(*OK*)"
-
- - name: "Expiration For {{ domain }}"
- url: "https://{{ domain }}"
- interval: 30m
- conditions:
- - "[CERTIFICATE_EXPIRATION] > 240h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "GlobalHealthCheck"
- name: "LDAPS"
url: "tls://{{ idm_domain }}:3636"
@@ -22,6 +31,52 @@ endpoints:
conditions:
- "[CONNECTED] == true"
- "[CERTIFICATE_EXPIRATION] > 48h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "LDAPS"
+
+{% for test in email_tests %}
+{% set from_account = (email_accounts | selectattr("email", "equalto", test.from) | list).0 %}
+{% set to_account = (email_accounts | selectattr("email", "equalto", test.to) | list).0 %}
+ - name: "{{ test.name }}"
+ url: "http://uptime:9000/email"
+ method: "POST"
+ interval: 5m
+ client:
+ timeout: 120s
+ group: "mail"
+ body: |
+ {
+ "from": {
+ "email": "{{ from_account.email }}",
+ "username": "{{ from_account.username }}",
+ "password": "{{ from_account.password }}",
+ "send_host": "{{ from_account.host }}",
+ "send_port": 465
+ },
+ "to": {
+ "email": "{{ to_account.email }}",
+ "username": "{{ to_account.username }}",
+ "password": "{{ to_account.password }}",
+ "read_host": "{{ to_account.host }}",
+ "read_port": 993
+ },
+ "readRetry": {
+ "interval": 5,
+ "attempts": 24
+ }
+ }
+ conditions:
+ - "[STATUS] == 200"
+ - "[BODY] == pat(*ok*)"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "mail {{ test.name }}"
+{% endfor %}
{% for port in [465,993] %}
- name: "mail on port {{ port }}"
@@ -33,6 +88,11 @@ endpoints:
conditions:
- "[CONNECTED] == true"
- "[CERTIFICATE_EXPIRATION] > 48h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "mail on port {{ port }}"
{% endfor %}
{% for user, m in mesh.items() %}
@@ -43,22 +103,42 @@ endpoints:
interval: 1m
conditions:
- "[STATUS] == 200"
+ - "[CERTIFICATE_EXPIRATION] > 240h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "healthcheck {{ user }} pub {{ healthcheck }} 200"
{% endfor %}
{% for healthcheck in m.private_healthchecks %}
- name: "{{ healthcheck }} priv healthcheck {{ user }}"
url: "{{ healthcheck }}"
+ client:
+ dns-resolver: "tcp://{{ m.gateway }}:53"
group: "{{ user }}_priv"
interval: 1m
conditions:
- "[STATUS] == 200"
+ - "[CERTIFICATE_EXPIRATION] > 240h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "{{ healthcheck }} priv healthcheck {{ user }}"
+
- name: "{{ healthcheck }} pub healthcheck {{ user }} 403"
group: "{{ user }}_priv"
url: "{{ healthcheck }}"
client:
- dns-resolver: "tcp://1.1.1.1:53"
+ dns-resolver: "tcp://{{ public_resolver }}:53"
interval: 1m
conditions:
- "[STATUS] == 403"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "{{ healthcheck }} pub healthcheck {{ user }} 403"
{% endfor %}
{% for record in m.private_records %}
- name: "DNS Check [{{ record.name }}_{{ record.type }}]"
@@ -71,11 +151,21 @@ endpoints:
conditions:
- "[BODY] == {{ record.ip }}"
- "[DNS_RCODE] == NOERROR"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "DNS {{ record.name }}_{{ record.type }}"
- - name: "PING {{ record.name }}"
+ - name: "PING {{ record.name }}_{{ record.type }}"
group: "{{ user }}_dns_private"
url: "icmp://{{ record.name }}"
conditions:
- "[CONNECTED] == true"
+ alerts:
+ - type: ntfy
+ failure-threshold: 1
+ send-on-resolved: true
+ description: "PING {{ record.name }}"
{% endfor %}
{% endfor %}
diff --git a/playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg b/playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg
index 3c412de..fff224b 100644
--- a/playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg
+++ b/playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg
@@ -5,7 +5,7 @@ email_domains = "*"
reverse_proxy = true
redirect_url = "https://{{ oauth_proxy_domain }}/oauth2/callback"
real_client_ip_header = "X-Forwarded-For"
-trusted_ips = "{{ homelab_network }}"
+trusted_ips = "{{ swarm_network }}"
## Cookie Settings
cookie_name = "_oauth2_proxy"
diff --git a/secrets.txt b/secrets.txt
index 89b7245..29ede44 100644
--- a/secrets.txt
+++ b/secrets.txt
@@ -26,3 +26,13 @@ oauth_proxy_super_secret_header
metrics_htpasswd_user
metrics_htpasswd_passwd
metrics_htpasswd
+healthchecks_io_ping
+ntfy_topic
+mmtc1_username
+mmtc1_password
+mmtc2_username
+mmtc2_password
+lizc1_username
+lizc1_password
+lizc2_username
+lizc2_password