From a6cc1ed3617ce0e71e5f73543a0ae4dcd38791a8 Mon Sep 17 00:00:00 2001 From: Elizabeth Hunt Date: Mon, 18 Aug 2025 12:38:15 -0700 Subject: Deploy monitoring stack --- group_vars/all.yml | 3 + group_vars/mon.yml | 34 +++++++ .../coffee/templates/stacks/docker-compose.yml | 2 +- .../roles/mon/templates/stacks/docker-compose.yml | 19 ++++ .../mon/templates/volumes/gatus/config/config.yml | 106 +++++++++++++++++++-- .../templates/volumes/oauth2proxy/oauth_proxy.cfg | 2 +- secrets.txt | 10 ++ 7 files changed, 166 insertions(+), 10 deletions(-) diff --git a/group_vars/all.yml b/group_vars/all.yml index 05ce106..e5c4611 100644 --- a/group_vars/all.yml +++ b/group_vars/all.yml @@ -102,6 +102,9 @@ mesh: name: "roton.{{ domain }}" ip: "10.128.0.103" + - type: "A" + name: "{{ domain }}" + ip: "{{ loadbalancer_ip }}" - type: "A" name: "oci.{{ domain }}" ip: "{{ loadbalancer_ip }}" diff --git a/group_vars/mon.yml b/group_vars/mon.yml index 1d0944e..2262bf3 100644 --- a/group_vars/mon.yml +++ b/group_vars/mon.yml @@ -3,4 +3,38 @@ mon_domain: mon.liz.coffee mon_base: "{{ swarm_base }}/mon" +public_resolver: "1.1.1.1" + prometheus_domain: prometheus.liz.coffee + +email_accounts: + - username: "{{ mmtc1_username }}" + email: "{{ mmtc1_username }}@mistymountainstherapy.com" + password: "{{ mmtc1_password }}" + host: "mail.mistymountainstherapy.com" + - username: "{{ mmtc2_username }}" + email: "{{ mmtc2_username }}@mistymountainstherapy.com" + password: "{{ mmtc2_password }}" + host: "mail.mistymountainstherapy.com" + - username: "{{ lizc1_username }}" + email: "{{ lizc1_username }}@{{ domain }}" + password: "{{ lizc1_password }}" + host: "{{ mail_domain }}" + - username: "{{ lizc2_username }}" + email: "{{ lizc2_username }}@{{ domain }}" + password: "{{ lizc2_password }}" + host: "{{ mail_domain }}" + +email_tests: + - name: "mmt_to_mmt" + from: "{{ mmtc1_username }}@mistymountainstherapy.com" + to: "{{ mmtc2_username }}@mistymountainstherapy.com" + - name: "mmt_to_liz" + from: "{{ mmtc1_username }}@mistymountainstherapy.com" + to: "{{ lizc2_username }}@{{ domain }}" + - name: "liz_to_mmt" + from: "{{ lizc1_username }}@{{ domain }}" + to: "{{ mmtc2_username }}@mistymountainstherapy.com" + - name: "liz_to_liz" + from: "{{ lizc2_username }}@{{ domain }}" + to: "{{ lizc1_username }}@{{ domain }}" diff --git a/playbooks/roles/coffee/templates/stacks/docker-compose.yml b/playbooks/roles/coffee/templates/stacks/docker-compose.yml index f3f33bd..1e7afb9 100644 --- a/playbooks/roles/coffee/templates/stacks/docker-compose.yml +++ b/playbooks/roles/coffee/templates/stacks/docker-compose.yml @@ -9,7 +9,7 @@ services: networks: - proxy healthcheck: - test: ["CMD-SHELL", "curl", "--fail", "http://localhost:8080"] + test: ["CMD-SHELL", "curl --fail http://localhost:8080"] timeout: 15s interval: 30s retries: 3 diff --git a/playbooks/roles/mon/templates/stacks/docker-compose.yml b/playbooks/roles/mon/templates/stacks/docker-compose.yml index 98332cc..31a5932 100644 --- a/playbooks/roles/mon/templates/stacks/docker-compose.yml +++ b/playbooks/roles/mon/templates/stacks/docker-compose.yml @@ -8,6 +8,7 @@ services: - TZ={{ timezone }} - DEPLOYMENT_TIME={{ deployment_time }} networks: + - monint - proxy - metrics deploy: @@ -29,6 +30,23 @@ services: - traefik.http.routers.mon.entrypoints=websecure - traefik.http.services.mon.loadbalancer.server.port=8080 + uptime: + image: oci.liz.coffee/emprespresso/uptime:release + networks: + - monint + environment: + - TZ={{ timezone }} + - DEPLOYMENT_TIME={{ deployment_time }} + deploy: + mode: replicated + replicas: 1 + update_config: + parallelism: 1 + order: start-first + failure_action: rollback + labels: + - traefik.enable=false + prometheus: image: prom/prometheus:latest volumes: @@ -56,6 +74,7 @@ services: - traefik.http.services.prometheus.loadbalancer.server.port=9090 networks: + monint: proxy: external: true metrics: diff --git a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml index be11873..fb4e914 100644 --- a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml +++ b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml @@ -1,4 +1,14 @@ metrics: true +ui: + title: "liz.coffee health dashboard." + header: "liz.coffee" + description: "⋆⭒˚.⋆ 🐧 <3 ⋆⭒˚.⋆" + logo: "https://src.liz.coffee/cgit.png" + default-sort-by: "group" +alerting: + ntfy: + topic: "{{ ntfy_topic }}" + priority: 1 endpoints: - name: "HealthCheck" @@ -7,12 +17,11 @@ endpoints: conditions: - "[STATUS] == 200" - "[BODY] == pat(*OK*)" - - - name: "Expiration For {{ domain }}" - url: "https://{{ domain }}" - interval: 30m - conditions: - - "[CERTIFICATE_EXPIRATION] > 240h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "GlobalHealthCheck" - name: "LDAPS" url: "tls://{{ idm_domain }}:3636" @@ -22,6 +31,52 @@ endpoints: conditions: - "[CONNECTED] == true" - "[CERTIFICATE_EXPIRATION] > 48h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "LDAPS" + +{% for test in email_tests %} +{% set from_account = (email_accounts | selectattr("email", "equalto", test.from) | list).0 %} +{% set to_account = (email_accounts | selectattr("email", "equalto", test.to) | list).0 %} + - name: "{{ test.name }}" + url: "http://uptime:9000/email" + method: "POST" + interval: 5m + client: + timeout: 120s + group: "mail" + body: | + { + "from": { + "email": "{{ from_account.email }}", + "username": "{{ from_account.username }}", + "password": "{{ from_account.password }}", + "send_host": "{{ from_account.host }}", + "send_port": 465 + }, + "to": { + "email": "{{ to_account.email }}", + "username": "{{ to_account.username }}", + "password": "{{ to_account.password }}", + "read_host": "{{ to_account.host }}", + "read_port": 993 + }, + "readRetry": { + "interval": 5, + "attempts": 24 + } + } + conditions: + - "[STATUS] == 200" + - "[BODY] == pat(*ok*)" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "mail {{ test.name }}" +{% endfor %} {% for port in [465,993] %} - name: "mail on port {{ port }}" @@ -33,6 +88,11 @@ endpoints: conditions: - "[CONNECTED] == true" - "[CERTIFICATE_EXPIRATION] > 48h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "mail on port {{ port }}" {% endfor %} {% for user, m in mesh.items() %} @@ -43,22 +103,42 @@ endpoints: interval: 1m conditions: - "[STATUS] == 200" + - "[CERTIFICATE_EXPIRATION] > 240h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "healthcheck {{ user }} pub {{ healthcheck }} 200" {% endfor %} {% for healthcheck in m.private_healthchecks %} - name: "{{ healthcheck }} priv healthcheck {{ user }}" url: "{{ healthcheck }}" + client: + dns-resolver: "tcp://{{ m.gateway }}:53" group: "{{ user }}_priv" interval: 1m conditions: - "[STATUS] == 200" + - "[CERTIFICATE_EXPIRATION] > 240h" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "{{ healthcheck }} priv healthcheck {{ user }}" + - name: "{{ healthcheck }} pub healthcheck {{ user }} 403" group: "{{ user }}_priv" url: "{{ healthcheck }}" client: - dns-resolver: "tcp://1.1.1.1:53" + dns-resolver: "tcp://{{ public_resolver }}:53" interval: 1m conditions: - "[STATUS] == 403" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "{{ healthcheck }} pub healthcheck {{ user }} 403" {% endfor %} {% for record in m.private_records %} - name: "DNS Check [{{ record.name }}_{{ record.type }}]" @@ -71,11 +151,21 @@ endpoints: conditions: - "[BODY] == {{ record.ip }}" - "[DNS_RCODE] == NOERROR" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "DNS {{ record.name }}_{{ record.type }}" - - name: "PING {{ record.name }}" + - name: "PING {{ record.name }}_{{ record.type }}" group: "{{ user }}_dns_private" url: "icmp://{{ record.name }}" conditions: - "[CONNECTED] == true" + alerts: + - type: ntfy + failure-threshold: 1 + send-on-resolved: true + description: "PING {{ record.name }}" {% endfor %} {% endfor %} diff --git a/playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg b/playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg index 3c412de..fff224b 100644 --- a/playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg +++ b/playbooks/roles/traefik/templates/volumes/oauth2proxy/oauth_proxy.cfg @@ -5,7 +5,7 @@ email_domains = "*" reverse_proxy = true redirect_url = "https://{{ oauth_proxy_domain }}/oauth2/callback" real_client_ip_header = "X-Forwarded-For" -trusted_ips = "{{ homelab_network }}" +trusted_ips = "{{ swarm_network }}" ## Cookie Settings cookie_name = "_oauth2_proxy" diff --git a/secrets.txt b/secrets.txt index 89b7245..29ede44 100644 --- a/secrets.txt +++ b/secrets.txt @@ -26,3 +26,13 @@ oauth_proxy_super_secret_header metrics_htpasswd_user metrics_htpasswd_passwd metrics_htpasswd +healthchecks_io_ping +ntfy_topic +mmtc1_username +mmtc1_password +mmtc2_username +mmtc2_password +lizc1_username +lizc1_password +lizc2_username +lizc2_password -- cgit v1.2.3-70-g09d2