summary | refs | log | tree | commit | diff
path: root/playbooks/roles
diff options
context:
space:
mode:
author: Elizabeth Hunt <me@liz.coffee>, 2025-09-07 22:14:12 -0700
committer: Elizabeth Hunt <me@liz.coffee>, 2025-09-07 22:14:12 -0700
commit: b0c0189066a486f9da4a048c7780dbc640031d27 (patch)
tree: 6ad5997b7ce9e3d4a0232071f103622aca2395ca /playbooks/roles
parent: 9859fc0ee7503d853b5d028f0594b6f885d27dc3 (diff)
download: infra-b0c0189066a486f9da4a048c7780dbc640031d27.tar.gz
          infra-b0c0189066a486f9da4a048c7780dbc640031d27.zip
Remove portainer and add some monitoring updates
Diffstat (limited to 'playbooks/roles')
-rw-r--r--  playbooks/roles/mon/templates/stacks/docker-compose.yml  29
-rw-r--r--  playbooks/roles/mon/templates/volumes/gatus/config/config.yml  45
-rw-r--r--  playbooks/roles/mon/templates/volumes/grafana/config.ini  28
-rw-r--r--  playbooks/roles/mon/templates/volumes/grafana/data/.gitkeep (renamed from playbooks/roles/portainer/templates/volumes/data/.gitkeep)  0
-rw-r--r--  playbooks/roles/mon/templates/volumes/prometheus/config.yml  9
-rw-r--r--  playbooks/roles/outbound/templates/proxy/nginx/conf.d/graph.conf  19
-rw-r--r--  playbooks/roles/portainer/tasks/main.yml  9
-rw-r--r--  playbooks/roles/portainer/templates/stacks/docker-compose.yml  52
-rw-r--r--  playbooks/roles/traefik/templates/stacks/traefik.yml  3
9 files changed, 117 insertions, 77 deletions
diff --git a/playbooks/roles/mon/templates/stacks/docker-compose.yml b/playbooks/roles/mon/templates/stacks/docker-compose.yml
index 31a5932..31fe0c1 100644
--- a/playbooks/roles/mon/templates/stacks/docker-compose.yml
+++ b/playbooks/roles/mon/templates/stacks/docker-compose.yml
@@ -47,11 +47,40 @@ services:
labels:
- traefik.enable=false
+ grafana:
+ image: grafana/grafana:latest
+ environment:
+ - TZ={{ timezone }}
+ - DEPLOYMENT_TIME={{ deployment_time }}
+ volumes:
+ - "{{ mon_base }}/volumes/grafana/data:/var/lib/grafana"
+ - "{{ mon_base }}/volumes/grafana/config.ini:/etc/grafana/grafana.ini:ro"
+ networks:
+ - monint
+ - proxy
+ - metrics
+ deploy:
+ mode: replicated
+ replicas: 1
+ update_config:
+ parallelism: 1
+ order: start-first
+ failure_action: rollback
+ labels:
+ - traefik.enable=true
+ - traefik.swarm.network=proxy
+ - traefik.http.routers.grafana.tls=true
+ - traefik.http.routers.grafana.tls.certResolver=letsencrypt
+ - traefik.http.routers.grafana.rule=Host(`{{ grafana_domain }}`)
+ - traefik.http.routers.grafana.entrypoints=websecure
+ - traefik.http.services.grafana.loadbalancer.server.port=3000
+
prometheus:
image: prom/prometheus:latest
volumes:
- "{{ mon_base }}/volumes/prometheus/config.yml:/etc/prometheus/prometheus.yml"
networks:
+ - monint
- proxy
- metrics
environment:
diff --git a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
index 403df4a..e5fcf73 100644
--- a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
+++ b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml
@@ -19,13 +19,13 @@ endpoints:
- "[BODY] == pat(*OK*)"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "GlobalHealthCheck"
- name: "LDAPS"
url: "tls://{{ idm_domain }}:3636"
- interval: 5m
+ interval: 2m
client:
timeout: 5s
conditions:
@@ -33,10 +33,24 @@ endpoints:
- "[CERTIFICATE_EXPIRATION] > 48h"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "LDAPS"
+ - name: "ssh for git"
+ url: "tls://src.{{ domain }}:2222"
+ interval: 2m
+ client:
+ timeout: 5s
+ conditions:
+ - "[CONNECTED] == true"
+ - "[CERTIFICATE_EXPIRATION] > 48h"
+ alerts:
+ - type: ntfy
+ failure-threshold: 3
+ send-on-resolved: true
+ description: "ssh for git"
+
{% for test in email_tests %}
{% set from_account = (email_accounts | selectattr("email", "equalto", test.from) | list).0 %}
{% set to_account = (email_accounts | selectattr("email", "equalto", test.to) | list).0 %}
@@ -73,7 +87,7 @@ endpoints:
- "[BODY] == pat(*ok*)"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "mail {{ test.name }}"
{% endfor %}
@@ -82,7 +96,7 @@ endpoints:
- name: "mail on port {{ port }}"
group: "mail"
url: "tls://{{ mail_domain }}:{{ port }}"
- interval: 5m
+ interval: 2m
client:
timeout: 5s
conditions:
@@ -90,7 +104,7 @@ endpoints:
- "[CERTIFICATE_EXPIRATION] > 48h"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "mail on port {{ port }}"
{% endfor %}
@@ -100,13 +114,13 @@ endpoints:
- name: "healthcheck {{ user }} pub {{ healthcheck }} 200"
group: "{{ user }}_pub"
url: "{{ healthcheck }}"
- interval: 1m
+ interval: 2m
conditions:
- "[STATUS] == 200"
- "[CERTIFICATE_EXPIRATION] > 240h"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "healthcheck {{ user }} pub {{ healthcheck }} 200"
{% endfor %}
@@ -116,13 +130,13 @@ endpoints:
client:
dns-resolver: "tcp://{{ m.gateway }}:53"
group: "{{ user }}_priv"
- interval: 1m
+ interval: 2m
conditions:
- "[STATUS] == 200"
- "[CERTIFICATE_EXPIRATION] > 240h"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "{{ healthcheck }} priv healthcheck {{ user }}"
@@ -131,12 +145,12 @@ endpoints:
url: "{{ healthcheck }}"
client:
dns-resolver: "tcp://{{ public_resolver }}:53"
- interval: 1m
+ interval: 2m
conditions:
- "[STATUS] == 403"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "{{ healthcheck }} pub healthcheck {{ user }} 403"
{% endfor %}
@@ -144,7 +158,7 @@ endpoints:
- name: "DNS Check [{{ record.name }}_{{ record.type }}]"
group: "{{ user }}_dns_private"
url: "{{ m.gateway }}"
- interval: 5m
+ interval: 2m
dns:
query-name: "{{ record.name }}"
query-type: "{{ record.type }}"
@@ -153,18 +167,19 @@ endpoints:
- "[DNS_RCODE] == NOERROR"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "DNS {{ record.name }}_{{ record.type }}"
- name: "PING {{ record.name }}_{{ record.type }}"
group: "{{ user }}_dns_private"
url: "icmp://{{ record.name }}"
+ interval: 2m
conditions:
- "[CONNECTED] == true"
alerts:
- type: ntfy
- failure-threshold: 1
+ failure-threshold: 3
send-on-resolved: true
description: "PING {{ record.name }}"
{% endfor %}
diff --git a/playbooks/roles/mon/templates/volumes/grafana/config.ini b/playbooks/roles/mon/templates/volumes/grafana/config.ini
new file mode 100644
index 0000000..e0371ea
--- /dev/null
+++ b/playbooks/roles/mon/templates/volumes/grafana/config.ini
@@ -0,0 +1,28 @@
+[date_formats]
+full_date = YYYY-MM-DD @ HH:mm:ss
+interval_second = HH:mm:ss
+interval_minute = HH:mm
+interval_hour = DD.MM. HH:mm
+interval_day = DD.MM.
+interval_month = MM-YYYY
+interval_year = YYYY
+
+[server]
+root_url = https://{{ grafana_domain }}
+
+[auth.generic_oauth]
+enabled = true
+name = liz.coffee <3
+icon = signin
+client_id = grafana
+client_secret = {{ grafana_secret }}
+scopes = openid profile email groups
+empty_scopes = false
+auth_url="https://{{ idm_domain }}/ui/oauth2"
+token_url="https://{{ idm_domain }}/oauth2/token"
+api_url="https://{{ idm_domain }}/oauth2/openid/grafana/userinfo"
+login_attribute_path = preferred_username
+groups_attribute_path = groups
+name_attribute_path = name
+role_attribute_path = contains(groups, 'grafana_admins@idm.liz.coffee') && 'Admin' || 'Viewer'
+use_pkce = true
diff --git a/playbooks/roles/portainer/templates/volumes/data/.gitkeep b/playbooks/roles/mon/templates/volumes/grafana/data/.gitkeep
index e69de29..e69de29 100644
--- a/playbooks/roles/portainer/templates/volumes/data/.gitkeep
+++ b/playbooks/roles/mon/templates/volumes/grafana/data/.gitkeep
diff --git a/playbooks/roles/mon/templates/volumes/prometheus/config.yml b/playbooks/roles/mon/templates/volumes/prometheus/config.yml
index be59f7f..7476367 100644
--- a/playbooks/roles/mon/templates/volumes/prometheus/config.yml
+++ b/playbooks/roles/mon/templates/volumes/prometheus/config.yml
@@ -1,5 +1,5 @@
global:
- scrape_interval: 20s
+ scrape_interval: 30s
scrape_configs:
- job_name: prometheus
@@ -22,6 +22,13 @@ scrape_configs:
- targets:
- traefik_traefik:5577
+ - job_name: proxmox
+ static_configs:
+ - targets:
+ - piplup.liz.coffee:9001
+ - togepi.liz.coffee:9001
+ - roton.liz.coffee:9001
+
- job_name: headscale
static_configs:
- targets:
diff --git a/playbooks/roles/outbound/templates/proxy/nginx/conf.d/graph.conf b/playbooks/roles/outbound/templates/proxy/nginx/conf.d/graph.conf
new file mode 100644
index 0000000..6c74f8e
--- /dev/null
+++ b/playbooks/roles/outbound/templates/proxy/nginx/conf.d/graph.conf
@@ -0,0 +1,19 @@
+server {
+ listen 80;
+ server_name graph.liz.coffee;
+
+ real_ip_header X-Forwarded-For;
+ real_ip_recursive on;
+ set_real_ip_from {{ docker_network }};
+
+ location / {
+ proxy_pass https://{{ loadbalancer_ip }};
+ proxy_ssl_verify off;
+ proxy_http_version 1.1;
+ proxy_set_header Host $host;
+ proxy_set_header X-Real-IP $remote_addr;
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+ proxy_set_header Upgrade $http_upgrade;
+ proxy_set_header Connection "upgrade";
+ }
+}
diff --git a/playbooks/roles/portainer/tasks/main.yml b/playbooks/roles/portainer/tasks/main.yml
deleted file mode 100644
index 7f26a5f..0000000
--- a/playbooks/roles/portainer/tasks/main.yml
+++ /dev/null
@@ -1,9 +0,0 @@
----
-
-- name: Deploy portainer
- ansible.builtin.import_tasks: manage-docker-swarm-service.yml
- vars:
- service_name: portainer
- template_render_dir: "../templates"
- service_destination_dir: "{{ portainer_base }}"
-
diff --git a/playbooks/roles/portainer/templates/stacks/docker-compose.yml b/playbooks/roles/portainer/templates/stacks/docker-compose.yml
deleted file mode 100644
index 5f28e5a..0000000
--- a/playbooks/roles/portainer/templates/stacks/docker-compose.yml
+++ /dev/null
@@ -1,52 +0,0 @@
-version: '3.2'
-
-services:
- agent:
- image: portainer/agent:2.21.5
- volumes:
- - /var/run/docker.sock:/var/run/docker.sock
- - /var/lib/docker/volumes:/var/lib/docker/volumes
- networks:
- - agent_network
- deploy:
- mode: global
- placement:
- constraints: [node.platform.os == linux]
- environment:
- - TZ={{ timezone }}
- - DEPLOYMENT_TIME={{ deployment_time }}
-
- portainer:
- image: portainer/portainer-ce:alpine
- command: -H tcp://tasks.agent:9001 --tlsskipverify
- ports:
- - "8000:8000"
- volumes:
- - /var/run/docker.sock:/var/run/docker.sock
- - {{ portainer_base }}/volumes/data:/data
- environment:
- - TZ={{ timezone }}
- - DEPLOYMENT_TIME={{ deployment_time }}
- networks:
- - proxy
- - agent_network
- deploy:
- mode: replicated
- replicas: 1
- placement:
- constraints: [node.role == manager]
- labels:
- - traefik.enable=true
- - traefik.swarm.network=proxy
- - traefik.http.routers.portainer.rule=Host(`{{ portainer_host }}`)
- - traefik.http.routers.portainer.entrypoints=websecure
- - traefik.http.routers.portainer.tls=true
- - traefik.http.routers.portainer.tls.certResolver=letsencrypt
- - traefik.http.services.portainer.loadbalancer.server.port=9000
-
-networks:
- proxy:
- external: true
- agent_network:
- driver: overlay
- attachable: true
diff --git a/playbooks/roles/traefik/templates/stacks/traefik.yml b/playbooks/roles/traefik/templates/stacks/traefik.yml
index 68235e4..8caa379 100644
--- a/playbooks/roles/traefik/templates/stacks/traefik.yml
+++ b/playbooks/roles/traefik/templates/stacks/traefik.yml
@@ -47,5 +47,8 @@ certificatesResolvers:
# caServer: https://acme-staging-v02.api.letsencrypt.org/directory # staging
dnsChallenge:
provider: cloudflare
+ resolvers:
+ - 1.0.0.1
+ - 1.1.1.1
propagation:
delayBeforeChecks: 12s