diff options
author | Elizabeth Hunt <me@liz.coffee> | 2025-09-07 22:14:12 -0700 |
---|---|---|
committer | Elizabeth Hunt <me@liz.coffee> | 2025-09-07 22:14:12 -0700 |
commit | b0c0189066a486f9da4a048c7780dbc640031d27 (patch) | |
tree | 6ad5997b7ce9e3d4a0232071f103622aca2395ca /playbooks/roles | |
parent | 9859fc0ee7503d853b5d028f0594b6f885d27dc3 (diff) | |
download | infra-b0c0189066a486f9da4a048c7780dbc640031d27.tar.gz infra-b0c0189066a486f9da4a048c7780dbc640031d27.zip |
Remove portainer and add some monitoring updates
Diffstat (limited to 'playbooks/roles')
-rw-r--r-- | playbooks/roles/mon/templates/stacks/docker-compose.yml | 29 | ||||
-rw-r--r-- | playbooks/roles/mon/templates/volumes/gatus/config/config.yml | 45 | ||||
-rw-r--r-- | playbooks/roles/mon/templates/volumes/grafana/config.ini | 28 | ||||
-rw-r--r-- | playbooks/roles/mon/templates/volumes/grafana/data/.gitkeep (renamed from playbooks/roles/portainer/templates/volumes/data/.gitkeep) | 0 | ||||
-rw-r--r-- | playbooks/roles/mon/templates/volumes/prometheus/config.yml | 9 | ||||
-rw-r--r-- | playbooks/roles/outbound/templates/proxy/nginx/conf.d/graph.conf | 19 | ||||
-rw-r--r-- | playbooks/roles/portainer/tasks/main.yml | 9 | ||||
-rw-r--r-- | playbooks/roles/portainer/templates/stacks/docker-compose.yml | 52 | ||||
-rw-r--r-- | playbooks/roles/traefik/templates/stacks/traefik.yml | 3 |
9 files changed, 117 insertions, 77 deletions
diff --git a/playbooks/roles/mon/templates/stacks/docker-compose.yml b/playbooks/roles/mon/templates/stacks/docker-compose.yml index 31a5932..31fe0c1 100644 --- a/playbooks/roles/mon/templates/stacks/docker-compose.yml +++ b/playbooks/roles/mon/templates/stacks/docker-compose.yml @@ -47,11 +47,40 @@ services: labels: - traefik.enable=false + grafana: + image: grafana/grafana:latest + environment: + - TZ={{ timezone }} + - DEPLOYMENT_TIME={{ deployment_time }} + volumes: + - "{{ mon_base }}/volumes/grafana/data:/var/lib/grafana" + - "{{ mon_base }}/volumes/grafana/config.ini:/etc/grafana/grafana.ini:ro" + networks: + - monint + - proxy + - metrics + deploy: + mode: replicated + replicas: 1 + update_config: + parallelism: 1 + order: start-first + failure_action: rollback + labels: + - traefik.enable=true + - traefik.swarm.network=proxy + - traefik.http.routers.grafana.tls=true + - traefik.http.routers.grafana.tls.certResolver=letsencrypt + - traefik.http.routers.grafana.rule=Host(`{{ grafana_domain }}`) + - traefik.http.routers.grafana.entrypoints=websecure + - traefik.http.services.grafana.loadbalancer.server.port=3000 + prometheus: image: prom/prometheus:latest volumes: - "{{ mon_base }}/volumes/prometheus/config.yml:/etc/prometheus/prometheus.yml" networks: + - monint - proxy - metrics environment: diff --git a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml index 403df4a..e5fcf73 100644 --- a/playbooks/roles/mon/templates/volumes/gatus/config/config.yml +++ b/playbooks/roles/mon/templates/volumes/gatus/config/config.yml @@ -19,13 +19,13 @@ endpoints: - "[BODY] == pat(*OK*)" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "GlobalHealthCheck" - name: "LDAPS" url: "tls://{{ idm_domain }}:3636" - interval: 5m + interval: 2m client: timeout: 5s conditions: @@ -33,10 +33,24 @@ endpoints: - "[CERTIFICATE_EXPIRATION] > 48h" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "LDAPS" + - name: "ssh for git" + url: "tls://src.{{ domain }}:2222" + interval: 2m + client: + timeout: 5s + conditions: + - "[CONNECTED] == true" + - "[CERTIFICATE_EXPIRATION] > 48h" + alerts: + - type: ntfy + failure-threshold: 3 + send-on-resolved: true + description: "ssh for git" + {% for test in email_tests %} {% set from_account = (email_accounts | selectattr("email", "equalto", test.from) | list).0 %} {% set to_account = (email_accounts | selectattr("email", "equalto", test.to) | list).0 %} @@ -73,7 +87,7 @@ endpoints: - "[BODY] == pat(*ok*)" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "mail {{ test.name }}" {% endfor %} @@ -82,7 +96,7 @@ endpoints: - name: "mail on port {{ port }}" group: "mail" url: "tls://{{ mail_domain }}:{{ port }}" - interval: 5m + interval: 2m client: timeout: 5s conditions: @@ -90,7 +104,7 @@ endpoints: - "[CERTIFICATE_EXPIRATION] > 48h" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "mail on port {{ port }}" {% endfor %} @@ -100,13 +114,13 @@ endpoints: - name: "healthcheck {{ user }} pub {{ healthcheck }} 200" group: "{{ user }}_pub" url: "{{ healthcheck }}" - interval: 1m + interval: 2m conditions: - "[STATUS] == 200" - "[CERTIFICATE_EXPIRATION] > 240h" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "healthcheck {{ user }} pub {{ healthcheck }} 200" {% endfor %} @@ -116,13 +130,13 @@ endpoints: client: dns-resolver: "tcp://{{ m.gateway }}:53" group: "{{ user }}_priv" - interval: 1m + interval: 2m conditions: - "[STATUS] == 200" - "[CERTIFICATE_EXPIRATION] > 240h" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "{{ healthcheck }} priv healthcheck {{ user }}" @@ -131,12 +145,12 @@ endpoints: url: "{{ healthcheck }}" client: dns-resolver: "tcp://{{ public_resolver }}:53" - interval: 1m + interval: 2m conditions: - "[STATUS] == 403" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "{{ healthcheck }} pub healthcheck {{ user }} 403" {% endfor %} @@ -144,7 +158,7 @@ endpoints: - name: "DNS Check [{{ record.name }}_{{ record.type }}]" group: "{{ user }}_dns_private" url: "{{ m.gateway }}" - interval: 5m + interval: 2m dns: query-name: "{{ record.name }}" query-type: "{{ record.type }}" @@ -153,18 +167,19 @@ endpoints: - "[DNS_RCODE] == NOERROR" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "DNS {{ record.name }}_{{ record.type }}" - name: "PING {{ record.name }}_{{ record.type }}" group: "{{ user }}_dns_private" url: "icmp://{{ record.name }}" + interval: 2m conditions: - "[CONNECTED] == true" alerts: - type: ntfy - failure-threshold: 1 + failure-threshold: 3 send-on-resolved: true description: "PING {{ record.name }}" {% endfor %} diff --git a/playbooks/roles/mon/templates/volumes/grafana/config.ini b/playbooks/roles/mon/templates/volumes/grafana/config.ini new file mode 100644 index 0000000..e0371ea --- /dev/null +++ b/playbooks/roles/mon/templates/volumes/grafana/config.ini @@ -0,0 +1,28 @@ +[date_formats] +full_date = YYYY-MM-DD @ HH:mm:ss +interval_second = HH:mm:ss +interval_minute = HH:mm +interval_hour = DD.MM. HH:mm +interval_day = DD.MM. +interval_month = MM-YYYY +interval_year = YYYY + +[server] +root_url = https://{{ grafana_domain }} + +[auth.generic_oauth] +enabled = true +name = liz.coffee <3 +icon = signin +client_id = grafana +client_secret = {{ grafana_secret }} +scopes = openid profile email groups +empty_scopes = false +auth_url="https://{{ idm_domain }}/ui/oauth2" +token_url="https://{{ idm_domain }}/oauth2/token" +api_url="https://{{ idm_domain }}/oauth2/openid/grafana/userinfo" +login_attribute_path = preferred_username +groups_attribute_path = groups +name_attribute_path = name +role_attribute_path = contains(groups, 'grafana_admins@idm.liz.coffee') && 'Admin' || 'Viewer' +use_pkce = true diff --git a/playbooks/roles/portainer/templates/volumes/data/.gitkeep b/playbooks/roles/mon/templates/volumes/grafana/data/.gitkeep index e69de29..e69de29 100644 --- a/playbooks/roles/portainer/templates/volumes/data/.gitkeep +++ b/playbooks/roles/mon/templates/volumes/grafana/data/.gitkeep diff --git a/playbooks/roles/mon/templates/volumes/prometheus/config.yml b/playbooks/roles/mon/templates/volumes/prometheus/config.yml index be59f7f..7476367 100644 --- a/playbooks/roles/mon/templates/volumes/prometheus/config.yml +++ b/playbooks/roles/mon/templates/volumes/prometheus/config.yml @@ -1,5 +1,5 @@ global: - scrape_interval: 20s + scrape_interval: 30s scrape_configs: - job_name: prometheus @@ -22,6 +22,13 @@ scrape_configs: - targets: - traefik_traefik:5577 + - job_name: proxmox + static_configs: + - targets: + - piplup.liz.coffee:9001 + - togepi.liz.coffee:9001 + - roton.liz.coffee:9001 + - job_name: headscale static_configs: - targets: diff --git a/playbooks/roles/outbound/templates/proxy/nginx/conf.d/graph.conf b/playbooks/roles/outbound/templates/proxy/nginx/conf.d/graph.conf new file mode 100644 index 0000000..6c74f8e --- /dev/null +++ b/playbooks/roles/outbound/templates/proxy/nginx/conf.d/graph.conf @@ -0,0 +1,19 @@ +server { + listen 80; + server_name graph.liz.coffee; + + real_ip_header X-Forwarded-For; + real_ip_recursive on; + set_real_ip_from {{ docker_network }}; + + location / { + proxy_pass https://{{ loadbalancer_ip }}; + proxy_ssl_verify off; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } +} diff --git a/playbooks/roles/portainer/tasks/main.yml b/playbooks/roles/portainer/tasks/main.yml deleted file mode 100644 index 7f26a5f..0000000 --- a/playbooks/roles/portainer/tasks/main.yml +++ /dev/null @@ -1,9 +0,0 @@ ---- - -- name: Deploy portainer - ansible.builtin.import_tasks: manage-docker-swarm-service.yml - vars: - service_name: portainer - template_render_dir: "../templates" - service_destination_dir: "{{ portainer_base }}" - diff --git a/playbooks/roles/portainer/templates/stacks/docker-compose.yml b/playbooks/roles/portainer/templates/stacks/docker-compose.yml deleted file mode 100644 index 5f28e5a..0000000 --- a/playbooks/roles/portainer/templates/stacks/docker-compose.yml +++ /dev/null @@ -1,52 +0,0 @@ -version: '3.2' - -services: - agent: - image: portainer/agent:2.21.5 - volumes: - - /var/run/docker.sock:/var/run/docker.sock - - /var/lib/docker/volumes:/var/lib/docker/volumes - networks: - - agent_network - deploy: - mode: global - placement: - constraints: [node.platform.os == linux] - environment: - - TZ={{ timezone }} - - DEPLOYMENT_TIME={{ deployment_time }} - - portainer: - image: portainer/portainer-ce:alpine - command: -H tcp://tasks.agent:9001 --tlsskipverify - ports: - - "8000:8000" - volumes: - - /var/run/docker.sock:/var/run/docker.sock - - {{ portainer_base }}/volumes/data:/data - environment: - - TZ={{ timezone }} - - DEPLOYMENT_TIME={{ deployment_time }} - networks: - - proxy - - agent_network - deploy: - mode: replicated - replicas: 1 - placement: - constraints: [node.role == manager] - labels: - - traefik.enable=true - - traefik.swarm.network=proxy - - traefik.http.routers.portainer.rule=Host(`{{ portainer_host }}`) - - traefik.http.routers.portainer.entrypoints=websecure - - traefik.http.routers.portainer.tls=true - - traefik.http.routers.portainer.tls.certResolver=letsencrypt - - traefik.http.services.portainer.loadbalancer.server.port=9000 - -networks: - proxy: - external: true - agent_network: - driver: overlay - attachable: true diff --git a/playbooks/roles/traefik/templates/stacks/traefik.yml b/playbooks/roles/traefik/templates/stacks/traefik.yml index 68235e4..8caa379 100644 --- a/playbooks/roles/traefik/templates/stacks/traefik.yml +++ b/playbooks/roles/traefik/templates/stacks/traefik.yml @@ -47,5 +47,8 @@ certificatesResolvers: # caServer: https://acme-staging-v02.api.letsencrypt.org/directory # staging dnsChallenge: provider: cloudflare + resolvers: + - 1.0.0.1 + - 1.1.1.1 propagation: delayBeforeChecks: 12s |