diff options
| author | Elizabeth Hunt <me@liz.coffee> | 2025-10-04 16:25:00 -0700 |
|---|---|---|
| committer | Elizabeth Hunt <me@liz.coffee> | 2025-10-04 16:25:25 -0700 |
| commit | 0ba6199538478b22763cc4c768c775a9c20baac9 (patch) | |
| tree | e5f97e9d382b6a869e74b0ec37670fc4cc04770c | |
| parent | a59455fe4f0d06f85800117d7871496ff9fa916f (diff) | |
| download | infra-0ba6199538478b22763cc4c768c775a9c20baac9.tar.gz infra-0ba6199538478b22763cc4c768c775a9c20baac9.zip | |
Fix some stuff
| -rw-r--r-- | CLAUDE.md | 121 | ||||
| -rw-r--r-- | ansible.cfg | 1 | ||||
| -rw-r--r-- | group_vars/backup.yml | 2 | ||||
| -rw-r--r-- | playbooks/roles/backup/templates/stacks/docker-compose.yml | 2 | ||||
| -rw-r--r-- | playbooks/roles/backup/templates/volumes/scripts/backup.py | 37 | ||||
| -rw-r--r-- | secrets.txt | 1 | ||||
| -rw-r--r-- | tasks/copy-rendered-templates-recursive.yml | 14 |
7 files changed, 166 insertions, 12 deletions
diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..1213254 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,121 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Overview + +This is an Ansible-based infrastructure-as-code repository for managing the liz.coffee homelab infrastructure. It orchestrates deployment of services across a Docker Swarm cluster (3 nodes: swarm-one, swarm-two, swarm-three) and an outbound proxy server. + +## Architecture + +### Infrastructure Layout + +- **Swarm Cluster**: 3-node Docker Swarm cluster at 10.128.0.201-203 + - Primary services deployed as Docker Swarm stacks + - Shared Ceph storage mounted across all nodes + - Keepalived for high availability + - Traefik as the ingress controller with automatic TLS via Let's Encrypt + +- **Outbound Proxy**: External-facing NGINX reverse proxy (outbound-two.liz.coffee) + - Routes external traffic to internal services via the swarm loadbalancer + - Uses docker-compose instead of swarm stacks + +### Service Deployment Patterns + +Services fall into two deployment models: + +1. **Docker Swarm Services** (most services): Use `tasks/manage-docker-swarm-service.yml` + - Deployed via `docker stack deploy` + - Templates rendered from `playbooks/roles/{service}/templates/` + - Health checks and rolling updates configured in docker-compose.yml + - Traefik labels for automatic routing and TLS + +2. **Docker Compose Services** (nginx_proxy, outbound): Use `tasks/manage-docker-compose-service.yml` + - Deployed via systemd service `docker-compose@{service}` + - Supports rollout using docker-rollout tool for zero-downtime deployments + +### Common Task Files + +- `tasks/manage-docker-swarm-service.yml`: Renders templates and deploys swarm stack +- `tasks/manage-docker-compose-service.yml`: Renders templates, manages systemd service, performs rollouts +- `tasks/copy-rendered-templates-recursive.yml`: Copies Jinja2 templates to destination + +## Key Commands + +### Vault Management + +Initialize or update vault secrets: +```bash +./ansible-vault-init.sh [secret_name] +``` + +To avoid password prompts, store vault password in `secrets.pwd`: +```bash +echo "your_password" > secrets.pwd +``` + +### Deployment + +Full deployment (all services in order): +```bash +ansible-playbook -e @secrets.enc --vault-password-file secrets.pwd deploy.yml +``` + +Deploy a specific playbook during development: +```bash +ansible-playbook -e @secrets.enc --vault-password-file secrets.pwd playbooks/{service}.yml +``` + +### Linting + +```bash +yamllint --strict . +ansible-lint +``` + +### Creating New Services + +Use the `create.py` script to scaffold a new service: +```bash +./create.py --service-name myservice --container-image myimage:latest --service-port 8080 [--external] [--internal] +``` + +This generates: +- Ansible role in `playbooks/roles/{service}/` +- Docker compose template with Traefik labels +- Group vars in `group_vars/{service}.yml` +- Inventory entry and playbook hook in `deploy.yml` +- NGINX config (if `--external` specified) +- DNS records (Cloudflare if `--external`, LabDNS if `--internal`) + +## File Organization + +- `inventory`: Ansible inventory defining host groups and connection details +- `deploy.yml`: Master playbook importing all service playbooks in deployment order +- `playbooks/`: Individual service playbooks +- `playbooks/roles/`: Service-specific roles containing tasks and templates + - `{service}/tasks/main.yml`: Task entry point + - `{service}/templates/`: Jinja2 templates (docker-compose.yml, configs, etc.) +- `group_vars/`: Variables per service/host group +- `secrets.enc`: Ansible vault encrypted secrets +- `ansible.cfg`: Ansible configuration (inventory path, SSH settings) + +## Variable Conventions + +Each service typically defines in `group_vars/{service}.yml`: +- `{service}_domain`: FQDN for the service +- `{service}_base`: Base directory path on swarm nodes (usually under `{{ swarm_base }}`) + +Common variables available across all playbooks: +- `deployment_time`: Timestamp of deployment (forces container recreation) +- `timezone`: System timezone +- `homelab_build`: Boolean indicating local vs production deployment +- `loadbalancer_ip`: Internal VIP for the swarm cluster + +## Important Notes + +- Most services use swarm-one (10.128.0.201) as the deployment target in inventory +- Secrets are referenced as `{{ secret_name }}` from the vault +- All swarm services should connect to the `proxy` external network for Traefik routing +- Use `--resolve-image=always` in stack deploys to ensure latest images are pulled +- The outbound role manages NGINX configs in `playbooks/roles/outbound/templates/proxy/nginx/conf.d/` diff --git a/ansible.cfg b/ansible.cfg index 64b2591..dcb0621 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -1,4 +1,3 @@ [defaults] inventory = inventory host_key_checking = False -private_key_file = /tmp/key diff --git a/group_vars/backup.yml b/group_vars/backup.yml index 503e1ac..beba0e6 100644 --- a/group_vars/backup.yml +++ b/group_vars/backup.yml @@ -4,7 +4,7 @@ backup_domain: backup.{{ domain }} backup_base: "{{ swarm_base }}/backup" backup_retention_days: 10 ntfy_topic: "{{ ntfy_topic }}" -blocklist: "node_modules .git __pycache__ /oci/ /backup/ node_modules .git __pycache__ .cache tmp temp build dist .yarn .npm cache .logs logs" +blocklist: "__pycache__ /oci/volumes/ /backup/volumes/ /node_modules/ .cache tmp temp build dist .yarn .npm .logs logs /laminar/run/" # Borg backup settings borg_repo: "{{ backups_borg_repo }}" # Can be local path or remote (ssh://user@host/path) diff --git a/playbooks/roles/backup/templates/stacks/docker-compose.yml b/playbooks/roles/backup/templates/stacks/docker-compose.yml index 9089a8a..85b9c6c 100644 --- a/playbooks/roles/backup/templates/stacks/docker-compose.yml +++ b/playbooks/roles/backup/templates/stacks/docker-compose.yml @@ -4,7 +4,7 @@ services: volumes: - "{{ swarm_base }}:/mnt/source:ro" - "{{ backup_base }}/volumes/backups:/backups" - - "{{ backup_base }}/volumes/work:/work" + - "/var/backups/work:/work" - "{{ backup_base }}/volumes/scripts:/scripts" {% if borg_repo.startswith('ssh://') %} - "{{ backup_base }}/volumes/ssh:/root/.ssh:ro" diff --git a/playbooks/roles/backup/templates/volumes/scripts/backup.py b/playbooks/roles/backup/templates/volumes/scripts/backup.py index 8b71e0f..19dafed 100644 --- a/playbooks/roles/backup/templates/volumes/scripts/backup.py +++ b/playbooks/roles/backup/templates/volumes/scripts/backup.py @@ -64,7 +64,7 @@ class TaskMonitor: self.logger = logger self.active_tasks = {} self.task_lock = threading.Lock() - self.stop_monitoring = threading.Event() + self._stop_event = threading.Event() self.monitor_thread = None def start_task(self, task_id: str, description: str): @@ -87,21 +87,21 @@ class TaskMonitor: def start_monitoring(self): """Start the periodic status monitoring""" - self.stop_monitoring.clear() + self._stop_event.clear() self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True) self.monitor_thread.start() self.logger.info("📊 Task monitoring started") def stop_monitoring(self): """Stop the periodic status monitoring""" - self.stop_monitoring.set() + self._stop_event.set() if self.monitor_thread: self.monitor_thread.join() self.logger.info("📊 Task monitoring stopped") def _monitor_loop(self): """Periodic monitoring loop""" - while not self.stop_monitoring.wait(5.0): # Check every 5 seconds + while not self._stop_event.wait(5.0): # Check every 5 seconds with self.task_lock: if self.active_tasks: now = datetime.now() @@ -153,11 +153,31 @@ class BackupManager: self.logger.info(f"Cleaned work directory: {self.work_dir}") def _is_blocklisted(self, path: Path) -> bool: - """Check if a path is in the blocklist""" + """Check if a path is in the blocklist + + Supports multiple matching modes: + - Exact filename match: 'node_modules' matches any dir/file named node_modules + - Substring path match: '/docker/ci/' matches /mnt/ceph/docker/ci/volumes/ + - Path component match: checks if blocked string appears in the full path + """ path_str = str(path) + for blocked in self.blocklist: - if blocked in path_str or path.name == blocked: + # Exact filename/dirname match + if path.name == blocked: return True + + # Substring match in full path (handles partial paths like /docker/ci/) + if blocked in path_str: + return True + + # Also check against path components for more flexible matching + # e.g. 'ci' would match /path/to/ci/something + path_parts = path.parts + for part in path_parts: + if part == blocked: + return True + return False def _update_progress(self, files_copied: int, files_skipped: int, sqlite_found: Set[Path]): @@ -588,6 +608,11 @@ class BackupManager: ) if init_result.returncode != 0: + # Exit code 2 typically means repository already exists + if init_result.returncode == 2 and 'already exists' in init_result.stderr.lower(): + self.logger.info(f"Borg repository already exists: {borg_repo}") + return + self.logger.error(f"Failed to initialize Borg repository: {init_result.stderr}") raise subprocess.CalledProcessError(init_result.returncode, init_cmd) diff --git a/secrets.txt b/secrets.txt index 4e1f99c..8b959b6 100644 --- a/secrets.txt +++ b/secrets.txt @@ -36,6 +36,7 @@ lizc1_username lizc1_password lizc2_username lizc2_password +grafana_secret backups_borg_repo backups_borg_passphrase backups_borg_key diff --git a/tasks/copy-rendered-templates-recursive.yml b/tasks/copy-rendered-templates-recursive.yml index e3311df..b933d0a 100644 --- a/tasks/copy-rendered-templates-recursive.yml +++ b/tasks/copy-rendered-templates-recursive.yml @@ -75,12 +75,13 @@ delegate_to: localhost become: false ansible.builtin.command: - cmd: tar -czf {{ tempdir.path }}/rendered-files.tar.gz -C {{ tempdir.path }} . - creates: "{{ tempdir.path }}/rendered-files.tar.gz" + cmd: tar -czf /tmp/rendered-files-{{ tempdir.path | basename }}.tar.gz -C {{ tempdir.path }} . + register: tar_result + changed_when: tar_result.rc == 0 - name: Transfer tarball to remote host ansible.builtin.copy: - src: "{{ tempdir.path }}/rendered-files.tar.gz" + src: "/tmp/rendered-files-{{ tempdir.path | basename }}.tar.gz" dest: "/tmp/rendered-files.tar.gz" mode: '0644' @@ -106,6 +107,13 @@ path: "/tmp/rendered-files.tar.gz" state: absent +- name: Remove local tarball + delegate_to: localhost + become: false + ansible.builtin.file: + path: "/tmp/rendered-files-{{ tempdir.path | basename }}.tar.gz" + state: absent + - name: Remove local temporary directory delegate_to: localhost become: false |
