From 1d4db5bdd2f787974b35559dfece848dabddec85 Mon Sep 17 00:00:00 2001 From: Jonas Thelemann Date: Mon, 13 Apr 2026 01:32:41 +0200 Subject: [PATCH] perf(terraform)!: add --- .github/workflows/cd.yml | 56 ++++ .sops.yaml | 4 + secrets.example.yaml | 45 +++ .../alerting/alert-infrastructure.yaml | 217 ++++++++++++++ src/production/production.env.template | 4 +- src/production/production.yml | 120 +++++++- src/production/terraform/.gitignore | 6 + .../terraform/cloud-init/manager.yaml | 34 +++ .../terraform/cloud-init/worker.yaml | 4 + src/production/terraform/docs/provisioning.md | 196 +++++++++++++ src/production/terraform/docs/secrets.md | 156 ++++++++++ src/production/terraform/main.tf | 268 ++++++++++++++++++ src/production/terraform/outputs.tf | 19 ++ src/production/terraform/scripts/backup.sh | 67 +++++ .../terraform/scripts/create-secrets.sh | 35 +++ .../terraform/scripts/generate-env.sh | 61 ++++ src/production/terraform/scripts/restore.sh | 77 +++++ .../terraform/terraform.tfvars.enc.yaml | 6 + src/production/terraform/tf.sh | 44 +++ src/production/terraform/variables.tf | 51 ++++ src/production/terraform/versions.tf | 10 + 21 files changed, 1477 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/cd.yml create mode 100644 .sops.yaml create mode 100644 secrets.example.yaml create mode 100644 src/production/configurations/grafana/provisioning/alerting/alert-infrastructure.yaml create mode 100644 src/production/terraform/.gitignore create mode 100644 src/production/terraform/cloud-init/manager.yaml create mode 100644 src/production/terraform/cloud-init/worker.yaml create mode 100644 src/production/terraform/docs/provisioning.md create mode 100644 src/production/terraform/docs/secrets.md create mode 100644 src/production/terraform/main.tf create mode 100644 src/production/terraform/outputs.tf create mode 100755 src/production/terraform/scripts/backup.sh create mode 100755 src/production/terraform/scripts/create-secrets.sh create mode 100755 
src/production/terraform/scripts/generate-env.sh create mode 100755 src/production/terraform/scripts/restore.sh create mode 100644 src/production/terraform/terraform.tfvars.enc.yaml create mode 100755 src/production/terraform/tf.sh create mode 100644 src/production/terraform/variables.tf create mode 100644 src/production/terraform/versions.tf diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml new file mode 100644 index 00000000..ef11c0f9 --- /dev/null +++ b/.github/workflows/cd.yml @@ -0,0 +1,56 @@ +name: CD + +permissions: {} + +on: + release: + types: [published] + +jobs: + deploy: + name: Deploy + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Check for major version upgrade + run: | + VERSION="${{ github.event.release.tag_name }}" + VERSION="${VERSION#v}" + MAJOR="${VERSION%%.*}" + + if [ -z "${{ vars.DEPLOYED_MAJOR_VERSION }}" ]; then + echo "::error::DEPLOYED_MAJOR_VERSION repository variable is not set." + exit 1 + fi + + if [ "$MAJOR" != "${{ vars.DEPLOYED_MAJOR_VERSION }}" ]; then + echo "::error::Major version upgrade detected (${{ vars.DEPLOYED_MAJOR_VERSION }} -> $MAJOR). Manual deployment required." + exit 1 + fi + + - name: Deploy to manager + env: + DEPLOY_TAG: ${{ github.event.release.tag_name }} + MANAGER_IPV6: ${{ secrets.MANAGER_IPV6 }} + SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }} + run: | + mkdir -p ~/.ssh + echo "$SSH_PRIVATE_KEY" > ~/.ssh/deploy_key + chmod 600 ~/.ssh/deploy_key + # TOFU: For stronger assurance, store the manager's host key fingerprint + # as a GitHub secret and write it to known_hosts instead of scanning. + ssh-keyscan -H "$MANAGER_IPV6" >> ~/.ssh/known_hosts 2>/dev/null + + ssh -i ~/.ssh/deploy_key "root@$MANAGER_IPV6" bash -s -- "$(printf '%q' "$DEPLOY_TAG")" << 'DEPLOY' + set -euo pipefail + DEPLOY_TAG="$1" + cd /opt/vibetype + git fetch origin --tags + git checkout -- . 
&& git checkout "$DEPLOY_TAG" + bash src/production/terraform/scripts/create-secrets.sh + bash src/production/terraform/scripts/generate-env.sh + dargstack deploy -p "$DEPLOY_TAG" --offline + DEPLOY + + rm -f ~/.ssh/deploy_key diff --git a/.sops.yaml b/.sops.yaml new file mode 100644 index 00000000..c8f47e73 --- /dev/null +++ b/.sops.yaml @@ -0,0 +1,4 @@ +creation_rules: + - path_regex: (secrets\.enc\.yaml|terraform\.tfvars\.enc\.yaml)$ + # Replace with the age public key from `age-keygen` output (e.g. age1abc...) + age: "" diff --git a/secrets.example.yaml b/secrets.example.yaml new file mode 100644 index 00000000..13e6457c --- /dev/null +++ b/secrets.example.yaml @@ -0,0 +1,45 @@ +# Copy this file to secrets.enc.yaml, fill in the values, and encrypt: +# cp secrets.example.yaml secrets.enc.yaml +# sops -e -i secrets.enc.yaml +elasticsearch-keystore_password: "" +elasticsearch-password: "" +env_CLOUDFLARED_TUNNEL_TOKEN: "" +env_SENTRY_CRONS: "" +env_TRAEFIK_ACME_EMAIL: "" +grafana_admin_email: "" +grafana_admin_password: "" +grafana_admin_user: "" +grafana_discord_webhook: "" +jobber_aliases: "" +jobber_aws-bucket: "" +jobber_aws-configuration: "" +jobber_aws-credentials: "" +jobber_msmtprc: "" +portainer_admin-password: "" +postgraphile_connection: "" +postgraphile_jwt-secret: "" +postgraphile_owner-connection: "" +postgres-backup_db: "" +postgres_db: "" +postgres_password: "" +postgres_role_service_grafana_password: "" +postgres_role_service_grafana_username: "" +postgres_role_service_postgraphile_password: "" +postgres_role_service_postgraphile_username: "" +postgres_role_service_vibetype_password: "" +postgres_role_service_vibetype_username: "" +postgres_role_service_zammad_password: "" +postgres_role_service_zammad_username: "" +postgres_user: "" +reccoom_ingest-api-key: "" +reccoom_openai-api-key: "" +sqitch_target: "" +traefik_cf-dns-api-token: "" +traefik_cf-zone-api-token: "" +tusd_aws: "" +vibetype_api-notification-secret: "" +vibetype_aws-credentials: "" 
+vibetype_firebase-service-account-credentials: "" +vibetype_monday: "" +vibetype_openai-api-key: "" +vibetype_turnstile-key: "" diff --git a/src/production/configurations/grafana/provisioning/alerting/alert-infrastructure.yaml b/src/production/configurations/grafana/provisioning/alerting/alert-infrastructure.yaml new file mode 100644 index 00000000..8b7ca6a2 --- /dev/null +++ b/src/production/configurations/grafana/provisioning/alerting/alert-infrastructure.yaml @@ -0,0 +1,217 @@ +apiVersion: 1 +groups: + - orgId: 1 + name: Infrastructure + folder: Infrastructure + interval: 1m + rules: + - uid: alert-cpu-high + title: High CPU usage + condition: C + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: prometheus + model: + expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) + intervalMs: 15000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + expression: A + reducer: mean + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: CPU usage is above 85% for more than 5 minutes on {{ $labels.instance }}. 
+ labels: + severity: warning + isPaused: false + notification_settings: + receiver: 'Discord' + - uid: alert-memory-high + title: High memory usage + condition: C + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: prometheus + model: + expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 + intervalMs: 15000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + expression: A + reducer: mean + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 90 + type: gt + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: Memory usage is above 90% for more than 5 minutes on {{ $labels.instance }}. + labels: + severity: warning + isPaused: false + notification_settings: + receiver: 'Discord' + - uid: alert-disk-low + title: Low disk space + condition: C + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: prometheus + model: + expr: (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 + intervalMs: 15000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + expression: A + reducer: max + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 85 + type: gt + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + summary: Disk usage is above 85% on {{ $labels.instance }} ({{ $labels.mountpoint }}). 
+ labels: + severity: warning + isPaused: false + notification_settings: + receiver: 'Discord' + - uid: alert-container-restart + title: Container restart loop + condition: C + data: + - refId: A + relativeTimeRange: + from: 3600 + to: 0 + datasourceUid: prometheus + model: + expr: changes(container_start_time_seconds{name=~".+"}[1h]) + intervalMs: 15000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + expression: A + reducer: max + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 5 + type: gt + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 0s + annotations: + summary: Container {{ $labels.name }} has restarted more than 5 times in the last hour. + labels: + severity: critical + isPaused: false + notification_settings: + receiver: 'Discord' + - uid: alert-service-down + title: Service down + condition: C + data: + - refId: A + relativeTimeRange: + from: 300 + to: 0 + datasourceUid: prometheus + model: + expr: up == bool 0 + intervalMs: 15000 + maxDataPoints: 43200 + refId: A + - refId: B + datasourceUid: __expr__ + model: + expression: A + reducer: last + refId: B + type: reduce + - refId: C + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + expression: B + refId: C + type: threshold + noDataState: NoData + execErrState: Error + for: 2m + annotations: + summary: Prometheus target {{ $labels.instance }} (job {{ $labels.job }}) is down. 
+ labels: + severity: critical + isPaused: false + notification_settings: + receiver: 'Discord' diff --git a/src/production/production.env.template b/src/production/production.env.template index 8ffc032b..9035517a 100644 --- a/src/production/production.env.template +++ b/src/production/production.env.template @@ -1,5 +1,5 @@ CLOUDFLARED_TUNNEL_TOKEN= SENTRY_CRONS= -STACK_DOMAIN= +STACK_DOMAIN=vibetype.app TRAEFIK_ACME_EMAIL= -TRAEFIK_ACME_PROVIDER= \ No newline at end of file +TRAEFIK_ACME_PROVIDER=cloudflare \ No newline at end of file diff --git a/src/production/production.yml b/src/production/production.yml index ca734d3e..66cc0484 100644 --- a/src/production/production.yml +++ b/src/production/production.yml @@ -14,26 +14,65 @@ services: labels: - (( append )) - traefik.http.routers.adminer_secure.tls.certresolver=default + placement: + constraints: + - node.labels.role == manager cadvisor: deploy: labels: (( prune )) cloudflared: # You can configure the secure tunnel at [dash.cloudflare.com](https://dash.cloudflare.com/). 
command: tunnel run + deploy: + placement: + constraints: + - node.labels.role == manager environment: TUNNEL_TOKEN: ${CLOUDFLARED_TUNNEL_TOKEN} image: cloudflare/cloudflared + debezium: + deploy: + placement: + constraints: + - node.labels.role == worker + debezium-postgres-connector: + deploy: + placement: + constraints: + - node.labels.role == worker + elasticsearch: + deploy: + placement: + constraints: + - node.labels.role == worker + geoip: + deploy: + placement: + constraints: + - node.labels.role == worker grafana: deploy: labels: - (( append )) - traefik.http.routers.grafana_secure.tls.certresolver=default + placement: + constraints: + - node.labels.role == manager jobber: + deploy: + placement: + constraints: + - node.labels.role == worker environment: SENTRY_CRONS: ${SENTRY_CRONS} volumes: - (( append )) - ./configurations/jobber/sinks:/srv/sinks:ro + memcached: + deploy: + placement: + constraints: + - node.labels.role == worker # minio: (( prune )) # breaks renovate portainer: deploy: @@ -46,9 +85,21 @@ services: - (( append )) - traefik.http.routers.postgraphile.middlewares=postgraphile_auth,postgraphile_cors - traefik.http.routers.postgraphile_secure.tls.certresolver=default + placement: + constraints: + - node.labels.role == worker image: ghcr.io/maevsi/postgraphile:2.0.1 + postgres: + deploy: + placement: + constraints: + - node.labels.role == worker postgres_backup: # You cannot access the database backup directly. 
+ deploy: + placement: + constraints: + - node.labels.role == worker environment: POSTGRES_DB_FILE: /run/secrets/postgres-backup_db POSTGRES_HOST: postgres @@ -67,15 +118,44 @@ services: labels: - (( append )) - traefik.http.routers.prometheus_secure.tls.certresolver=default + placement: + constraints: + - node.labels.role == manager reccoom: - deploy: (( prune )) + deploy: + labels: (( prune )) + placement: + constraints: + - node.labels.role == worker image: ghcr.io/maevsi/reccoom:0.11.0 + reccoom_postgres: + deploy: + placement: + constraints: + - node.labels.role == worker + redis: + deploy: + placement: + constraints: + - node.labels.role == worker + redpanda: + deploy: + placement: + constraints: + - node.labels.role == worker redpanda-console: deploy: labels: - (( append )) - traefik.http.routers.redpanda_secure.tls.certresolver=default + placement: + constraints: + - node.labels.role == worker sqitch: + deploy: + placement: + constraints: + - node.labels.role == worker image: ghcr.io/maevsi/sqitch:11.1.0 volumes: (( prune )) traefik: @@ -96,6 +176,10 @@ services: - traefik_cf-zone-api-token traefik_certs-dumper: # You cannot access the reverse proxy's certificate helper directly. 
+ deploy: + placement: + constraints: + - node.labels.role == manager command: - file - --clean=false @@ -116,12 +200,18 @@ services: - (( append )) - traefik.http.routers.tusd.middlewares=tusd_cors - traefik.http.routers.tusd_secure.tls.certresolver=default + placement: + constraints: + - node.labels.role == worker vibetype: deploy: labels: - (( append )) - traefik.http.routers.vibetype.middlewares=vibetype_cors,vibetype_redirectregex - traefik.http.routers.vibetype_secure.tls.certresolver=default + placement: + constraints: + - node.labels.role == worker image: ghcr.io/maevsi/vibetype:14.1.1 user: (( prune )) # vibetype_beta: @@ -166,11 +256,39 @@ services: # target: /run/environment-variables/PGUSER # volumes: # - ./configurations/postgraphile/jwtES256.key.pub:/run/environment-variables/NUXT_PUBLIC_VIO_AUTH_JWT_PUBLIC_KEY:ro + zammad-backup: + deploy: + placement: + constraints: + - node.labels.role == worker + zammad-init: + deploy: + placement: + constraints: + - node.labels.role == worker zammad-nginx: deploy: labels: - (( append )) - traefik.http.routers.zammad_secure.tls.certresolver=default + placement: + constraints: + - node.labels.role == worker + zammad-railsserver: + deploy: + placement: + constraints: + - node.labels.role == worker + zammad-scheduler: + deploy: + placement: + constraints: + - node.labels.role == worker + zammad-websocket: + deploy: + placement: + constraints: + - node.labels.role == worker version: "3.7" volumes: acme_data: diff --git a/src/production/terraform/.gitignore b/src/production/terraform/.gitignore new file mode 100644 index 00000000..2058c5d4 --- /dev/null +++ b/src/production/terraform/.gitignore @@ -0,0 +1,6 @@ +# Terraform state and secrets +*.tfstate +*.tfstate.backup +*.tfvars +*.tfvars.json +.terraform/ diff --git a/src/production/terraform/cloud-init/manager.yaml b/src/production/terraform/cloud-init/manager.yaml new file mode 100644 index 00000000..0fb352da --- /dev/null +++ 
b/src/production/terraform/cloud-init/manager.yaml @@ -0,0 +1,34 @@ +#cloud-config + +packages: + - curl + - git + - jq + +runcmd: + # Install sops (with checksum verification) + - curl -fsSLo /tmp/sops "https://github.com/getsops/sops/releases/download/v3.9.4/sops-v3.9.4.linux.amd64" + - echo "5488e32bc471de7982ad895dd054bbab3ab91c417a118426134551e9626e4e85 /tmp/sops" | sha256sum -c + - install -m 755 /tmp/sops /usr/local/bin/sops + + # Install yq (with checksum verification) + - curl -fsSLo /tmp/yq "https://github.com/mikefarah/yq/releases/download/v4.45.4/yq_linux_amd64" + - echo "b96de04645707e14a12f52c37e6266832e03c29e95b9b139cddcae7314466e69 /tmp/yq" | sha256sum -c + - install -m 755 /tmp/yq /usr/local/bin/yq + + # Initialize Docker Swarm on private network + - docker swarm init --advertise-addr ${private_ip} + + # Save join token for the worker + - docker swarm join-token worker -q > /root/swarm-worker-token.txt + + # Label this node + - docker node update --label-add role=manager $(hostname) + + # Install dargstack (with checksum verification) + - curl -fsSLo /tmp/dargstack "https://raw.githubusercontent.com/dargstack/dargstack/refs/tags/3.0.0/src/dargstack" + - echo "311aaed8a9fb0f82167ed329564f55677baf9b9c65342da1e36463120395b0a1 /tmp/dargstack" | sha256sum -c + - install -m 755 /tmp/dargstack /usr/local/bin/dargstack + + # Clone the stack repository + - git clone ${stack_repo_url} /opt/vibetype diff --git a/src/production/terraform/cloud-init/worker.yaml b/src/production/terraform/cloud-init/worker.yaml new file mode 100644 index 00000000..c5eb0ee5 --- /dev/null +++ b/src/production/terraform/cloud-init/worker.yaml @@ -0,0 +1,4 @@ +#cloud-config + +# The worker node joins the Swarm via a Terraform provisioner on the manager +# after both servers are running. See main.tf for the orchestration. 
diff --git a/src/production/terraform/docs/provisioning.md b/src/production/terraform/docs/provisioning.md new file mode 100644 index 00000000..899f827e --- /dev/null +++ b/src/production/terraform/docs/provisioning.md @@ -0,0 +1,196 @@ +# Provisioning + +The infrastructure runs on [Hetzner Cloud](https://console.hetzner.cloud/) as a two-node Docker Swarm cluster, managed by [Terraform](https://developer.hashicorp.com/terraform/install). + +## Architecture + +| Node | Hostname | Private IP | Role | +|---|---|---|---| +| Manager | `vibetype-manager` | 10.0.1.1 | Swarm manager, monitoring, secrets | +| Worker | `vibetype-worker` | 10.0.1.2 | Application services | + +Both nodes use the Hetzner `docker-ce` image in the nbg1 location and are connected via a private network (`vibetype-swarm`, 10.0.0.0/16, subnet 10.0.1.0/24). The manager uses a CX23 (2 vCPU, 4 GB) and the worker uses a CX33 (4 vCPU, 8 GB) to handle the application workload. + +> **TODO:** After validating real-world memory usage via cAdvisor + Grafana, add Docker resource limits/reservations for memory-hungry worker services (PostgreSQL, Redpanda, Zammad, Debezium, Memcached, Redis). Elasticsearch already has limits (2560M limit, 1536M reservation). 
+ +## Terraform Resources + +| Resource | Name | Purpose | +|---|---|---| +| SSH key | `vibetype` | Injected from local `~/.ssh/id_ed25519.pub` | +| Network | `vibetype-swarm` | Private network (10.0.0.0/16) | +| Subnet | `vibetype-swarm` | 10.0.1.0/24 in eu-central | +| Firewall | `vibetype-swarm` | SSH, HTTP, HTTPS (public), Swarm ports 2377/7946/4789 (private) | +| Server | `vibetype-manager` | Swarm manager node | +| Server | `vibetype-worker` | Swarm worker node | +| terraform_data | `swarm_join` | Orchestrates Swarm join: waits for cloud-init, joins worker, labels nodes | +| terraform_data | `deploy` | Copies age key, creates secrets, generates env, deploys stack | + +## Terraform Variables + +| Variable | Description | Default | +|---|---|---| +| `age_secret_key` | Age private key for SOPS decryption (sensitive) | required | +| `hcloud_token` | Hetzner Cloud API token (sensitive) | required | +| `location` | Hetzner datacenter | `nbg1` | +| `server_type_manager` | Server type for the manager node | `cx23` | +| `server_type_worker` | Server type for the worker node | `cx33` | +| `ssh_public_key_path` | Path to SSH public key | `~/.ssh/id_ed25519.pub` | +| `ssh_source_ips` | CIDRs allowed to SSH into servers | required | +| `stack_repo_url` | Git URL of the stack repository | `https://github.com/maevsi/stack.git` | + +These are stored encrypted in `terraform.tfvars.enc.yaml` using SOPS and age. The `tf.sh` wrapper decrypts them before running Terraform. SOPS reads the age key from `~/.config/sops/age/keys.txt` automatically. + +> **Note:** `terraform.tfvars.enc.yaml` ships as a plaintext template with placeholder values. Before first use, fill in the real values and encrypt it with `sops -e -i terraform.tfvars.enc.yaml`. 
+ +## Running Terraform + +Before running Terraform for the first time, ensure: +- `.sops.yaml` contains your real age public key (not the placeholder) +- `secrets.enc.yaml` has been created and encrypted (see [Secrets Management](secrets.md)) +- `terraform.tfvars.enc.yaml` has been encrypted with `sops -e -i` + +```sh +./tf.sh init +./tf.sh plan +./tf.sh apply +``` + +The wrapper decrypts `terraform.tfvars.enc.yaml` into a temporary `terraform.tfvars.json` file, runs the requested Terraform command, and cleans up the decrypted file. + +## Automated Provisioning Flow + +Running `terraform apply` triggers the following: + +### Manager node (cloud-init) + +1. Installs `sops`, `yq`, and `dargstack` (with checksum verification) +2. Initializes Docker Swarm on the private network +3. Labels itself with `role=manager` +4. Clones the stack repository to `/opt/vibetype` + +### Worker node (Terraform provisioner) + +1. Waits for manager and worker cloud-init to complete +2. Populates a known_hosts file via `ssh-keyscan` for both nodes +3. Fetches the Swarm join token from the manager +4. Joins the worker to the Swarm +5. Labels the worker with `role=worker` + +### Deployment (Terraform provisioner) + +1. Copies the age private key to `/root/.config/sops/age/keys.txt` on the manager (via SSH, never stored in Terraform state) +2. Runs `scripts/create-secrets.sh` to create all Docker Swarm secrets from `secrets.enc.yaml` +3. Runs `scripts/generate-env.sh` to generate `src/production/production.env` from the template and encrypted secrets +4. Deploys the full stack using `dargstack deploy -p latest --offline` + +## Placement Constraints + +| Placement | Services | +|---|---| +| `node.labels.role == manager` | adminer, cloudflared, grafana, prometheus | +| `node.labels.role == worker` | All application services (debezium, elasticsearch, jobber, postgraphile, postgres, reccoom, redis, vibetype, zammad, etc.) 
| +| `mode: global` (all nodes) | cadvisor, node-exporter, portainer-agent, traefik | + +## Production Environment + +The production environment is built from two layers, following the dargstack convention: + +- `src/development/stack.env`: Shared configuration values (committed to Git) +- `src/production/production.env.template`: Production-specific variables template (committed to Git) +- `src/production/production.env`: Generated from the template with secrets filled in by `scripts/generate-env.sh` (not committed) + +dargstack merges `development/stack.env` + `production/production.env` into `production/stack.env` at deploy time. For zero-touch and CD deployments, both files are sourced directly. + +Non-sensitive variables in `production.env.template` (pre-filled): + +| Variable | Description | +|---|---| +| `STACK_DOMAIN` | Production domain | +| `TRAEFIK_ACME_PROVIDER` | DNS provider for ACME | + +Sensitive variables filled from `secrets.enc.yaml` (prefixed with `env_`): + +| Variable | Description | +|---|---| +| `CLOUDFLARED_TUNNEL_TOKEN` | Cloudflare tunnel token | +| `SENTRY_CRONS` | Sentry cron monitoring URL | +| `TRAEFIK_ACME_EMAIL` | Email for Let's Encrypt certificates | + +## Deploying the Stack + +The stack is deployed automatically during provisioning and on every GitHub release via the CD workflow (`.github/workflows/cd.yml`). + +The CD workflow: +1. Triggers on every published GitHub release +2. Checks for major version upgrades (blocks deployment if the major version differs from `DEPLOYED_MAJOR_VERSION` repository variable) +3. SSHs to the manager node, checks out the release tag, creates any new secrets (existing secrets are skipped), regenerates the environment file, and redeploys the stack using dargstack + +Required GitHub secrets: +- `MANAGER_IPV6`: IPv6 address of the manager node +- `SSH_PRIVATE_KEY`: SSH private key for root access + +Required GitHub repository variable: +- `DEPLOYED_MAJOR_VERSION`: Current deployed major version (e.g. 
`1`). Update manually after a major version upgrade. + +For manual redeployment: + +```sh +ssh root@<manager-ipv6> +cd /opt/vibetype +bash src/production/terraform/scripts/create-secrets.sh +bash src/production/terraform/scripts/generate-env.sh +dargstack deploy -p <tag> +``` + +## Verifying Deployment + +```sh +# List all services and their status +docker stack services vibetype + +# Check service logs +docker service logs vibetype_traefik +docker service logs vibetype_grafana + +# Verify node placement +docker service ps vibetype_prometheus --format '{{.Node}}' # Should show vibetype-manager +docker service ps vibetype_vibetype --format '{{.Node}}' # Should show vibetype-worker +``` + +## Backup and Restore + +### Backup + +```sh +bash src/production/terraform/scripts/backup.sh [output-directory] +``` + +Creates SQL dumps of main PostgreSQL and Reccoom PostgreSQL. Defaults to `./backups/<timestamp>/`. The script automatically discovers which node runs each database service and SSHes to the correct node if the task is remote. + +### Restore + +```sh +bash src/production/terraform/scripts/restore.sh <backup-directory> +``` + +Restores SQL dumps from the specified directory into running PostgreSQL containers. Like backup, the script discovers the correct node for each database service automatically. Uses `psql -v ON_ERROR_STOP=1` to fail on the first SQL error. + +## Teardown + +```sh +# Backup databases first (runs from manager, SSHes to worker node for DB containers) +ssh root@<manager-ipv6> "cd /opt/vibetype && bash src/production/terraform/scripts/backup.sh" +scp -r root@<manager-ipv6>:/opt/vibetype/backups/<timestamp> ./backups/ + +# Destroy infrastructure +./tf.sh destroy +``` + +## Re-provisioning + +To rebuild the infrastructure from scratch: + +1. Run `./tf.sh destroy` +2. Run `./tf.sh apply` (cloud-init sets up Swarm and tools; Terraform provisioners handle secrets, environment, and deployment) +3. 
Run `src/production/terraform/scripts/restore.sh` to restore database backups (if applicable) diff --git a/src/production/terraform/docs/secrets.md b/src/production/terraform/docs/secrets.md new file mode 100644 index 00000000..4ef06dfc --- /dev/null +++ b/src/production/terraform/docs/secrets.md @@ -0,0 +1,156 @@ +# Secrets Management + +Secrets are managed using [SOPS](https://github.com/getsops/sops) with [age](https://github.com/FiloSottile/age) encryption. Encrypted secrets are committed to the repository. Only the age private key must be kept secret. + +## How It Works + +1. All Docker Swarm secrets are stored as key-value pairs in `secrets.enc.yaml` (encrypted, safe to commit publicly) +2. `.sops.yaml` in the repository root specifies the age public key used for encryption (must be updated with your real public key before encrypting) +3. The `scripts/create-secrets.sh` script decrypts the file and feeds each entry into `docker secret create` +4. The age private key is copied to the manager node by a Terraform provisioner at provisioning time (never stored in Terraform state) + +> **Important:** Before first provisioning, both `.sops.yaml` (with the real age public key) and `secrets.enc.yaml` (created and encrypted) must be committed to the repository. The Terraform `deploy` provisioner runs `create-secrets.sh` after cloud-init completes, which will fail if `secrets.enc.yaml` is missing from the cloned repo. + +## Tools + +| Tool | Purpose | Install | +|---|---|---| +| [age](https://github.com/FiloSottile/age) | Encryption keypair | `apt install age` | +| [SOPS](https://github.com/getsops/sops) | Encrypt/decrypt YAML values | [GitHub releases](https://github.com/getsops/sops/releases) | +| [yq](https://github.com/mikefarah/yq) | YAML processing | [GitHub releases](https://github.com/mikefarah/yq/releases) | + +## age Keypair + +The keypair is generated with: + +```sh +age-keygen -o ~/.config/sops/age/keys.txt +``` + +This outputs a public key (e.g. 
`age1abc...`) and stores the private key in the file. The public key is referenced in `.sops.yaml`: + +```yaml +creation_rules: + - path_regex: secrets\.enc\.yaml$ + age: "age1abc..." +``` + +## Encrypted Secrets File + +`secrets.enc.yaml` contains all Docker Swarm secrets. Keys are readable, values are encrypted. This file must be created and encrypted before first deployment: + +```sh +# Create from the provided template, fill in values, then encrypt +cp secrets.example.yaml secrets.enc.yaml +# Edit secrets.enc.yaml with your real values +sops -e -i secrets.enc.yaml +``` + +```yaml +postgres_password: ENC[AES256_GCM,data:abc123...,iv:...,tag:...] +grafana_admin_password: ENC[AES256_GCM,data:def456...,iv:...,tag:...] +``` + +The full list of secret keys matches the `secrets:` section in `src/development/stack.yml`: + +```yaml +elasticsearch-keystore_password: "" +elasticsearch-password: "" +grafana_admin_email: "" +grafana_admin_password: "" +grafana_admin_user: "" +grafana_discord_webhook: "" +jobber_aliases: "" +jobber_aws-bucket: "" +jobber_aws-configuration: "" +jobber_aws-credentials: "" +jobber_msmtprc: "" +portainer_admin-password: "" +postgraphile_connection: "" +postgraphile_jwt-secret: "" +postgraphile_owner-connection: "" +postgres-backup_db: "" +postgres_db: "" +postgres_password: "" +postgres_role_service_grafana_password: "" +postgres_role_service_grafana_username: "" +postgres_role_service_postgraphile_password: "" +postgres_role_service_postgraphile_username: "" +postgres_role_service_vibetype_password: "" +postgres_role_service_vibetype_username: "" +postgres_role_service_zammad_password: "" +postgres_role_service_zammad_username: "" +postgres_user: "" +reccoom_ingest-api-key: "" +reccoom_openai-api-key: "" +sqitch_target: "" +traefik_cf-dns-api-token: "" +traefik_cf-zone-api-token: "" +tusd_aws: "" +vibetype_api-notification-secret: "" +vibetype_aws-credentials: "" +vibetype_firebase-service-account-credentials: "" +vibetype_monday: "" 
+vibetype_openai-api-key: "" +vibetype_turnstile-key: "" +``` + +## Editing Secrets + +```sh +sops secrets.enc.yaml +``` + +Opens the decrypted file in `$EDITOR`. Re-encrypts automatically on save and close. + +## Creating Docker Swarm Secrets + +```sh +bash src/production/terraform/scripts/create-secrets.sh +``` + +Decrypts `secrets.enc.yaml` and runs `docker secret create` for each entry. Existing secrets are skipped. + +## Rotating a Secret + +Docker Swarm secrets are immutable. To rotate a secret, all services using it must be scaled down first: + +```sh +# 1. Edit the secret value +sops secrets.enc.yaml + +# 2. Scale down services that use the secret +docker service scale vibetype_postgres=0 + +# 3. Remove and recreate the secret +docker secret rm postgres_password +bash src/production/terraform/scripts/create-secrets.sh + +# 4. Scale services back up +docker service scale vibetype_postgres=1 +``` + +Alternatively, redeploy the entire stack (which removes and recreates all services): + +```sh +docker stack rm vibetype +# Wait for all services to fully stop +bash src/production/terraform/scripts/create-secrets.sh +dargstack deploy -p +``` + +## Where the Age Key Lives + +| Location | Purpose | +|---|---| +| `~/.config/sops/age/keys.txt` | Developer machine | +| `/root/.config/sops/age/keys.txt` | Manager node (copied by Terraform `deploy` provisioner) | +| `terraform.tfvars.enc.yaml` (`age_secret_key`) | Passed to Terraform, decrypted by `tf.sh` wrapper | + +## Security Notes + +- `terraform.tfvars.json` (decrypted) and `keys.txt` must never be committed (both are git-ignored) +- The age private key is copied to the manager via SSH stdin and never stored in Terraform state +- `.sops.yaml` contains only the public key and is safe to commit +- `secrets.enc.yaml` is encrypted and safe to commit to a public repository +- Only the age private key needs to be kept secret diff --git a/src/production/terraform/main.tf b/src/production/terraform/main.tf new file mode 
100644 index 00000000..47358507 --- /dev/null +++ b/src/production/terraform/main.tf @@ -0,0 +1,268 @@ +provider "hcloud" { + token = var.hcloud_token +} + +locals { + manager_private_ip = "10.0.1.1" + worker_private_ip = "10.0.1.2" +} + +resource "hcloud_ssh_key" "default" { + name = "vibetype" + public_key = file(pathexpand(var.ssh_public_key_path)) +} + +resource "hcloud_network" "swarm" { + ip_range = "10.0.0.0/16" + name = "vibetype-swarm" +} + +resource "hcloud_network_subnet" "swarm" { + ip_range = "10.0.1.0/24" + network_id = hcloud_network.swarm.id + network_zone = "eu-central" + type = "cloud" +} + +resource "hcloud_firewall" "swarm" { + name = "vibetype-swarm" + + # SSH (operator access) + rule { + description = "SSH" + direction = "in" + port = "22" + protocol = "tcp" + source_ips = var.ssh_source_ips + } + + # SSH (inter-node, for backup/restore scripts) + rule { + description = "SSH (private network)" + direction = "in" + port = "22" + protocol = "tcp" + source_ips = ["10.0.1.0/24"] + } + + # HTTP + rule { + description = "HTTP" + direction = "in" + port = "80" + protocol = "tcp" + source_ips = ["0.0.0.0/0", "::/0"] + } + + # HTTPS + rule { + description = "HTTPS" + direction = "in" + port = "443" + protocol = "tcp" + source_ips = ["0.0.0.0/0", "::/0"] + } + + # Docker Swarm management (private network) + rule { + description = "Swarm management" + direction = "in" + port = "2377" + protocol = "tcp" + source_ips = ["10.0.1.0/24"] + } + + # Docker Swarm node communication (private network) + rule { + description = "Swarm TCP communication" + direction = "in" + port = "7946" + protocol = "tcp" + source_ips = ["10.0.1.0/24"] + } + + rule { + description = "Swarm UDP communication" + direction = "in" + port = "7946" + protocol = "udp" + source_ips = ["10.0.1.0/24"] + } + + # Docker Swarm overlay network (private network) + rule { + description = "Swarm overlay network" + direction = "in" + port = "4789" + protocol = "udp" + source_ips = ["10.0.1.0/24"] + 
} +} + +resource "hcloud_server" "manager" { + firewall_ids = [hcloud_firewall.swarm.id] + image = "docker-ce" + location = var.location + name = "vibetype-manager" + server_type = var.server_type_manager + ssh_keys = [hcloud_ssh_key.default.id] + + public_net { + ipv4_enabled = false + ipv6_enabled = true + } + + user_data = templatefile("${path.module}/cloud-init/manager.yaml", { + private_ip = local.manager_private_ip + stack_repo_url = var.stack_repo_url + }) + + network { + network_id = hcloud_network.swarm.id + ip = local.manager_private_ip + } + + depends_on = [hcloud_network_subnet.swarm] +} + +resource "hcloud_server" "worker" { + firewall_ids = [hcloud_firewall.swarm.id] + image = "docker-ce" + location = var.location + name = "vibetype-worker" + server_type = var.server_type_worker + ssh_keys = [hcloud_ssh_key.default.id] + + public_net { + ipv4_enabled = false + ipv6_enabled = true + } + + user_data = templatefile("${path.module}/cloud-init/worker.yaml", {}) + + network { + network_id = hcloud_network.swarm.id + ip = local.worker_private_ip + } + + depends_on = [hcloud_network_subnet.swarm] +} + +resource "terraform_data" "swarm_join" { + depends_on = [hcloud_server.manager, hcloud_server.worker] + + triggers_replace = [ + hcloud_server.manager.id, + hcloud_server.worker.id, + ] + + # Wait for cloud-init and save the join token to a file. + provisioner "remote-exec" { + connection { + agent = true + host = hcloud_server.manager.ipv6_address + type = "ssh" + user = "root" + } + + inline = [ + "cloud-init status --wait", + "docker swarm join-token worker -q > /root/swarm-worker-token.txt", + ] + } + + # Join the worker to the Swarm using the operator's SSH agent. + provisioner "remote-exec" { + connection { + agent = true + host = hcloud_server.worker.ipv6_address + type = "ssh" + user = "root" + } + + inline = [ + "cloud-init status --wait", + ] + } + + # The worker's token must be fetched from the manager first. 
+  # Note: ssh-keyscan uses TOFU (trust on first use). For newly provisioned servers
+  # this is acceptable since the IPs are known and the firewall restricts SSH access.
+  # For higher assurance, pin expected host key fingerprints via a secure channel.
+  provisioner "local-exec" {
+    # bash required: the default /bin/sh (dash on Debian/Ubuntu) rejects `set -o pipefail`.
+    interpreter = ["/bin/bash", "-c"]
+    command = <<-EOT
+      set -euo pipefail
+      KNOWN_HOSTS=$(mktemp)
+      trap 'rm -f "$KNOWN_HOSTS"' EXIT
+      ssh-keyscan -H ${hcloud_server.manager.ipv6_address} >> "$KNOWN_HOSTS" 2>/dev/null
+      ssh-keyscan -H ${hcloud_server.worker.ipv6_address} >> "$KNOWN_HOSTS" 2>/dev/null
+      if [ ! -s "$KNOWN_HOSTS" ]; then
+        echo "Error: ssh-keyscan returned no host keys" >&2
+        exit 1
+      fi
+      TOKEN=$(ssh -o UserKnownHostsFile="$KNOWN_HOSTS" root@${hcloud_server.manager.ipv6_address} cat /root/swarm-worker-token.txt | tr -d '[:space:]')
+      ssh -o UserKnownHostsFile="$KNOWN_HOSTS" root@${hcloud_server.worker.ipv6_address} "docker swarm join --token '$TOKEN' ${local.manager_private_ip}:2377"
+    EOT
+  }
+  # Label the worker node.
+  provisioner "remote-exec" {
+    connection {
+      agent = true
+      host  = hcloud_server.manager.ipv6_address
+      type  = "ssh"
+      user  = "root"
+    }
+    inline = [
+      "docker node update --label-add role=worker vibetype-worker",
+    ]
+  }
+}
+
+resource "terraform_data" "deploy" {
+  depends_on = [terraform_data.swarm_join]
+
+  triggers_replace = [
+    hcloud_server.manager.id,
+    var.stack_repo_url,
+  ]
+
+  connection {
+    agent = true
+    host  = hcloud_server.manager.ipv6_address
+    type  = "ssh"
+    user  = "root"
+  }
+
+  # Copy the age private key to the manager (avoids embedding it in user_data/state).
+  provisioner "remote-exec" {
+    inline = [
+      "mkdir -p /root/.config/sops/age",
+    ]
+  }
+
+  provisioner "local-exec" {
+    environment = {
+      AGE_KEY_B64  = base64encode(var.age_secret_key)
+      MANAGER_HOST = hcloud_server.manager.ipv6_address
+    }
+    # bash required: the default /bin/sh (dash on Debian/Ubuntu) rejects `set -o pipefail`.
+    interpreter = ["/bin/bash", "-c"]
+    command = <<-EOT
+      set -euo pipefail
+      KNOWN_HOSTS=$(mktemp)
+      trap 'rm -f "$KNOWN_HOSTS"' EXIT
+      ssh-keyscan -H "$MANAGER_HOST" >> "$KNOWN_HOSTS" 2>/dev/null
+      printf '%s' "$AGE_KEY_B64" | ssh -o UserKnownHostsFile="$KNOWN_HOSTS" "root@$MANAGER_HOST" "base64 -d > /root/.config/sops/age/keys.txt && chmod 600 /root/.config/sops/age/keys.txt"
+    EOT
+  }
+  # Create Docker secrets, generate environment, and deploy.
+  provisioner "remote-exec" {
+    inline = [
+      "bash /opt/vibetype/src/production/terraform/scripts/create-secrets.sh",
+      "bash /opt/vibetype/src/production/terraform/scripts/generate-env.sh",
+      "cd /opt/vibetype && dargstack deploy -p latest --offline",
+    ]
+  }
+}
diff --git a/src/production/terraform/outputs.tf b/src/production/terraform/outputs.tf
new file mode 100644
index 00000000..45c61a2f
--- /dev/null
+++ b/src/production/terraform/outputs.tf
@@ -0,0 +1,19 @@
+output "manager_ipv6" {
+  description = "Public IPv6 address of the manager node."
+  value       = hcloud_server.manager.ipv6_address
+}
+
+output "manager_private_ip" {
+  description = "Private IP of the manager node."
+  value       = local.manager_private_ip
+}
+
+output "worker_ipv6" {
+  description = "Public IPv6 address of the worker node."
+  value       = hcloud_server.worker.ipv6_address
+}
+
+output "worker_private_ip" {
+  description = "Private IP of the worker node."
+  value       = local.worker_private_ip
+}
diff --git a/src/production/terraform/scripts/backup.sh b/src/production/terraform/scripts/backup.sh
new file mode 100755
index 00000000..aabd66a4
--- /dev/null
+++ b/src/production/terraform/scripts/backup.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cd "$(dirname "$0")/../../../.."
+ +BACKUP_DIR="${1:-./backups/$(date +%Y%m%d-%H%M%S)}" +umask 077 +mkdir -p "$BACKUP_DIR" + +echo "Backing up databases to $BACKUP_DIR..." + +# Find the node running a service task and execute a command there. +# If the task runs locally, uses docker exec directly. +# If the task runs on another node, SSHes to that node first. +run_on_task_node() { + local service_name="$1" + shift + + local node + node=$(docker service ps --filter desired-state=running --format '{{.Node}}' "vibetype_${service_name}" 2>/dev/null | head -1) + + if [[ -z "$node" ]]; then + echo " Warning: service vibetype_${service_name} has no running tasks" + return 1 + fi + + local container + if [[ "$node" == "$(hostname)" ]]; then + container=$(docker ps -q -f "label=com.docker.swarm.service.name=vibetype_${service_name}" 2>/dev/null | head -1) + if [[ -n "$container" ]]; then + docker exec "$container" "$@" + return 0 + fi + else + local node_addr + node_addr=$(docker node inspect --format '{{.Status.Addr}}' "$node") + ssh -o BatchMode=yes "root@${node_addr}" "docker exec \$(docker ps -q -f label=com.docker.swarm.service.name=vibetype_${service_name} | head -1) $(printf '%q ' "$@")" + return + fi + + echo " Warning: container for vibetype_${service_name} not found on $node" + return 1 +} + +failed=0 + +# Main PostgreSQL +echo " Dumping main PostgreSQL..." +if run_on_task_node postgres pg_dumpall -U postgres > "$BACKUP_DIR/postgres.sql"; then + echo " Saved: $BACKUP_DIR/postgres.sql" +else + echo " Warning: main PostgreSQL backup failed" + failed=1 +fi + +# Reccoom PostgreSQL +echo " Dumping Reccoom PostgreSQL..." 
+if run_on_task_node reccoom_postgres pg_dumpall -U postgres > "$BACKUP_DIR/reccoom-postgres.sql"; then + echo " Saved: $BACKUP_DIR/reccoom-postgres.sql" +else + echo " Warning: Reccoom PostgreSQL backup failed" + failed=1 +fi + +echo "Backup complete: $BACKUP_DIR" +ls -lh "$BACKUP_DIR" +exit "$failed" diff --git a/src/production/terraform/scripts/create-secrets.sh b/src/production/terraform/scripts/create-secrets.sh new file mode 100755 index 00000000..af117232 --- /dev/null +++ b/src/production/terraform/scripts/create-secrets.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "$0")/../../../.." + +if ! command -v sops &>/dev/null; then + echo "Error: sops is not installed" >&2 + exit 1 +fi + +if ! command -v yq &>/dev/null; then + echo "Error: yq is not installed" >&2 + exit 1 +fi + +SECRETS_FILE="${1:-secrets.enc.yaml}" + +if [[ ! -f "$SECRETS_FILE" ]]; then + echo "Error: $SECRETS_FILE not found" >&2 + exit 1 +fi + +echo "Decrypting $SECRETS_FILE and creating Docker secrets..." + +sops --decrypt "$SECRETS_FILE" | yq -r 'del(.sops) | to_entries[] | .key + " " + (.value | @base64)' | \ + while IFS=' ' read -r name b64value; do + if docker secret inspect "$name" >/dev/null 2>&1; then + echo " exists: $name" + else + printf '%s' "$b64value" | base64 -d | docker secret create "$name" - + echo " created: $name" + fi + done + +echo "Done." diff --git a/src/production/terraform/scripts/generate-env.sh b/src/production/terraform/scripts/generate-env.sh new file mode 100755 index 00000000..75276bdb --- /dev/null +++ b/src/production/terraform/scripts/generate-env.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Generates src/production/production.env from production.env.template and SOPS-encrypted secrets. +# Non-empty values from the template are kept as-is. +# Empty values are filled from secrets.enc.yaml entries with the "env_" prefix. +# SOPS reads the age key from ~/.config/sops/age/keys.txt automatically. 
+ +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +REPO_DIR="$(cd "$SCRIPT_DIR/../../../.." && pwd)" +SECRETS_FILE="$REPO_DIR/secrets.enc.yaml" +TEMPLATE_FILE="$REPO_DIR/src/production/production.env.template" +OUTPUT_FILE="$REPO_DIR/src/production/production.env" + +if ! command -v sops &>/dev/null; then + echo "Error: sops is not installed" >&2 + exit 1 +fi + +if ! command -v yq &>/dev/null; then + echo "Error: yq is not installed" >&2 + exit 1 +fi + +if [ ! -f "$SECRETS_FILE" ]; then + echo "Error: $SECRETS_FILE not found" >&2 + exit 1 +fi + +if [ ! -f "$TEMPLATE_FILE" ]; then + echo "Error: $TEMPLATE_FILE not found" >&2 + exit 1 +fi + +# Decrypt secrets into an associative array using base64 to handle special characters. +declare -A secrets +while IFS=' ' read -r key b64value; do + secrets["$key"]="$(printf '%s' "$b64value" | base64 -d)" +done < <(sops -d "$SECRETS_FILE" | yq -r 'to_entries[] | select(.key | test("^env_")) | (.key | sub("^env_"; "")) + " " + (.value | @base64)') + +# Build production.env from template, filling empty values from secrets. +umask 077 +: > "$OUTPUT_FILE" +while IFS= read -r line; do + # Skip empty lines and comments. + if [[ -z "$line" || "$line" == \#* ]]; then + echo "$line" >> "$OUTPUT_FILE" + continue + fi + + key="${line%%=*}" + value="${line#*=}" + + if [[ -z "$value" && -n "${secrets[$key]+x}" ]]; then + echo "${key}=${secrets[$key]}" >> "$OUTPUT_FILE" + else + echo "$line" >> "$OUTPUT_FILE" + fi +done < "$TEMPLATE_FILE" + +echo "Generated $OUTPUT_FILE with $(grep -c '.' "$OUTPUT_FILE") entries." diff --git a/src/production/terraform/scripts/restore.sh b/src/production/terraform/scripts/restore.sh new file mode 100755 index 00000000..c8c4cade --- /dev/null +++ b/src/production/terraform/scripts/restore.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd "$(dirname "$0")/../../../.." + +BACKUP_DIR="${1:?Usage: $0 }" + +if [[ ! 
-d "$BACKUP_DIR" ]]; then + echo "Error: $BACKUP_DIR does not exist" >&2 + exit 1 +fi + +echo "Restoring databases from $BACKUP_DIR..." + +# Find the node running a service task and execute a command there. +# If the task runs locally, uses docker exec directly. +# If the task runs on another node, SSHes to that node first. +run_on_task_node() { + local service_name="$1" + shift + + local node + node=$(docker service ps --filter desired-state=running --format '{{.Node}}' "vibetype_${service_name}" 2>/dev/null | head -1) + + if [[ -z "$node" ]]; then + echo " Error: service vibetype_${service_name} has no running tasks" >&2 + return 1 + fi + + local container + if [[ "$node" == "$(hostname)" ]]; then + container=$(docker ps -q -f "label=com.docker.swarm.service.name=vibetype_${service_name}" 2>/dev/null | head -1) + if [[ -n "$container" ]]; then + docker exec -i "$container" "$@" + return 0 + fi + else + local node_addr + node_addr=$(docker node inspect --format '{{.Status.Addr}}' "$node") + ssh -o BatchMode=yes "root@${node_addr}" "docker exec -i \$(docker ps -q -f label=com.docker.swarm.service.name=vibetype_${service_name} | head -1) $(printf '%q ' "$@")" + return + fi + + echo " Error: container for vibetype_${service_name} not found on $node" >&2 + return 1 +} + +failed=0 + +# Main PostgreSQL +if [[ -f "$BACKUP_DIR/postgres.sql" ]]; then + echo " Restoring main PostgreSQL..." + if run_on_task_node postgres psql -v ON_ERROR_STOP=1 -U postgres < "$BACKUP_DIR/postgres.sql"; then + echo " Restored: postgres.sql" + else + echo " Error: main PostgreSQL restore failed" >&2 + failed=1 + fi +else + echo " Skipping: postgres.sql not found in backup" +fi + +# Reccoom PostgreSQL +if [[ -f "$BACKUP_DIR/reccoom-postgres.sql" ]]; then + echo " Restoring Reccoom PostgreSQL..." 
+ if run_on_task_node reccoom_postgres psql -v ON_ERROR_STOP=1 -U postgres < "$BACKUP_DIR/reccoom-postgres.sql"; then + echo " Restored: reccoom-postgres.sql" + else + echo " Error: Reccoom PostgreSQL restore failed" >&2 + failed=1 + fi +else + echo " Skipping: reccoom-postgres.sql not found in backup" +fi + +echo "Restore complete." +exit "$failed" diff --git a/src/production/terraform/terraform.tfvars.enc.yaml b/src/production/terraform/terraform.tfvars.enc.yaml new file mode 100644 index 00000000..c25be556 --- /dev/null +++ b/src/production/terraform/terraform.tfvars.enc.yaml @@ -0,0 +1,6 @@ +# Encrypt this file with `sops -e -i terraform.tfvars.enc.yaml`. +# SOPS reads the age key from ~/.config/sops/age/keys.txt automatically. +age_secret_key: "" +hcloud_token: "" +ssh_source_ips: + - "/32" diff --git a/src/production/terraform/tf.sh b/src/production/terraform/tf.sh new file mode 100755 index 00000000..c1ba5aff --- /dev/null +++ b/src/production/terraform/tf.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Decrypts SOPS-encrypted variables and runs Terraform. +# SOPS reads the age key from ~/.config/sops/age/keys.txt automatically. + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +TFVARS_ENC="$SCRIPT_DIR/terraform.tfvars.enc.yaml" +TFVARS_JSON="$SCRIPT_DIR/terraform.tfvars.json" + +cleanup() { + rm -f "$TFVARS_JSON" +} +trap cleanup EXIT + +# Only decrypt and pass -var-file for subcommands that support it. +case "${1:-}" in + plan|apply|destroy|import|refresh|console) + if [ ! -f "$TFVARS_ENC" ]; then + echo "Error: $TFVARS_ENC not found" >&2 + exit 1 + fi + + if ! command -v sops &>/dev/null; then + echo "Error: sops is not installed" >&2 + exit 1 + fi + + if ! command -v yq &>/dev/null; then + echo "Error: yq is not installed" >&2 + exit 1 + fi + + # Decrypt SOPS YAML to JSON tfvars, stripping the sops metadata key. + # Create with restrictive permissions to protect decrypted secrets. 
+      umask 077
+      sops -d "$TFVARS_ENC" | yq -o=json 'del(.sops)' > "$TFVARS_JSON"
+      # -var-file must precede positional arguments (e.g. `import ADDR ID`);
+      # Terraform stops flag parsing at the first positional argument.
+      terraform -chdir="$SCRIPT_DIR" "$1" -var-file="$TFVARS_JSON" "${@:2}"
+      ;;
+    *)
+      terraform -chdir="$SCRIPT_DIR" "$@"
+      ;;
+esac
diff --git a/src/production/terraform/variables.tf b/src/production/terraform/variables.tf
new file mode 100644
index 00000000..0a1bf348
--- /dev/null
+++ b/src/production/terraform/variables.tf
@@ -0,0 +1,51 @@
+variable "age_secret_key" {
+  description = "age private key for SOPS decryption (placed on manager node)."
+  sensitive   = true
+  type        = string
+}
+
+variable "hcloud_token" {
+  description = "Hetzner Cloud API token."
+  sensitive   = true
+  type        = string
+}
+
+variable "location" {
+  default     = "nbg1"
+  description = "Hetzner Cloud location."
+  type        = string
+}
+
+variable "server_type_manager" {
+  default     = "cx23"
+  description = "Hetzner Cloud server type for the manager node."
+  type        = string
+}
+
+variable "server_type_worker" {
+  default     = "cx33"
+  description = "Hetzner Cloud server type for the worker node."
+  type        = string
+}
+
+variable "ssh_public_key_path" {
+  default     = "~/.ssh/id_ed25519.pub"
+  description = "Path to the SSH public key to inject into servers."
+  type        = string
+}
+
+variable "ssh_source_ips" {
+  description = "CIDRs allowed to SSH into the servers. Must be set explicitly (e.g. your operator IP or VPN range)."
+  type        = list(string)
+
+  validation {
+    condition     = length(var.ssh_source_ips) > 0
+    error_message = "ssh_source_ips must not be empty. Specify at least one CIDR (e.g. your operator IP)."
+  }
+}
+
+variable "stack_repo_url" {
+  default     = "https://github.com/maevsi/stack.git"
+  description = "Git URL of the stack repository to clone on the manager node."
+ type = string +} diff --git a/src/production/terraform/versions.tf b/src/production/terraform/versions.tf new file mode 100644 index 00000000..cb9c2b5b --- /dev/null +++ b/src/production/terraform/versions.tf @@ -0,0 +1,10 @@ +terraform { + required_version = ">= 1.4" + + required_providers { + hcloud = { + source = "hetznercloud/hcloud" + version = "~> 1.60" + } + } +}