diff --git a/.env.example b/.env.example index 3d1caef..510f500 100644 --- a/.env.example +++ b/.env.example @@ -1,13 +1,13 @@ -# CSF-Core Environment Configuration +# CSFX-Core Environment Configuration # Copy to .env and fill in values before running # JWT secret — generate with: openssl rand -hex 64 JWT_SECRET= # PostgreSQL -POSTGRES_USER=csf_user -POSTGRES_PASSWORD=csf-test-password -POSTGRES_DB=csf_core +POSTGRES_USER=csfx_user +POSTGRES_PASSWORD=csfx-test-password +POSTGRES_DB=csfx_core # Rust log level (trace, debug, info, warn, error) RUST_LOG=info @@ -15,12 +15,12 @@ RUST_LOG=info # Docker image registry (GHCR org name, lowercase) GHCR_ORG=local # Image version tag — use "dev" for local builds, semver for prod (e.g. 0.2.2) -CSF_VERSION=dev +CSFX_VERSION=dev # etcd auth — generate with: openssl rand -hex 32 ETCD_ROOT_PASSWORD= -ETCD_CSF_PASSWORD= +ETCD_CSFX_PASSWORD= -# GHCR read token for image digest verification (csf-updater) +# GHCR read token for image digest verification (csfx-updater) # generate at: https://github.com/settings/tokens — scope: read:packages GHCR_TOKEN= diff --git a/.github/workflows/README.md b/.github/workflows/README.md deleted file mode 100644 index 525978e..0000000 --- a/.github/workflows/README.md +++ /dev/null @@ -1,106 +0,0 @@ -# GitHub Actions Workflows - -## Übersicht - -### Release & Deployment Workflows - -#### `main-release.yml` (Haupt-Release-Pipeline) - -Läuft automatisch bei jedem Push auf `main`: - -1. **Semantic Release** - Erstellt neue Releases basierend auf Conventional Commits -2. **Docker Build Backend** - Baut und pusht Backend-Image nach ghcr.io -3. **Docker Build Frontend** - Baut und pusht Frontend-Image nach ghcr.io -4. 
**Summary** - Zeigt Übersicht aller Artefakte - -**Outputs:** - -- GitHub Release mit Binaries -- Docker Images: `ghcr.io/cs-foundry/csf-core-backend:latest` & `:version` -- Docker Images: `ghcr.io/cs-foundry/csf-core-frontend:latest` & `:version` - -#### `release.yml` (Wiederverwendbarer Release-Workflow) - -Wird von `main-release.yml` aufgerufen: - -- Führt Semantic Release aus -- Baut Backend-Binaries (Linux/macOS) -- Baut Frontend-Package -- Lädt alle Artefakte zum Release hoch - -#### `docker-build-manual.yml` (Manuelles Docker-Build) - -Manueller Workflow für Docker-Builds: - -- Auswahl: Backend, Frontend oder beides -- Eigene Versionsnummer angeben -- Erstellt Tags: `` und `manual-latest` - -### Weitere Workflows - -#### `beta-release.yml` - -Release-Pipeline für Beta-Versionen auf dem `beta` Branch - -#### `docker-build-push.yml` - -Legacy-Workflow für das vereinigte Backend+Frontend Image - -#### `build-artifacts.yml` - -Standalone-Workflow für Binary-Builds - -#### `lint.yml` - -Code-Quality-Checks (Rust, TypeScript, etc.) - -## Verwendung - -### Automatischer Release (main) - -```bash -git commit -m "feat: neue Feature" -git push origin main -# → Automatischer Release + Docker Images -``` - -### Manueller Docker-Build - -1. GitHub Actions → **Manual Docker Build** -2. **Run workflow** klicken -3. Version eingeben (z.B. `1.2.3`) -4. Target auswählen (backend/frontend/both) -5. 
**Run workflow** ausführen - -## Image-URLs - -Nach erfolgreichem Build sind die Images verfügbar unter: - -```bash -# Backend -ghcr.io/cs-foundry/csf-core-backend:latest -ghcr.io/cs-foundry/csf-core-backend: - -# Frontend -ghcr.io/cs-foundry/csf-core-frontend:latest -ghcr.io/cs-foundry/csf-core-frontend: -``` - -## Permissions - -Die Workflows benötigen folgende Permissions: - -- `contents: write` - Für Releases -- `packages: write` - Für Docker Registry -- `issues: write` - Für Issue-Updates -- `pull-requests: write` - Für PR-Updates - -## Secrets - -Keine zusätzlichen Secrets erforderlich - verwendet `GITHUB_TOKEN` automatisch. - -## Weitere Dokumentation - -- [Docker Registry Integration](../docs/deployment/DOCKER_REGISTRY.md) -- [NixOS Deployment](../docs/deployment/DEPLOYMENT.md) -- [Installation Guide](../docs/deployment/INSTALLATION.md) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 1909865..67d27e0 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -75,6 +75,7 @@ jobs: - volume-manager - failover-controller - sdn-controller + - patroni arch: - amd64 - arm64 @@ -85,6 +86,9 @@ jobs: - arch: arm64 runner: ubuntu-24.04-arm platform: linux/arm64 + - service: patroni + dockerfile: deployments/docker/patroni/Dockerfile + context: deployments/docker/patroni steps: - uses: actions/checkout@v4 @@ -92,7 +96,21 @@ jobs: id: image run: | ORG=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') - echo "name=ghcr.io/${ORG}/csf-ce-${{ matrix.service }}" >> $GITHUB_OUTPUT + echo "name=ghcr.io/${ORG}/csfx-ce-${{ matrix.service }}" >> $GITHUB_OUTPUT + + - name: Resolve Dockerfile and context + id: dockerctx + run: | + if [ -n "${{ matrix.dockerfile }}" ]; then + echo "file=${{ matrix.dockerfile }}" >> $GITHUB_OUTPUT + echo "context=${{ matrix.context }}" >> $GITHUB_OUTPUT + echo "build_args=" >> $GITHUB_OUTPUT + else + echo "file=control-plane/Dockerfile.prod.shared" >> 
$GITHUB_OUTPUT + echo "context=." >> $GITHUB_OUTPUT + printf 'build_args=SERVICE_BIN=%s\nBUILD_JOBS=2\nCSFX_BUILD_VERSION=%s\n' \ + "${{ matrix.service }}" "${{ needs.prepare.outputs.version }}" >> $GITHUB_OUTPUT + fi - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 @@ -108,12 +126,9 @@ jobs: id: build uses: docker/build-push-action@v6 with: - context: . - file: control-plane/Dockerfile.prod.shared - build-args: | - SERVICE_BIN=${{ matrix.service }} - BUILD_JOBS=2 - CSF_BUILD_VERSION=${{ needs.prepare.outputs.version }} + context: ${{ steps.dockerctx.outputs.context }} + file: ${{ steps.dockerctx.outputs.file }} + build-args: ${{ steps.dockerctx.outputs.build_args }} push: true outputs: type=registry,name=${{ steps.image.outputs.name }},push-by-digest=true platforms: ${{ matrix.platform }} @@ -141,8 +156,8 @@ jobs: fail-fast: false matrix: binary: - - csf-updater - - csf-agent + - csfx-updater + - csfx-agent arch: - amd64 - arm64 @@ -183,7 +198,7 @@ jobs: env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_LINKER: musl-gcc CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_LINKER: aarch64-linux-gnu-gcc - CSF_BUILD_VERSION: ${{ needs.prepare.outputs.version }} + CSFX_BUILD_VERSION: ${{ needs.prepare.outputs.version }} run: | cargo build --release --bin ${{ matrix.binary }} --target ${{ matrix.target }} cp target/${{ matrix.target }}/release/${{ matrix.binary }} ${{ matrix.binary }}-${{ matrix.arch }} @@ -211,12 +226,13 @@ jobs: - volume-manager - failover-controller - sdn-controller + - patroni steps: - name: Set image name id: image run: | ORG=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') - echo "name=ghcr.io/${ORG}/csf-ce-${{ matrix.service }}" >> $GITHUB_OUTPUT + echo "name=ghcr.io/${ORG}/csfx-ce-${{ matrix.service }}" >> $GITHUB_OUTPUT - uses: actions/download-artifact@v4 with: @@ -262,7 +278,7 @@ jobs: - uses: actions/download-artifact@v4 with: - pattern: csf-* + pattern: csfx-* merge-multiple: true - name: Create or update release @@ 
-271,32 +287,126 @@ jobs: TAG="v${VERSION}" if gh release view "${TAG}" &>/dev/null; then gh release upload "${TAG}" \ - csf-updater-amd64 \ - csf-updater-amd64.sha256 \ - csf-updater-arm64 \ - csf-updater-arm64.sha256 \ - csf-agent-amd64 \ - csf-agent-amd64.sha256 \ - csf-agent-arm64 \ - csf-agent-arm64.sha256 \ + csfx-updater-amd64 \ + csfx-updater-amd64.sha256 \ + csfx-updater-arm64 \ + csfx-updater-arm64.sha256 \ + csfx-agent-amd64 \ + csfx-agent-amd64.sha256 \ + csfx-agent-arm64 \ + csfx-agent-arm64.sha256 \ --clobber else gh release create "${TAG}" \ --title "v${VERSION}" \ --prerelease \ --notes "Alpha build ${VERSION}" \ - csf-updater-amd64 \ - csf-updater-amd64.sha256 \ - csf-updater-arm64 \ - csf-updater-arm64.sha256 \ - csf-agent-amd64 \ - csf-agent-amd64.sha256 \ - csf-agent-arm64 \ - csf-agent-arm64.sha256 + csfx-updater-amd64 \ + csfx-updater-amd64.sha256 \ + csfx-updater-arm64 \ + csfx-updater-arm64.sha256 \ + csfx-agent-amd64 \ + csfx-agent-amd64.sha256 \ + csfx-agent-arm64 \ + csfx-agent-arm64.sha256 fi env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + update-infra: + name: Update CSFX-Infra versions.nix + runs-on: ubuntu-latest + needs: [prepare, manifest, build-binaries, attach-binaries-release] + if: needs.prepare.outputs.is_release == 'true' + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ github.repository_owner }}/CSFX-Infra + token: ${{ secrets.INFRA_REPO_TOKEN }} + path: infra + + - uses: actions/download-artifact@v4 + with: + pattern: digest-* + path: /tmp/digests + merge-multiple: true + + - uses: actions/download-artifact@v4 + with: + pattern: csfx-agent-* + path: /tmp/binaries + merge-multiple: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Write versions.nix + run: | + VERSION="${{ needs.prepare.outputs.version }}" + ORG=$(echo 
"${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + REPO="${{ github.repository }}" + RELEASE_BASE="https://github.com/${REPO}/releases/download/v${VERSION}" + + get_manifest_digest() { + local svc=$1 + local image="ghcr.io/${ORG}/csfx-ce-${svc}:${VERSION}" + docker buildx imagetools inspect "${image}" \ + --format '{{json .Manifest}}' | jq -r '.digest' + } + + get_sha256() { + local binary=$1 + local arch=$2 + awk '{print $1}' /tmp/binaries/${binary}-${arch}.sha256 2>/dev/null + } + + cat > infra/versions.nix <> $GITHUB_STEP_SUMMARY echo "|---------|-------|" >> $GITHUB_STEP_SUMMARY for svc in api-gateway registry scheduler volume-manager failover-controller sdn-controller; do - echo "| ${svc} | \`ghcr.io/${ORG}/csf-ce-${svc}:${VERSION}\` |" >> $GITHUB_STEP_SUMMARY + echo "| ${svc} | \`ghcr.io/${ORG}/csfx-ce-${svc}:${VERSION}\` |" >> $GITHUB_STEP_SUMMARY done echo "" >> $GITHUB_STEP_SUMMARY echo "### Binaries" >> $GITHUB_STEP_SUMMARY echo "| Binary | Arch | Artifact |" >> $GITHUB_STEP_SUMMARY echo "|--------|------|----------|" >> $GITHUB_STEP_SUMMARY - for bin in csf-updater csf-agent; do + for bin in csfx-updater csfx-agent; do echo "| ${bin} | amd64 | \`${bin}-amd64\` |" >> $GITHUB_STEP_SUMMARY echo "| ${bin} | arm64 | \`${bin}-arm64\` |" >> $GITHUB_STEP_SUMMARY done diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml index 0cee68c..f3df0ab 100644 --- a/.github/workflows/prerelease.yml +++ b/.github/workflows/prerelease.yml @@ -74,7 +74,7 @@ jobs: id: image run: | ORG=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') - echo "name=ghcr.io/${ORG}/csf-ce-${{ matrix.service }}" >> $GITHUB_OUTPUT + echo "name=ghcr.io/${ORG}/csfx-ce-${{ matrix.service }}" >> $GITHUB_OUTPUT - uses: docker/setup-buildx-action@v3 @@ -130,7 +130,7 @@ jobs: id: image run: | ORG=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') - echo "name=ghcr.io/${ORG}/csf-ce-${{ matrix.service }}" >> $GITHUB_OUTPUT + echo 
"name=ghcr.io/${ORG}/csfx-ce-${{ matrix.service }}" >> $GITHUB_OUTPUT - uses: actions/download-artifact@v4 with: @@ -165,8 +165,8 @@ jobs: fail-fast: false matrix: binary: - - csf-updater - - csf-agent + - csfx-updater + - csfx-agent arch: - amd64 - arm64 @@ -207,7 +207,7 @@ jobs: env: CARGO_TARGET_X86_64_UNKNOWN_LINUX_MUSL_LINKER: musl-gcc CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_LINKER: aarch64-linux-gnu-gcc - CSF_BUILD_VERSION: ${{ needs.version.outputs.version }} + CSFX_BUILD_VERSION: ${{ needs.version.outputs.version }} run: | cargo build --release --bin ${{ matrix.binary }} --target ${{ matrix.target }} cp target/${{ matrix.target }}/release/${{ matrix.binary }} ${{ matrix.binary }}-${{ matrix.arch }} @@ -230,7 +230,7 @@ jobs: - uses: actions/download-artifact@v4 with: - pattern: csf-* + pattern: csfx-* merge-multiple: true - name: Create pre-release @@ -240,13 +240,101 @@ jobs: --title "v${VERSION}" \ --prerelease \ --generate-notes \ - csf-updater-amd64 \ - csf-updater-amd64.sha256 \ - csf-updater-arm64 \ - csf-updater-arm64.sha256 \ - csf-agent-amd64 \ - csf-agent-amd64.sha256 \ - csf-agent-arm64 \ - csf-agent-arm64.sha256 + csfx-updater-amd64 \ + csfx-updater-amd64.sha256 \ + csfx-updater-arm64 \ + csfx-updater-arm64.sha256 \ + csfx-agent-amd64 \ + csfx-agent-amd64.sha256 \ + csfx-agent-arm64 \ + csfx-agent-arm64.sha256 env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + update-infra: + name: Update CSFX-Infra versions.nix + runs-on: ubuntu-latest + needs: [version, manifest, build-binaries, github-release] + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ github.repository_owner }}/CSFX-Infra + token: ${{ secrets.INFRA_REPO_TOKEN }} + ref: develop + path: infra + + - uses: actions/download-artifact@v4 + with: + pattern: csfx-agent-* + path: /tmp/binaries + merge-multiple: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: 
${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Write versions.nix + run: | + VERSION="${{ needs.version.outputs.version }}" + ORG=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + REPO="${{ github.repository }}" + RELEASE_BASE="https://github.com/${REPO}/releases/download/v${VERSION}" + + get_manifest_digest() { + local svc=$1 + local image="ghcr.io/${ORG}/csfx-ce-${svc}:${VERSION}" + docker buildx imagetools inspect "${image}" \ + --format '{{json .Manifest}}' | jq -r '.digest' + } + + get_sha256() { + local binary=$1 + local arch=$2 + awk '{print $1}' /tmp/binaries/${binary}-${arch}.sha256 2>/dev/null + } + + cat > infra/versions.nix <>>>>>> origin/main -## [0.2.2](https://github.com/CS-Foundry/CSF-Core/compare/v0.2.1...v0.2.2) (2026-01-05) +## [0.2.2](https://github.com/CS-Foundry/CSFX-Core/compare/v0.2.1...v0.2.2) (2026-01-05) ### Bug Fixes -* frontend build error ([afec643](https://github.com/CS-Foundry/CSF-Core/commit/afec64354d33c9e70cf32cee2483a03250c1b108)) +* frontend build error ([afec643](https://github.com/CS-Foundry/CSFX-Core/commit/afec64354d33c9e70cf32cee2483a03250c1b108)) -## [0.2.1](https://github.com/CS-Foundry/CSF-Core/compare/v0.2.0...v0.2.1) (2026-01-05) +## [0.2.1](https://github.com/CS-Foundry/CSFX-Core/compare/v0.2.0...v0.2.1) (2026-01-05) ### Bug Fixes -* semantic release commit befor build ([9927644](https://github.com/CS-Foundry/CSF-Core/commit/99276446079e169853a7b2b7848a369b45d0f930)) +* semantic release commit befor build ([9927644](https://github.com/CS-Foundry/CSFX-Core/commit/99276446079e169853a7b2b7848a369b45d0f930)) -# [0.2.0](https://github.com/CS-Foundry/CSF-Core/compare/v0.1.3...v0.2.0) (2026-01-05) +# [0.2.0](https://github.com/CS-Foundry/CSFX-Core/compare/v0.1.3...v0.2.0) (2026-01-05) ### Features -* new beta branch features ([b88b509](https://github.com/CS-Foundry/CSF-Core/commit/b88b509342da00aeea618ece55bc6d911ac543e5)) +* new beta branch features 
([b88b509](https://github.com/CS-Foundry/CSFX-Core/commit/b88b509342da00aeea618ece55bc6d911ac543e5)) -## [0.1.3](https://github.com/CS-Foundry/CSF-Core/compare/v0.1.2...v0.1.3) (2026-01-04) +## [0.1.3](https://github.com/CS-Foundry/CSFX-Core/compare/v0.1.2...v0.1.3) (2026-01-04) ### Bug Fixes -* semantiv release versioning ([4b4ce16](https://github.com/CS-Foundry/CSF-Core/commit/4b4ce161a29b96531248f11b228a71d2cce0b950)) +* semantiv release versioning ([4b4ce16](https://github.com/CS-Foundry/CSFX-Core/commit/4b4ce161a29b96531248f11b228a71d2cce0b950)) -## [0.1.2](https://github.com/CS-Foundry/CSF-Core/compare/v0.1.1...v0.1.2) (2026-01-04) +## [0.1.2](https://github.com/CS-Foundry/CSFX-Core/compare/v0.1.1...v0.1.2) (2026-01-04) ### Bug Fixes -* version ([3d63017](https://github.com/CS-Foundry/CSF-Core/commit/3d63017237d93288ba1645d9eb6b6f0f318c2ec3)) -* version ([23573b8](https://github.com/CS-Foundry/CSF-Core/commit/23573b862761811ef1b8234477ccb63307687750)) +* version ([3d63017](https://github.com/CS-Foundry/CSFX-Core/commit/3d63017237d93288ba1645d9eb6b6f0f318c2ec3)) +* version ([23573b8](https://github.com/CS-Foundry/CSFX-Core/commit/23573b862761811ef1b8234477ccb63307687750)) -## [0.1.1](https://github.com/CS-Foundry/CSF-Core/compare/v0.1.0...v0.1.1) (2026-01-04) +## [0.1.1](https://github.com/CS-Foundry/CSFX-Core/compare/v0.1.0...v0.1.1) (2026-01-04) ### Bug Fixes -* updater pull ([3ef7e36](https://github.com/CS-Foundry/CSF-Core/commit/3ef7e36cee7a2aeac7d6b6aa11107ccc712c12b5)) +* updater pull ([3ef7e36](https://github.com/CS-Foundry/CSFX-Core/commit/3ef7e36cee7a2aeac7d6b6aa11107ccc712c12b5)) -# [0.1.0](https://github.com/CS-Foundry/CSF-Core/compare/v0.0.8...v0.1.0) (2026-01-04) +# [0.1.0](https://github.com/CS-Foundry/CSFX-Core/compare/v0.0.8...v0.1.0) (2026-01-04) ### Features -* updater for programm ([7b064b8](https://github.com/CS-Foundry/CSF-Core/commit/7b064b8255b34cde174a591e93c7c67604997f2c)) +* updater for programm 
([7b064b8](https://github.com/CS-Foundry/CSFX-Core/commit/7b064b8255b34cde174a591e93c7c67604997f2c)) -## [0.0.8](https://github.com/CS-Foundry/CSF-Core/compare/v0.0.7...v0.0.8) (2026-01-04) +## [0.0.8](https://github.com/CS-Foundry/CSFX-Core/compare/v0.0.7...v0.0.8) (2026-01-04) ### Bug Fixes -* docker warn on linux kernel ([1de9a08](https://github.com/CS-Foundry/CSF-Core/commit/1de9a084cbbe5cec93fc2205415c3f1f5ab5b597)) +* docker warn on linux kernel ([1de9a08](https://github.com/CS-Foundry/CSFX-Core/commit/1de9a084cbbe5cec93fc2205415c3f1f5ab5b597)) diff --git a/Cargo.lock b/Cargo.lock index 7f3eebb..ed8cee1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1098,7 +1098,7 @@ dependencies = [ ] [[package]] -name = "csf-agent" +name = "csfx-agent" version = "0.2.2" dependencies = [ "anyhow", @@ -1117,7 +1117,7 @@ dependencies = [ ] [[package]] -name = "csf-migrate" +name = "csfx-migrate" version = "0.2.2" dependencies = [ "dotenvy", @@ -1130,18 +1130,15 @@ dependencies = [ ] [[package]] -name = "csf-updater" +name = "csfx-updater" version = "0.2.2" dependencies = [ - "aes-gcm", "anyhow", - "base64 0.22.1", "dotenvy", "etcd-client", "reqwest 0.11.27", "serde", "serde_json", - "tempfile", "tokio", "tracing", "tracing-subscriber", @@ -3854,6 +3851,7 @@ dependencies = [ "chrono", "dotenvy", "entity", + "etcd-client", "migration", "opentelemetry", "opentelemetry-otlp", diff --git a/Cargo.toml b/Cargo.toml index 9bfe508..df041e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,8 +3,8 @@ resolver = "2" members = [ "agent", "control-plane/api-gateway", - "control-plane/csf-migrate", - "control-plane/csf-updater", + "control-plane/csfx-migrate", + "control-plane/csfx-updater", "control-plane/scheduler", "control-plane/failover-controller", "control-plane/sdn-controller", @@ -20,7 +20,7 @@ version = "0.2.2" edition = "2021" authors = ["CS-Foundry"] license = "SEE LICENSE IN LICENSE" -repository = "https://github.com/CSFX-cloud/CSF-Core" +repository = 
"https://github.com/CSFX-cloud/CSFX-Core" [workspace.dependencies] # Async runtime diff --git a/README.md b/README.md index cdee1b9..a2653b4 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@
-# CSF-Core +# CSFX-Core ### Unified Infrastructure Management Platform -[![Pipeline Status](https://img.shields.io/github/actions/workflow/status/CS-Foundry/CSF-Core/main-release.yml?branch=main&label=Release-Pipeline&style=for-the-badge&logo=github)](https://github.com/CS-Foundry/CSF-Core/actions/runs/20679215646) -[![Lint Status](https://img.shields.io/github/actions/workflow/status/CS-Foundry/CSF-Core/lint.yml?branch=main&label=Lint%20%26%20Format&style=for-the-badge&logo=github&color=blueviolet)](https://github.com/CS-Foundry/CSF-Core/actions/runs/20679215645) -[![Version](https://img.shields.io/github/v/release/CS-Foundry/CSF-Core?style=for-the-badge&color=blue)](https://github.com/CS-Foundry/CSF-Core/releases) -[![Downloads](https://img.shields.io/github/downloads/CS-Foundry/CSF-Core/total?style=for-the-badge&color=success)](https://github.com/CS-Foundry/CSF-Core/releases) -[![License](https://img.shields.io/badge/License-CSF--Internal-purple?style=for-the-badge)](LICENSE) +[![Pipeline Status](https://img.shields.io/github/actions/workflow/status/CS-Foundry/CSFX-Core/main-release.yml?branch=main&label=Release-Pipeline&style=for-the-badge&logo=github)](https://github.com/CS-Foundry/CSFX-Core/actions/runs/20679215646) +[![Lint Status](https://img.shields.io/github/actions/workflow/status/CS-Foundry/CSFX-Core/lint.yml?branch=main&label=Lint%20%26%20Format&style=for-the-badge&logo=github&color=blueviolet)](https://github.com/CS-Foundry/CSFX-Core/actions/runs/20679215645) +[![Version](https://img.shields.io/github/v/release/CS-Foundry/CSFX-Core?style=for-the-badge&color=blue)](https://github.com/CS-Foundry/CSFX-Core/releases) +[![Downloads](https://img.shields.io/github/downloads/CS-Foundry/CSFX-Core/total?style=for-the-badge&color=success)](https://github.com/CS-Foundry/CSFX-Core/releases) +[![License](https://img.shields.io/badge/License-CSFX--Internal-purple?style=for-the-badge)](LICENSE)

High-Performance Backend & Frontend in a single systemd service.
@@ -23,7 +23,7 @@ ## ⚡ About the Project -**CSF-Core** revolutionizes infrastructure management through a **Unified Architecture** approach. Instead of manually orchestrating complex microservices, CSF-Core delivers a monolithic yet modular binary that serves both the API backend and the frontend. +**CSFX-Core** revolutionizes infrastructure management through a **Unified Architecture** approach. Instead of manually orchestrating complex microservices, CSFX-Core delivers a monolithic yet modular binary that serves both the API backend and the frontend. ### Key Features @@ -36,12 +36,12 @@ ## 🚀 Quick Start -Install CSF-Core on your Linux system in under 30 seconds using our one-line installer. +Install CSFX-Core on your Linux system in under 30 seconds using our one-line installer. ### Installation ```bash -curl -fsSL [https://raw.githubusercontent.com/CS-Foundry/CSF-Core/main/scripts/install.sh](https://raw.githubusercontent.com/CS-Foundry/CSF-Core/main/scripts/install.sh) | sudo bash +curl -fsSL [https://raw.githubusercontent.com/CS-Foundry/CSFX-Core/main/scripts/install.sh](https://raw.githubusercontent.com/CS-Foundry/CSFX-Core/main/scripts/install.sh) | sudo bash ``` ### Technology Stack @@ -63,7 +63,7 @@ You can find our complete documentation in the [`docs/`](https://www.google.com/ We actively support our users with integration and troubleshooting. -- **🐛 Bug Reports:** Please use [GitHub Issues](https://github.com/CS-Foundry/CSF-Core/issues) to report bugs. +- **🐛 Bug Reports:** Please use [GitHub Issues](https://github.com/CS-Foundry/CSFX-Core/issues) to report bugs. - **📖 Documentation:** Check the [`docs/`](https://www.google.com/search?q=./docs/) folder for detailed instructions. - **🔧 Debugging:** For connectivity issues, refer to the [Connection Debugging Guide](https://www.google.com/search?q=./docs/troubleshooting/DEBUG_CONNECTION.md). 
diff --git a/agent/Cargo.toml b/agent/Cargo.toml index 35f26fe..4deecb6 100644 --- a/agent/Cargo.toml +++ b/agent/Cargo.toml @@ -1,11 +1,11 @@ [package] -name = "csf-agent" +name = "csfx-agent" version.workspace = true edition.workspace = true license.workspace = true [[bin]] -name = "csf-agent" +name = "csfx-agent" path = "src/main.rs" [dependencies] diff --git a/agent/build.rs b/agent/build.rs index 8b7e815..65082fb 100644 --- a/agent/build.rs +++ b/agent/build.rs @@ -1,6 +1,6 @@ fn main() { - if let Ok(v) = std::env::var("CSF_BUILD_VERSION") { + if let Ok(v) = std::env::var("CSFX_BUILD_VERSION") { println!("cargo:rustc-env=CARGO_PKG_VERSION={}", v); } - println!("cargo:rerun-if-env-changed=CSF_BUILD_VERSION"); + println!("cargo:rerun-if-env-changed=CSFX_BUILD_VERSION"); } diff --git a/agent/src/client.rs b/agent/src/client.rs index ab6bcbf..ec33ebc 100644 --- a/agent/src/client.rs +++ b/agent/src/client.rs @@ -56,6 +56,12 @@ pub struct ContainerStatus { pub status: String, } +#[derive(Debug, Deserialize)] +pub struct HeartbeatResponse { + pub desired_flake_rev: Option, + pub post_update_heartbeats: Option, +} + #[derive(Debug, Deserialize)] pub struct AssignedWorkload { pub id: String, @@ -143,7 +149,7 @@ impl ApiClient { api_key: &str, container_statuses: Option>, metrics: Option, - ) -> Result<()> { + ) -> Result { let url = format!( "{}/api/registry/agents/{}/heartbeat", self.gateway_url, agent_id @@ -189,7 +195,9 @@ impl ApiClient { anyhow::bail!("Heartbeat failed status={}", status); } - Ok(()) + resp.json::() + .await + .context("Failed to parse heartbeat response") } pub async fn fetch_assigned_workloads( diff --git a/agent/src/config.rs b/agent/src/config.rs index a1e7e86..3a99fd8 100644 --- a/agent/src/config.rs +++ b/agent/src/config.rs @@ -3,9 +3,9 @@ use serde::{Deserialize, Serialize}; use std::path::Path; use uuid::Uuid; -const STATE_DIR: &str = "/var/lib/csf-daemon"; -const CREDENTIALS_FILE: &str = "/var/lib/csf-daemon/credentials"; -const 
CONFIG_FILE: &str = "/var/lib/csf-daemon/config.json"; +const STATE_DIR: &str = "/var/lib/csfx-daemon"; +const CREDENTIALS_FILE: &str = "/var/lib/csfx-daemon/credentials"; +const CONFIG_FILE: &str = "/var/lib/csfx-daemon/config.json"; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct DaemonConfig { diff --git a/agent/src/docker.rs b/agent/src/docker.rs index d8af7da..97e9977 100644 --- a/agent/src/docker.rs +++ b/agent/src/docker.rs @@ -70,7 +70,7 @@ impl DockerManager { } pub async fn start_container(&self, spec: &WorkloadSpec) -> Result { - let container_name = format!("csf-{}", spec.workload_id); + let container_name = format!("csfx-{}", spec.workload_id); let env: Option> = spec.env_vars.as_ref().map(|vars| { vars.iter() @@ -99,8 +99,8 @@ impl DockerManager { }, host_config: Some(host_config), labels: Some(HashMap::from([ - ("csf.workload_id".to_string(), spec.workload_id.clone()), - ("csf.managed".to_string(), "true".to_string()), + ("csfx.workload_id".to_string(), spec.workload_id.clone()), + ("csfx.managed".to_string(), "true".to_string()), ])), ..Default::default() }; diff --git a/agent/src/main.rs b/agent/src/main.rs index 320afa6..66ecc86 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -4,6 +4,7 @@ mod docker; mod pki; mod rbd; mod system; +mod update_watch; use anyhow::{Context, Result}; use std::collections::HashMap; @@ -22,12 +23,12 @@ async fn main() -> Result<()> { .with_target(false) .init(); - info!(version = env!("CARGO_PKG_VERSION"), "csf-agent starting"); + info!(version = env!("CARGO_PKG_VERSION"), "csfx-agent starting"); - let gateway_url = std::env::var("CSF_GATEWAY_URL") - .context("CSF_GATEWAY_URL environment variable is required")?; + let gateway_url = std::env::var("CSFX_GATEWAY_URL") + .context("CSFX_GATEWAY_URL environment variable is required")?; - let heartbeat_interval_secs: u64 = std::env::var("CSF_HEARTBEAT_INTERVAL") + let heartbeat_interval_secs: u64 = std::env::var("CSFX_HEARTBEAT_INTERVAL") .ok() 
.and_then(|v| v.parse().ok()) .unwrap_or(60); @@ -103,10 +104,10 @@ async fn perform_registration( heartbeat_interval_secs: u64, agent_pki: &pki::AgentPki, ) -> Result<(uuid::Uuid, String)> { - let token = match std::env::var("CSF_REGISTRATION_TOKEN") { + let token = match std::env::var("CSFX_REGISTRATION_TOKEN") { Ok(t) => t, Err(_) => { - info!("CSF_REGISTRATION_TOKEN not set, fetching bootstrap token from gateway"); + info!("CSFX_REGISTRATION_TOKEN not set, fetching bootstrap token from gateway"); client .fetch_bootstrap_token() .await @@ -169,6 +170,7 @@ async fn run_heartbeat_loop( ) { let mut interval = tokio::time::interval(Duration::from_secs(interval_secs)); let mut failure_count: u32 = 0; + let mut current_flake_rev = String::new(); loop { tokio::select! { @@ -183,11 +185,24 @@ async fn run_heartbeat_loop( let metrics = system::collect_metrics(); match client.heartbeat(agent_id, api_key, Some(statuses), Some(metrics)).await { - Ok(_) => { + Ok(resp) => { if failure_count > 0 { info!(agent_id = %agent_id, "Heartbeat recovered after {} failures", failure_count); failure_count = 0; } + + if let Some(count) = resp.post_update_heartbeats { + update_watch::write_heartbeat_counter(count).await; + } + + if let Some(rev) = resp.desired_flake_rev { + let rev_clone = rev.clone(); + let current = current_flake_rev.clone(); + tokio::spawn(async move { + update_watch::handle(agent_id, &rev_clone, ¤t).await; + }); + current_flake_rev = rev; + } } Err(e) => { failure_count += 1; diff --git a/agent/src/pki.rs b/agent/src/pki.rs index 227246e..4b103dc 100644 --- a/agent/src/pki.rs +++ b/agent/src/pki.rs @@ -2,10 +2,10 @@ use anyhow::{Context, Result}; use rcgen::{CertificateParams, DnType, KeyPair, PKCS_ECDSA_P256_SHA256}; use std::path::Path; -const KEY_FILE: &str = "/var/lib/csf-daemon/agent.key"; -const CSR_FILE: &str = "/var/lib/csf-daemon/agent.csr"; -const CERT_FILE: &str = "/var/lib/csf-daemon/agent.crt"; -const CA_FILE: &str = "/var/lib/csf-daemon/ca.crt"; +const 
KEY_FILE: &str = "/var/lib/csfx-daemon/agent.key"; +const CSR_FILE: &str = "/var/lib/csfx-daemon/agent.csr"; +const CERT_FILE: &str = "/var/lib/csfx-daemon/agent.crt"; +const CA_FILE: &str = "/var/lib/csfx-daemon/ca.crt"; pub struct AgentPki { key_pem: String, diff --git a/agent/src/rbd.rs b/agent/src/rbd.rs index 70fda46..7b3aad8 100644 --- a/agent/src/rbd.rs +++ b/agent/src/rbd.rs @@ -87,5 +87,5 @@ pub async fn umount(mount_point: &str) -> Result<()> { } pub fn mount_point_for(volume_id: &str) -> String { - format!("/mnt/csf-volumes/{}", volume_id) + format!("/mnt/csfx-volumes/{}", volume_id) } diff --git a/agent/src/update_watch.rs b/agent/src/update_watch.rs new file mode 100644 index 0000000..cd7f9c9 --- /dev/null +++ b/agent/src/update_watch.rs @@ -0,0 +1,59 @@ +use std::time::Duration; +use tokio::fs; +use tracing::{info, warn}; +use uuid::Uuid; + +const TRIGGER_FILE: &str = "/var/lib/csfx/update_trigger"; +const HEARTBEAT_COUNTER_FILE: &str = "/var/lib/csfx/post_update_heartbeats"; +const MAX_JITTER_SECS: u64 = 300; + +pub async fn handle(agent_id: Uuid, desired_flake_rev: &str, current_flake_rev: &str) { + if desired_flake_rev == current_flake_rev { + return; + } + + if !is_valid_sha(desired_flake_rev) { + warn!(flake_rev = %desired_flake_rev, "received invalid flake rev in heartbeat response"); + return; + } + + let jitter = jitter_delay(agent_id); + info!( + flake_rev = %desired_flake_rev, + jitter_secs = jitter, + "update signal received, waiting before writing trigger" + ); + + tokio::time::sleep(Duration::from_secs(jitter)).await; + + if let Err(e) = write_trigger(desired_flake_rev).await { + warn!(error = %e, flake_rev = %desired_flake_rev, "failed to write update trigger file"); + } else { + info!(flake_rev = %desired_flake_rev, "update trigger written"); + } +} + +pub async fn write_heartbeat_counter(count: u32) { + if let Some(parent) = std::path::Path::new(HEARTBEAT_COUNTER_FILE).parent() { + let _ = fs::create_dir_all(parent).await; + } + let _ 
= fs::write(HEARTBEAT_COUNTER_FILE, count.to_string()).await; +} + +async fn write_trigger(flake_rev: &str) -> anyhow::Result<()> { + if let Some(parent) = std::path::Path::new(TRIGGER_FILE).parent() { + fs::create_dir_all(parent).await?; + } + fs::write(TRIGGER_FILE, flake_rev).await?; + Ok(()) +} + +fn jitter_delay(agent_id: Uuid) -> u64 { + let bytes = agent_id.as_bytes(); + let val = u64::from_le_bytes(bytes[..8].try_into().unwrap_or([0u8; 8])); + val % MAX_JITTER_SECS +} + +fn is_valid_sha(rev: &str) -> bool { + rev.len() == 40 && rev.chars().all(|c| c.is_ascii_hexdigit()) +} diff --git a/control-plane/Dockerfile.csf-updater b/control-plane/Dockerfile.csfx-updater similarity index 80% rename from control-plane/Dockerfile.csf-updater rename to control-plane/Dockerfile.csfx-updater index a6f2c2f..998a976 100644 --- a/control-plane/Dockerfile.csf-updater +++ b/control-plane/Dockerfile.csfx-updater @@ -12,8 +12,8 @@ RUN apt-get update && apt-get install -y \ COPY Cargo.toml Cargo.lock ./ COPY agent/Cargo.toml ./agent/ COPY control-plane/api-gateway/Cargo.toml ./control-plane/api-gateway/ -COPY control-plane/csf-migrate/Cargo.toml ./control-plane/csf-migrate/ -COPY control-plane/csf-updater/Cargo.toml ./control-plane/csf-updater/ +COPY control-plane/csfx-migrate/Cargo.toml ./control-plane/csfx-migrate/ +COPY control-plane/csfx-updater/Cargo.toml ./control-plane/csfx-updater/ COPY control-plane/scheduler/Cargo.toml ./control-plane/scheduler/ COPY control-plane/failover-controller/Cargo.toml ./control-plane/failover-controller/ COPY control-plane/sdn-controller/Cargo.toml ./control-plane/sdn-controller/ @@ -27,14 +27,14 @@ COPY control-plane/shared/ ./control-plane/shared/ RUN mkdir -p agent/src \ control-plane/api-gateway/src \ - control-plane/csf-migrate/src \ - control-plane/csf-updater/src \ + control-plane/csfx-migrate/src \ + control-plane/csfx-updater/src \ control-plane/scheduler/src \ control-plane/failover-controller/src \ control-plane/sdn-controller/src \ 
control-plane/volume-manager/src \ control-plane/registry/src \ - && for d in agent control-plane/api-gateway control-plane/csf-migrate control-plane/csf-updater \ + && for d in agent control-plane/api-gateway control-plane/csfx-migrate control-plane/csfx-updater \ control-plane/scheduler control-plane/failover-controller control-plane/sdn-controller \ control-plane/volume-manager control-plane/registry; do \ echo "fn main() {}" > $d/src/main.rs; \ @@ -42,7 +42,7 @@ RUN mkdir -p agent/src \ COPY . . -RUN cargo build --release --bin csf-updater +RUN cargo build --release --bin csfx-updater FROM scratch AS export -COPY --from=builder /app/target/release/csf-updater /csf-updater +COPY --from=builder /app/target/release/csfx-updater /csfx-updater diff --git a/control-plane/Dockerfile.prod.shared b/control-plane/Dockerfile.prod.shared index bf95142..e1a5722 100644 --- a/control-plane/Dockerfile.prod.shared +++ b/control-plane/Dockerfile.prod.shared @@ -24,18 +24,18 @@ COPY control-plane/registry/Cargo.toml ./control-plane/registry/ COPY control-plane/shared/entity/Cargo.toml ./control-plane/shared/entity/ COPY control-plane/shared/migration/Cargo.toml ./control-plane/shared/migration/ COPY control-plane/shared/shared/Cargo.toml ./control-plane/shared/shared/ -COPY control-plane/csf-migrate/Cargo.toml ./control-plane/csf-migrate/ -COPY control-plane/csf-updater/Cargo.toml ./control-plane/csf-updater/ +COPY control-plane/csfx-migrate/Cargo.toml ./control-plane/csfx-migrate/ +COPY control-plane/csfx-updater/Cargo.toml ./control-plane/csfx-updater/ COPY agent/build.rs ./agent/ COPY control-plane/api-gateway/build.rs ./control-plane/api-gateway/ -COPY control-plane/csf-updater/build.rs ./control-plane/csf-updater/ +COPY control-plane/csfx-updater/build.rs ./control-plane/csfx-updater/ COPY control-plane/shared/ ./control-plane/shared/ RUN mkdir -p agent/src \ control-plane/api-gateway/src \ - control-plane/csf-migrate/src \ - control-plane/csf-updater/src \ + 
control-plane/csfx-migrate/src \ + control-plane/csfx-updater/src \ control-plane/scheduler/src \ control-plane/failover-controller/src \ control-plane/sdn-controller/src \ @@ -43,8 +43,8 @@ RUN mkdir -p agent/src \ control-plane/registry/src \ && echo "fn main() {}" > agent/src/main.rs \ && echo "fn main() {}" > control-plane/api-gateway/src/main.rs \ - && echo "fn main() {}" > control-plane/csf-migrate/src/main.rs \ - && echo "fn main() {}" > control-plane/csf-updater/src/main.rs \ + && echo "fn main() {}" > control-plane/csfx-migrate/src/main.rs \ + && echo "fn main() {}" > control-plane/csfx-updater/src/main.rs \ && echo "fn main() {}" > control-plane/scheduler/src/main.rs \ && echo "fn main() {}" > control-plane/failover-controller/src/main.rs \ && echo "fn main() {}" > control-plane/sdn-controller/src/main.rs \ @@ -57,10 +57,10 @@ FROM base AS builder ARG SERVICE_BIN ARG BUILD_JOBS=2 -ARG CSF_BUILD_VERSION +ARG CSFX_BUILD_VERSION ENV CARGO_BUILD_JOBS=${BUILD_JOBS} -ENV CSF_BUILD_VERSION=${CSF_BUILD_VERSION} +ENV CSFX_BUILD_VERSION=${CSFX_BUILD_VERSION} COPY --from=planner /app/recipe.json recipe.json @@ -68,7 +68,7 @@ RUN cargo chef cook --profile docker-release --recipe-path recipe.json COPY . . 
-RUN cargo build --profile docker-release --bin ${SERVICE_BIN} --bin csf-migrate +RUN cargo build --profile docker-release --bin ${SERVICE_BIN} --bin csfx-migrate FROM debian:bookworm-slim AS runtime @@ -83,9 +83,9 @@ WORKDIR /app ARG SERVICE_BIN COPY --from=builder /app/target/docker-release/${SERVICE_BIN} /app/service -COPY --from=builder /app/target/docker-release/csf-migrate /csf-migrate +COPY --from=builder /app/target/docker-release/csfx-migrate /csfx-migrate -RUN useradd -r -s /bin/false csf -USER csf +RUN useradd -r -s /bin/false csfx +USER csfx CMD ["/app/service"] diff --git a/control-plane/api-gateway/build.rs b/control-plane/api-gateway/build.rs index 8b7e815..65082fb 100644 --- a/control-plane/api-gateway/build.rs +++ b/control-plane/api-gateway/build.rs @@ -1,6 +1,6 @@ fn main() { - if let Ok(v) = std::env::var("CSF_BUILD_VERSION") { + if let Ok(v) = std::env::var("CSFX_BUILD_VERSION") { println!("cargo:rustc-env=CARGO_PKG_VERSION={}", v); } - println!("cargo:rerun-if-env-changed=CSF_BUILD_VERSION"); + println!("cargo:rerun-if-env-changed=CSFX_BUILD_VERSION"); } diff --git a/control-plane/api-gateway/src/auth_service.rs b/control-plane/api-gateway/src/auth_service.rs index 3620eca..036e74e 100644 --- a/control-plane/api-gateway/src/auth_service.rs +++ b/control-plane/api-gateway/src/auth_service.rs @@ -219,7 +219,7 @@ impl AuthService { 1, 30, secret.to_bytes().unwrap(), - Some("CSF-Core".to_string()), + Some("CSFX-Core".to_string()), user.name.clone(), ) .unwrap(); @@ -284,7 +284,7 @@ impl AuthService { 1, 30, Secret::Encoded(secret.clone()).to_bytes().unwrap(), - Some("CSF-Core".to_string()), + Some("CSFX-Core".to_string()), user.name.clone(), ) .unwrap(); diff --git a/control-plane/api-gateway/src/main.rs b/control-plane/api-gateway/src/main.rs index c94bced..113fdb9 100644 --- a/control-plane/api-gateway/src/main.rs +++ b/control-plane/api-gateway/src/main.rs @@ -81,9 +81,9 @@ use routes::users::{ ), modifiers(&SecurityAddon), info( - title = "CSF 
Control Plane API", + title = "CSFX Control Plane API", version = "0.2.0", - description = "CS-Foundry Control Plane — agent registry, workload scheduling, volume management, SDN, failover, RBAC", + description = "CSFX Control Plane — agent registry, workload scheduling, volume management, SDN, failover, RBAC", contact( name = "CS-Foundry Team", email = "support@cs-foundry.com" diff --git a/control-plane/api-gateway/src/metrics.rs b/control-plane/api-gateway/src/metrics.rs index f20e2dc..f69af41 100644 --- a/control-plane/api-gateway/src/metrics.rs +++ b/control-plane/api-gateway/src/metrics.rs @@ -8,20 +8,20 @@ static HTTP_REQUEST_DURATION_SECONDS: OnceLock = OnceLock::new(); pub fn init() { HTTP_REQUESTS_TOTAL.get_or_init(|| { register_counter_vec!( - "csf_gateway_http_requests_total", + "csfx_gateway_http_requests_total", "Total HTTP requests on API gateway", &["method", "path", "status"] ) - .expect("failed to register csf_gateway_http_requests_total") + .expect("failed to register csfx_gateway_http_requests_total") }); HTTP_REQUEST_DURATION_SECONDS.get_or_init(|| { register_histogram_vec!( - "csf_gateway_http_request_duration_seconds", + "csfx_gateway_http_request_duration_seconds", "HTTP request duration on API gateway in seconds", &["method", "path"] ) - .expect("failed to register csf_gateway_http_request_duration_seconds") + .expect("failed to register csfx_gateway_http_request_duration_seconds") }); } diff --git a/control-plane/api-gateway/src/routes/system.rs b/control-plane/api-gateway/src/routes/system.rs index 760876c..8e39fd3 100644 --- a/control-plane/api-gateway/src/routes/system.rs +++ b/control-plane/api-gateway/src/routes/system.rs @@ -68,7 +68,7 @@ pub fn routes() -> Router { async fn health_check() -> Json { Json(serde_json::json!({ "status": "healthy", - "service": "csf-core-backend" + "service": "csfx-core-backend" })) } diff --git a/control-plane/api-gateway/src/routes/update.rs b/control-plane/api-gateway/src/routes/update.rs index 
44db094..d054c53 100644 --- a/control-plane/api-gateway/src/routes/update.rs +++ b/control-plane/api-gateway/src/routes/update.rs @@ -3,14 +3,15 @@ use etcd_client::Client; use serde::{Deserialize, Serialize}; use std::env; -use crate::auth::crypto::{decrypt_secret, encrypt_secret}; use crate::auth::rbac::CanManageSystem; use crate::AppState; -const ETCD_DESIRED_VERSION_KEY: &str = "/csf/config/desired_cp_version"; -const ETCD_UPDATE_RESULT_KEY: &str = "/csf/config/last_update_result"; -const ETCD_GHCR_TOKEN_KEY: &str = "/csf/config/ghcr_token"; -const ETCD_PAUSED_KEY: &str = "/csf/config/update_paused"; +const ETCD_DESIRED_VERSION_KEY: &str = "/csfx/config/desired_version"; +const ETCD_AVAILABLE_FLAKE_REV_KEY: &str = "/csfx/config/available_flake_rev"; +const ETCD_DESIRED_FLAKE_REV_KEY: &str = "/csfx/config/desired_flake_rev"; +const ETCD_BUILD_STATUS_KEY: &str = "/csfx/config/cp_build_status"; +const ETCD_RESULT_KEY: &str = "/csfx/config/last_build_result"; +const ETCD_PAUSED_KEY: &str = "/csfx/config/update_paused"; #[derive(Debug, Deserialize)] pub struct UpdateRequest { @@ -27,16 +28,11 @@ pub struct UpdateResponse { pub struct UpdateStatusResponse { pub current_version: String, pub desired_version: Option, + pub available_flake_rev: Option, + pub desired_flake_rev: Option, + pub build_status: Option, pub last_result: Option, pub paused: bool, - pub agent_version: Option, - pub updater_version: Option, -} - -#[derive(Debug, Deserialize)] -pub struct GhcrTokenRequest { - pub token: String, - pub username: String, } pub fn routes() -> Router { @@ -45,7 +41,6 @@ pub fn routes() -> Router { .route("/system/update/status", get(update_status)) .route("/system/update/pause", post(pause_updates)) .route("/system/update/resume", post(resume_updates)) - .route("/system/ghcr-token", post(set_ghcr_token)) } async fn etcd_client() -> Result { @@ -64,7 +59,7 @@ async fn trigger_update( State(_state): State, Json(req): Json, ) -> Result, StatusCode> { - if 
!is_valid_semver(&req.version) { + if !is_valid_version(&req.version) { return Err(StatusCode::UNPROCESSABLE_ENTITY); } @@ -78,18 +73,8 @@ async fn trigger_update( StatusCode::INTERNAL_SERVER_ERROR })?; - client - .put(ETCD_UPDATE_RESULT_KEY, b"in_progress", None) - .await - .map_err(|e| { - tracing::error!(error = %e, "failed to write update result to etcd"); - StatusCode::INTERNAL_SERVER_ERROR - })?; - tracing::info!(version = %req.version, "update requested"); - spawn_update(req.version.clone()); - Ok(Json(UpdateResponse { status: "update_scheduled".to_string(), version: req.version, @@ -102,34 +87,24 @@ async fn update_status( ) -> Result, StatusCode> { let mut client = etcd_client().await?; - let desired = etcd_get(&mut client, ETCD_DESIRED_VERSION_KEY).await?; - let last_result = etcd_get(&mut client, ETCD_UPDATE_RESULT_KEY).await?; + let desired_version = etcd_get(&mut client, ETCD_DESIRED_VERSION_KEY).await?; + let available_flake_rev = etcd_get(&mut client, ETCD_AVAILABLE_FLAKE_REV_KEY).await?; + let desired_flake_rev = etcd_get(&mut client, ETCD_DESIRED_FLAKE_REV_KEY).await?; + let build_status = etcd_get(&mut client, ETCD_BUILD_STATUS_KEY).await?; + let last_result = etcd_get(&mut client, ETCD_RESULT_KEY).await?; let paused = etcd_get(&mut client, ETCD_PAUSED_KEY).await?.as_deref() == Some("true"); - let binary_dir = env::var("BINARY_DIR").unwrap_or_else(|_| "/usr/local/bin".to_string()); - let agent_version = binary_version(&format!("{}/csf-agent", binary_dir)).await; - let updater_version = binary_version(&format!("{}/csf-updater", binary_dir)).await; - Ok(Json(UpdateStatusResponse { current_version: env!("CARGO_PKG_VERSION").to_string(), - desired_version: desired, + desired_version, + available_flake_rev, + desired_flake_rev, + build_status, last_result, paused, - agent_version, - updater_version, })) } -async fn binary_version(path: &str) -> Option { - let output = tokio::process::Command::new(path) - .arg("--version") - .output() - .await - .ok()?; 
- let raw = String::from_utf8(output.stdout).ok()?; - raw.split_whitespace().last().map(|s| s.trim().to_string()) -} - async fn etcd_get(client: &mut Client, key: &str) -> Result, StatusCode> { let resp = client.get(key, None).await.map_err(|e| { tracing::error!(error = %e, key = key, "failed to read from etcd"); @@ -143,71 +118,9 @@ async fn etcd_get(client: &mut Client, key: &str) -> Result, Stat .map(|s| s.to_string())) } -fn is_valid_semver(version: &str) -> bool { - let v = version.strip_prefix('v').unwrap_or(version); - let (base, _pre) = match v.split_once('-') { - Some((b, p)) => (b, Some(p)), - None => (v, None), - }; - let parts: Vec<&str> = base.split('.').collect(); - parts.len() == 3 && parts.iter().all(|p| p.parse::().is_ok()) -} - -fn spawn_update(version: String) { - tokio::spawn(async move { - if let Err(e) = run_update(&version).await { - tracing::error!(error = %e, version = %version, "update failed"); - write_result("failed").await; - } else { - tracing::info!(version = %version, "update completed"); - write_result("success").await; - } - }); -} - -async fn run_update(version: &str) -> Result<(), String> { - let compose_file = env::var("COMPOSE_FILE") - .unwrap_or_else(|_| "docker-compose.prod.yml".to_string()); - let ghcr_org = env::var("GHCR_ORG").map_err(|_| "GHCR_ORG not set".to_string())?; - - pull_images(&compose_file, &ghcr_org, version).await?; - restart_services(&compose_file, &ghcr_org, version).await -} - -async fn pull_images(compose_file: &str, ghcr_org: &str, version: &str) -> Result<(), String> { - let status = tokio::process::Command::new("docker") - .args(["compose", "-f", compose_file, "pull"]) - .env("GHCR_ORG", ghcr_org) - .env("CSF_VERSION", version) - .status() - .await - .map_err(|e| format!("docker compose pull failed: {}", e))?; - - if !status.success() { - return Err(format!("docker compose pull exited with {}", status)); - } - Ok(()) -} - -async fn restart_services(compose_file: &str, ghcr_org: &str, version: &str) -> 
Result<(), String> { - let status = tokio::process::Command::new("docker") - .args(["compose", "-f", compose_file, "up", "-d"]) - .env("GHCR_ORG", ghcr_org) - .env("CSF_VERSION", version) - .status() - .await - .map_err(|e| format!("docker compose up failed: {}", e))?; - - if !status.success() { - return Err(format!("docker compose up exited with {}", status)); - } - Ok(()) -} - -async fn write_result(result: &str) { - if let Ok(mut client) = etcd_client().await { - let _ = client.put(ETCD_UPDATE_RESULT_KEY, result.as_bytes(), None).await; - } +fn is_valid_version(version: &str) -> bool { + let v = version.trim_start_matches('v'); + !v.is_empty() && v.chars().all(|c| c.is_ascii_alphanumeric() || c == '.' || c == '-') } async fn pause_updates( @@ -241,36 +154,3 @@ async fn resume_updates( tracing::info!("updates resumed"); Ok(StatusCode::NO_CONTENT) } - -async fn set_ghcr_token( - _auth: CanManageSystem, - State(_state): State, - Json(req): Json, -) -> Result { - if req.token.is_empty() || req.username.is_empty() { - return Err(StatusCode::UNPROCESSABLE_ENTITY); - } - - let encryption_key = env::var("SECRET_ENCRYPTION_KEY").map_err(|_| { - tracing::error!("SECRET_ENCRYPTION_KEY not set"); - StatusCode::INTERNAL_SERVER_ERROR - })?; - - let payload = format!("{}:{}", req.username, req.token); - let encrypted = encrypt_secret(&payload, &encryption_key).map_err(|e| { - tracing::error!(error = %e, "failed to encrypt ghcr token"); - StatusCode::INTERNAL_SERVER_ERROR - })?; - - let mut client = etcd_client().await?; - client - .put(ETCD_GHCR_TOKEN_KEY, encrypted.as_bytes(), None) - .await - .map_err(|e| { - tracing::error!(error = %e, "failed to write ghcr token to etcd"); - StatusCode::INTERNAL_SERVER_ERROR - })?; - - tracing::info!(username = %req.username, "ghcr token updated"); - Ok(StatusCode::NO_CONTENT) -} diff --git a/control-plane/api-gateway/src/self_monitor.rs b/control-plane/api-gateway/src/self_monitor.rs index 0aa492e..c457225 100644 --- 
a/control-plane/api-gateway/src/self_monitor.rs +++ b/control-plane/api-gateway/src/self_monitor.rs @@ -55,7 +55,7 @@ impl SelfMonitor { pub async fn new(db_conn: Arc) -> Result { // Get or create local agent let hostname = System::host_name().unwrap_or_else(|| "localhost".to_string()); - let agent_name = format!("CSF-Core-{}", hostname); + let agent_name = format!("CSFX-Core-{}", hostname); // Check if agent already exists let existing_agent = agents::Entity::find() diff --git a/control-plane/csf-updater/build.rs b/control-plane/csf-updater/build.rs deleted file mode 100644 index 8b7e815..0000000 --- a/control-plane/csf-updater/build.rs +++ /dev/null @@ -1,6 +0,0 @@ -fn main() { - if let Ok(v) = std::env::var("CSF_BUILD_VERSION") { - println!("cargo:rustc-env=CARGO_PKG_VERSION={}", v); - } - println!("cargo:rerun-if-env-changed=CSF_BUILD_VERSION"); -} diff --git a/control-plane/csf-updater/src/config.rs b/control-plane/csf-updater/src/config.rs deleted file mode 100644 index 2b18015..0000000 --- a/control-plane/csf-updater/src/config.rs +++ /dev/null @@ -1,37 +0,0 @@ -use anyhow::{Context, Result}; -use std::env; - -pub struct Config { - pub etcd_endpoints: Vec, - pub ghcr_org: String, - pub compose_file: String, - pub poll_interval_secs: u64, - pub secret_encryption_key: String, - pub binary_dir: String, - pub github_release_base_url: String, -} - -impl Config { - pub fn from_env() -> Result { - Ok(Self { - etcd_endpoints: env::var("ETCD_ENDPOINTS") - .unwrap_or_else(|_| "http://localhost:2379".to_string()) - .split(',') - .map(|s| s.trim().to_string()) - .collect(), - ghcr_org: env::var("GHCR_ORG").context("GHCR_ORG must be set")?, - compose_file: env::var("COMPOSE_FILE") - .unwrap_or_else(|_| "/etc/csf-core/docker-compose.yml".to_string()), - poll_interval_secs: env::var("POLL_INTERVAL_SECS") - .ok() - .and_then(|v| v.parse().ok()) - .unwrap_or(30), - secret_encryption_key: env::var("SECRET_ENCRYPTION_KEY") - .context("SECRET_ENCRYPTION_KEY must be set")?, - 
binary_dir: env::var("BINARY_DIR") - .unwrap_or_else(|_| "/usr/local/bin".to_string()), - github_release_base_url: env::var("GITHUB_RELEASE_BASE_URL") - .unwrap_or_else(|_| "https://github.com/csfx-cloud/CSF-Core/releases/download".to_string()), - }) - } -} diff --git a/control-plane/csf-updater/src/main.rs b/control-plane/csf-updater/src/main.rs deleted file mode 100644 index d474148..0000000 --- a/control-plane/csf-updater/src/main.rs +++ /dev/null @@ -1,87 +0,0 @@ -mod config; -mod etcd; -mod secret; -mod updater; -mod verify; - -use std::time::Duration; -use tracing::info; - -#[tokio::main] -async fn main() -> anyhow::Result<()> { - dotenvy::dotenv().ok(); - - tracing_subscriber::fmt() - .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) - .init(); - - let cfg = config::Config::from_env()?; - let poll_interval = Duration::from_secs(cfg.poll_interval_secs); - - info!(poll_interval_secs = cfg.poll_interval_secs, "csf-updater started"); - - let mut last_applied = String::new(); - - loop { - match run_once(&cfg, &last_applied).await { - Ok(Some(version)) => { - last_applied = version; - } - Ok(None) => {} - Err(e) => { - tracing::error!(error = %e, "update cycle error"); - } - } - tokio::time::sleep(poll_interval).await; - } -} - -async fn run_once(cfg: &config::Config, last_applied: &str) -> anyhow::Result> { - let mut etcd = etcd::Client::connect(cfg).await?; - - if etcd.get(etcd::PAUSED_KEY).await?.as_deref() == Some("true") { - tracing::info!("updates paused, skipping"); - return Ok(None); - } - - let desired = match etcd.get(etcd::DESIRED_VERSION_KEY).await? 
{ - Some(v) => v, - None => return Ok(None), - }; - - if desired.is_empty() || desired == last_applied { - return Ok(None); - } - - if !is_valid_version(&desired) { - tracing::warn!(version = %desired, "rejected invalid version string"); - etcd.put(etcd::RESULT_KEY, "failed").await?; - return Ok(Some(desired)); - } - - info!(version = %desired, last_applied = %last_applied, "starting update"); - etcd.put(etcd::RESULT_KEY, "in_progress").await?; - - match updater::run(cfg, &desired, &mut etcd).await { - Ok(()) => { - etcd.put(etcd::RESULT_KEY, "success").await?; - info!(version = %desired, "update complete"); - Ok(Some(desired)) - } - Err(e) => { - tracing::error!(error = %e, version = %desired, "update failed"); - etcd.put(etcd::RESULT_KEY, "failed").await?; - Ok(Some(desired)) - } - } -} - -fn is_valid_version(v: &str) -> bool { - let v = v.trim_start_matches('v'); - let (base, _pre) = match v.split_once('-') { - Some((b, p)) => (b, Some(p)), - None => (v, None), - }; - let parts: Vec<&str> = base.split('.').collect(); - parts.len() == 3 && parts.iter().all(|p| p.parse::().is_ok()) -} diff --git a/control-plane/csf-updater/src/secret.rs b/control-plane/csf-updater/src/secret.rs deleted file mode 100644 index 5d51b60..0000000 --- a/control-plane/csf-updater/src/secret.rs +++ /dev/null @@ -1,26 +0,0 @@ -use aes_gcm::{aead::{Aead, KeyInit}, Aes256Gcm, Nonce}; -use anyhow::{bail, Result}; -use base64::Engine; - -pub fn decrypt_secret(encoded: &str, key_b64: &str) -> Result { - let key_bytes = base64::engine::general_purpose::STANDARD.decode(key_b64)?; - if key_bytes.len() != 32 { - bail!("invalid encryption key length"); - } - - let combined = base64::engine::general_purpose::STANDARD.decode(encoded)?; - if combined.len() < 12 { - bail!("invalid ciphertext"); - } - - let (nonce_bytes, ciphertext) = combined.split_at(12); - let cipher = Aes256Gcm::new_from_slice(&key_bytes) - .map_err(|e| anyhow::anyhow!("cipher init failed: {}", e))?; - let nonce = 
Nonce::from_slice(nonce_bytes); - - let plaintext = cipher - .decrypt(nonce, ciphertext) - .map_err(|e| anyhow::anyhow!("decryption failed: {}", e))?; - - Ok(String::from_utf8(plaintext)?) -} diff --git a/control-plane/csf-updater/src/updater.rs b/control-plane/csf-updater/src/updater.rs deleted file mode 100644 index 71e451a..0000000 --- a/control-plane/csf-updater/src/updater.rs +++ /dev/null @@ -1,194 +0,0 @@ -use anyhow::{bail, Result}; -use sha2::{Digest, Sha256}; -use std::process::Stdio; -use tokio::process::Command; -use tracing::info; - -use crate::config::Config; -use crate::etcd; -use crate::secret::decrypt_secret; -use crate::verify; - -pub async fn run(cfg: &Config, version: &str, etcd: &mut etcd::Client) -> Result<()> { - let (docker_config_dir, ghcr_auth) = setup_docker_auth(cfg, etcd).await?; - pull(cfg, version, docker_config_dir.as_deref()).await?; - verify::verify_images(cfg, version, ghcr_auth.as_deref()).await?; - up(cfg, version, docker_config_dir.as_deref()).await?; - health_check(cfg, version).await?; - update_agent_binary(cfg, version).await?; - update_self_binary(cfg, version).await -} - -async fn setup_docker_auth(cfg: &Config, etcd: &mut etcd::Client) -> Result<(Option, Option)> { - let encrypted = match etcd.get(etcd::GHCR_TOKEN_KEY).await? 
{ - Some(v) => v, - None => return Ok((None, None)), - }; - - let payload = decrypt_secret(&encrypted, &cfg.secret_encryption_key)?; - let (username, token) = payload - .split_once(':') - .ok_or_else(|| anyhow::anyhow!("invalid ghcr token payload"))?; - - let dir = tempfile::tempdir()?; - let config_path = dir.path().join("config.json"); - - let auth_raw = format!("{}:{}", username, token); - let auth_b64 = base64::Engine::encode( - &base64::engine::general_purpose::STANDARD, - auth_raw.as_bytes(), - ); - let config = serde_json::json!({ - "auths": { - "ghcr.io": { - "auth": auth_b64 - } - } - }); - - tokio::fs::write(&config_path, serde_json::to_string(&config)?).await?; - let dir_path = dir.into_path().to_string_lossy().to_string(); - Ok((Some(dir_path), Some(auth_b64))) -} - -async fn pull(cfg: &Config, version: &str, docker_config_dir: Option<&str>) -> Result<()> { - info!(version = %version, "pulling images"); - compose(cfg, version, docker_config_dir, &["pull"]).await -} - -async fn up(cfg: &Config, version: &str, docker_config_dir: Option<&str>) -> Result<()> { - info!(version = %version, "restarting services"); - compose(cfg, version, docker_config_dir, &["up", "-d", "--remove-orphans"]).await -} - -async fn health_check(cfg: &Config, version: &str) -> Result<()> { - info!("waiting for health checks"); - tokio::time::sleep(std::time::Duration::from_secs(15)).await; - - let output = Command::new("docker") - .args(["compose", "-f", &cfg.compose_file, "ps", "--format", "json"]) - .env("GHCR_ORG", &cfg.ghcr_org) - .env("CSF_VERSION", version) - .output() - .await?; - - let stdout = String::from_utf8_lossy(&output.stdout); - for line in stdout.lines() { - if let Ok(svc) = serde_json::from_str::(line) { - if svc["Health"].as_str() == Some("unhealthy") { - bail!("service {} is unhealthy after update", svc["Name"].as_str().unwrap_or("unknown")); - } - } - } - - info!("all services healthy"); - Ok(()) -} - -async fn update_agent_binary(cfg: &Config, version: &str) 
-> Result<()> { - info!(version = %version, "updating csf-agent binary"); - let arch = detect_arch(); - let url = format!( - "{}/v{}/csf-agent-{}", - cfg.github_release_base_url, version, arch - ); - let dest = format!("{}/csf-agent", cfg.binary_dir); - download_and_swap(&url, &dest).await?; - restart_unit("csf-daemon").await -} - -async fn update_self_binary(cfg: &Config, version: &str) -> Result<()> { - info!(version = %version, "updating csf-updater binary"); - let arch = detect_arch(); - let url = format!( - "{}/v{}/csf-updater-{}", - cfg.github_release_base_url, version, arch - ); - let dest = format!("{}/csf-updater", cfg.binary_dir); - download_and_swap(&url, &dest).await?; - restart_unit("csf-updater").await -} - -async fn download_and_swap(url: &str, dest: &str) -> Result<()> { - let tmp = format!("{}.new", dest); - - let bytes = fetch(url).await?; - let expected = fetch_checksum(&format!("{}.sha256", url)).await?; - verify_checksum(&bytes, &expected)?; - - tokio::fs::write(&tmp, &bytes).await?; - - let mut perms = tokio::fs::metadata(&tmp).await?.permissions(); - std::os::unix::fs::PermissionsExt::set_mode(&mut perms, 0o750); - tokio::fs::set_permissions(&tmp, perms).await?; - - tokio::fs::rename(&tmp, dest).await?; - info!(dest = %dest, "binary swapped"); - Ok(()) -} - -async fn fetch(url: &str) -> Result { - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - bail!("failed to download {}: {}", url, resp.status()); - } - Ok(resp.bytes().await?) 
-} - -async fn fetch_checksum(url: &str) -> Result { - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - bail!("failed to download checksum {}: {}", url, resp.status()); - } - let text = resp.text().await?; - text.split_whitespace() - .next() - .map(|s| s.to_string()) - .ok_or_else(|| anyhow::anyhow!("empty checksum file at {}", url)) -} - -fn verify_checksum(data: &[u8], expected: &str) -> Result<()> { - let digest = hex::encode(Sha256::digest(data)); - if digest != expected { - bail!("checksum mismatch: expected={} got={}", expected, digest); - } - info!("checksum verified"); - Ok(()) -} - -async fn restart_unit(unit: &str) -> Result<()> { - let status = Command::new("sudo") - .args(["systemctl", "restart", unit]) - .status() - .await?; - if !status.success() { - bail!("systemctl restart {} failed: {}", unit, status); - } - Ok(()) -} - -fn detect_arch() -> &'static str { - if cfg!(target_arch = "aarch64") { "arm64" } else { "amd64" } -} - -async fn compose(cfg: &Config, version: &str, docker_config_dir: Option<&str>, args: &[&str]) -> Result<()> { - let mut cmd_args = vec!["compose", "-f", cfg.compose_file.as_str()]; - cmd_args.extend_from_slice(args); - - let mut cmd = Command::new("docker"); - cmd.args(&cmd_args) - .env("GHCR_ORG", &cfg.ghcr_org) - .env("CSF_VERSION", version) - .stdout(Stdio::inherit()) - .stderr(Stdio::inherit()); - - if let Some(dir) = docker_config_dir { - cmd.env("DOCKER_CONFIG", dir); - } - - let status = cmd.status().await?; - if !status.success() { - bail!("docker compose {} failed: {}", args.join(" "), status); - } - Ok(()) -} diff --git a/control-plane/csf-updater/src/verify.rs b/control-plane/csf-updater/src/verify.rs deleted file mode 100644 index c074b47..0000000 --- a/control-plane/csf-updater/src/verify.rs +++ /dev/null @@ -1,100 +0,0 @@ -use anyhow::{bail, Result}; -use tracing::info; - -use crate::config::Config; - -const SERVICES: &[&str] = &[ - "api-gateway", - "registry", - "scheduler", - 
"volume-manager", - "failover-controller", - "sdn-controller", -]; - -pub async fn verify_images(cfg: &Config, version: &str, ghcr_auth: Option<&str>) -> Result<()> { - let client = reqwest::Client::new(); - - for svc in SERVICES { - let image = format!("{}/csf-ce-{}", cfg.ghcr_org, svc); - let remote = remote_digest(&client, &image, version, ghcr_auth).await?; - let local = local_digest(&format!("ghcr.io/{}/csf-ce-{}:{}", cfg.ghcr_org, svc, version))?; - - if remote != local { - bail!( - "digest mismatch for {}: remote={} local={}", - svc, remote, local - ); - } - - info!(service = svc, digest = %remote, "image verified"); - } - - Ok(()) -} - -async fn exchange_token(client: &reqwest::Client, image: &str, basic_auth: &str) -> Result { - let url = format!( - "https://ghcr.io/token?scope=repository:{}:pull", - image - ); - let resp = client - .get(&url) - .header("Authorization", format!("Basic {}", basic_auth)) - .send() - .await?; - - if !resp.status().is_success() { - bail!("GHCR token exchange failed for {}: {}", image, resp.status()); - } - - let body: serde_json::Value = resp.json().await?; - body["token"] - .as_str() - .map(|s| s.to_string()) - .ok_or_else(|| anyhow::anyhow!("no token in GHCR token response for {}", image)) -} - -async fn remote_digest(client: &reqwest::Client, image: &str, tag: &str, ghcr_auth: Option<&str>) -> Result { - let bearer = match ghcr_auth { - Some(auth) => exchange_token(client, image, auth).await?, - None => bail!("no GHCR auth configured"), - }; - - let url = format!("https://ghcr.io/v2/{}/manifests/{}", image, tag); - let resp = client - .head(&url) - .header("Authorization", format!("Bearer {}", bearer)) - .header("Accept", "application/vnd.docker.distribution.manifest.v2+json") - .send() - .await?; - - if !resp.status().is_success() { - bail!("GHCR manifest request failed for {}: {}", image, resp.status()); - } - - resp.headers() - .get("docker-content-digest") - .and_then(|v| v.to_str().ok()) - .map(|s| s.to_string()) - 
.ok_or_else(|| anyhow::anyhow!("no docker-content-digest header for {}", image)) -} - -fn local_digest(image: &str) -> Result { - let output = std::process::Command::new("docker") - .args(["image", "inspect", "--format", "{{json .RepoDigests}}", image]) - .output()?; - - if !output.status.success() { - bail!("docker inspect failed for {}", image); - } - - let raw = String::from_utf8(output.stdout)?; - let digests: Vec = serde_json::from_str(raw.trim()) - .map_err(|e| anyhow::anyhow!("failed to parse RepoDigests for {}: {}", image, e))?; - - digests - .into_iter() - .find_map(|d| d.split('@').nth(1).map(|s| s.to_string())) - .ok_or_else(|| anyhow::anyhow!("no repo digest found for {}", image)) -} diff --git a/control-plane/csf-migrate/Cargo.toml b/control-plane/csfx-migrate/Cargo.toml similarity index 90% rename from control-plane/csf-migrate/Cargo.toml rename to control-plane/csfx-migrate/Cargo.toml index bf278d4..f3e1573 100644 --- a/control-plane/csf-migrate/Cargo.toml +++ b/control-plane/csfx-migrate/Cargo.toml @@ -1,12 +1,12 @@ [package] -name = "csf-migrate" +name = "csfx-migrate" version.workspace = true edition.workspace = true license.workspace = true publish = false [[bin]] -name = "csf-migrate" +name = "csfx-migrate" path = "src/main.rs" [dependencies] diff --git a/control-plane/csf-migrate/src/main.rs b/control-plane/csfx-migrate/src/main.rs similarity index 92% rename from control-plane/csf-migrate/src/main.rs rename to control-plane/csfx-migrate/src/main.rs index b6eed99..03892a8 100644 --- a/control-plane/csf-migrate/src/main.rs +++ b/control-plane/csfx-migrate/src/main.rs @@ -9,7 +9,7 @@ async fn main() { tracing_subscriber::fmt() .with_env_filter( tracing_subscriber::EnvFilter::from_default_env() - .add_directive("csf_migrate=info".parse().unwrap()), + .add_directive("csfx_migrate=info".parse().unwrap()), ) .init(); diff --git a/control-plane/csf-updater/Cargo.toml b/control-plane/csfx-updater/Cargo.toml similarity index 75% rename from 
control-plane/csf-updater/Cargo.toml rename to control-plane/csfx-updater/Cargo.toml index 2f63777..e8ca0e5 100644 --- a/control-plane/csf-updater/Cargo.toml +++ b/control-plane/csfx-updater/Cargo.toml @@ -1,12 +1,12 @@ [package] -name = "csf-updater" +name = "csfx-updater" version.workspace = true edition.workspace = true license.workspace = true publish = false [[bin]] -name = "csf-updater" +name = "csfx-updater" path = "src/main.rs" [dependencies] @@ -19,9 +19,3 @@ etcd-client = { workspace = true } reqwest = { version = "0.11", features = ["json", "rustls-tls-webpki-roots"], default-features = false } serde = { workspace = true } serde_json = { workspace = true } -aes-gcm = { workspace = true } -base64 = { workspace = true } -sha2 = { workspace = true } -hex = "0.4" -bytes = "1" -tempfile = "3" diff --git a/control-plane/csfx-updater/build.rs b/control-plane/csfx-updater/build.rs new file mode 100644 index 0000000..65082fb --- /dev/null +++ b/control-plane/csfx-updater/build.rs @@ -0,0 +1,6 @@ +fn main() { + if let Ok(v) = std::env::var("CSFX_BUILD_VERSION") { + println!("cargo:rustc-env=CARGO_PKG_VERSION={}", v); + } + println!("cargo:rerun-if-env-changed=CSFX_BUILD_VERSION"); +} diff --git a/control-plane/csfx-updater/src/config.rs b/control-plane/csfx-updater/src/config.rs new file mode 100644 index 0000000..22292ed --- /dev/null +++ b/control-plane/csfx-updater/src/config.rs @@ -0,0 +1,35 @@ +use anyhow::{Context, Result}; +use std::env; + +pub struct Config { + pub etcd_endpoints: Vec, + pub poll_interval_secs: u64, + pub infra_repo_mirror_dir: String, + pub infra_repo_mirror_url: String, + pub infra_repo_github: String, + pub infra_repo_branch: String, +} + +impl Config { + pub fn from_env() -> Result { + Ok(Self { + etcd_endpoints: env::var("ETCD_ENDPOINTS") + .unwrap_or_else(|_| "http://localhost:2379".to_string()) + .split(',') + .map(|s| s.trim().to_string()) + .collect(), + poll_interval_secs: env::var("POLL_INTERVAL_SECS") + .ok() + .and_then(|v| 
v.parse().ok()) + .unwrap_or(120), + infra_repo_mirror_dir: env::var("INFRA_REPO_MIRROR_DIR") + .unwrap_or_else(|_| "/var/lib/csfx-updater/infra.git".to_string()), + infra_repo_mirror_url: env::var("INFRA_REPO_MIRROR_URL") + .context("INFRA_REPO_MIRROR_URL must be set")?, + infra_repo_github: env::var("INFRA_REPO_GITHUB") + .context("INFRA_REPO_GITHUB must be set (e.g. csfx-cloud/CSFX-Infra)")?, + infra_repo_branch: env::var("INFRA_REPO_BRANCH") + .unwrap_or_else(|_| "main".to_string()), + }) + } +} diff --git a/control-plane/csf-updater/src/etcd.rs b/control-plane/csfx-updater/src/etcd.rs similarity index 53% rename from control-plane/csf-updater/src/etcd.rs rename to control-plane/csfx-updater/src/etcd.rs index fff8468..7052752 100644 --- a/control-plane/csf-updater/src/etcd.rs +++ b/control-plane/csfx-updater/src/etcd.rs @@ -2,10 +2,13 @@ use anyhow::Result; use crate::config::Config; -pub const DESIRED_VERSION_KEY: &str = "/csf/config/desired_cp_version"; -pub const RESULT_KEY: &str = "/csf/config/last_update_result"; -pub const GHCR_TOKEN_KEY: &str = "/csf/config/ghcr_token"; -pub const PAUSED_KEY: &str = "/csf/config/update_paused"; +pub const DESIRED_VERSION_KEY: &str = "/csfx/config/desired_version"; +pub const AVAILABLE_FLAKE_REV_KEY: &str = "/csfx/config/available_flake_rev"; +pub const DESIRED_FLAKE_REV_KEY: &str = "/csfx/config/desired_flake_rev"; +pub const BUILD_STATUS_KEY: &str = "/csfx/config/cp_build_status"; +pub const RESULT_KEY: &str = "/csfx/config/last_build_result"; +pub const PAUSED_KEY: &str = "/csfx/config/update_paused"; +pub const NODE_HEARTBEAT_PREFIX: &str = "/csfx/nodes/"; pub struct Client { inner: etcd_client::Client, @@ -31,4 +34,12 @@ impl Client { self.inner.put(key, value.as_bytes(), None).await?; Ok(()) } + + pub async fn delete_prefix(&mut self, prefix: &str) -> Result<()> { + use etcd_client::DeleteOptions; + self.inner + .delete(prefix, Some(DeleteOptions::new().with_prefix())) + .await?; + Ok(()) + } } diff --git 
a/control-plane/csfx-updater/src/git_mirror.rs b/control-plane/csfx-updater/src/git_mirror.rs new file mode 100644 index 0000000..9153205 --- /dev/null +++ b/control-plane/csfx-updater/src/git_mirror.rs @@ -0,0 +1,54 @@ +use anyhow::{bail, Result}; +use std::path::Path; +use tokio::process::Command; +use tracing::info; + +pub async fn sync(mirror_dir: &str, remote_url: &str) -> Result<()> { + if Path::new(mirror_dir).join("HEAD").exists() { + fetch(mirror_dir).await + } else { + clone(mirror_dir, remote_url).await + } +} + +async fn clone(mirror_dir: &str, remote_url: &str) -> Result<()> { + info!(mirror_dir = %mirror_dir, remote_url = %remote_url, "cloning infra repo mirror"); + + let status = Command::new("git") + .args(["clone", "--mirror", remote_url, mirror_dir]) + .status() + .await?; + + if !status.success() { + bail!("git clone --mirror failed for {}", remote_url); + } + + info!(mirror_dir = %mirror_dir, "mirror clone complete"); + Ok(()) +} + +async fn fetch(mirror_dir: &str) -> Result<()> { + info!(mirror_dir = %mirror_dir, "fetching infra repo mirror"); + + let status = Command::new("git") + .args(["--git-dir", mirror_dir, "fetch", "--prune"]) + .status() + .await?; + + if !status.success() { + bail!("git fetch --prune failed in {}", mirror_dir); + } + + info!(mirror_dir = %mirror_dir, "mirror fetch complete"); + Ok(()) +} + +pub async fn rev_exists(mirror_dir: &str, rev: &str) -> Result<bool> { + let output = Command::new("git") + .args(["--git-dir", mirror_dir, "cat-file", "-t", rev]) + .output() + .await?; + + Ok(output.status.success() + && String::from_utf8_lossy(&output.stdout).trim() == "commit") +} diff --git a/control-plane/csfx-updater/src/main.rs b/control-plane/csfx-updater/src/main.rs new file mode 100644 index 0000000..1d80ba1 --- /dev/null +++ b/control-plane/csfx-updater/src/main.rs @@ -0,0 +1,162 @@ +mod config; +mod etcd; +mod git_mirror; +mod nix_build; +mod poller; +mod updater; + +use std::time::Duration; +use tokio::sync::watch; +use 
tracing::info; + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + dotenvy::dotenv().ok(); + + tracing_subscriber::fmt() + .with_env_filter(tracing_subscriber::EnvFilter::from_default_env()) + .init(); + + let cfg = config::Config::from_env()?; + + info!( + poll_interval_secs = cfg.poll_interval_secs, + infra_repo_github = %cfg.infra_repo_github, + "csfx-updater started" + ); + + let cfg = std::sync::Arc::new(cfg); + let cfg_poller = cfg.clone(); + let cfg_executor = cfg.clone(); + + let poller_task = tokio::spawn(async move { + run_poller_loop(&cfg_poller).await; + }); + + let executor_task = tokio::spawn(async move { + run_executor_loop(&cfg_executor).await; + }); + + tokio::select! { + _ = poller_task => tracing::error!("poller task exited unexpectedly"), + _ = executor_task => tracing::error!("executor task exited unexpectedly"), + } + + Ok(()) +} + +async fn run_poller_loop(cfg: &config::Config) { + let mut last_etag: Option<String> = None; + let interval = Duration::from_secs(cfg.poll_interval_secs); + + loop { + match git_mirror::sync(&cfg.infra_repo_mirror_dir, &cfg.infra_repo_mirror_url).await { + Ok(()) => {} + Err(e) => { + tracing::error!(error = %e, "git mirror sync failed"); + tokio::time::sleep(interval).await; + continue; + } + } + + let mut etcd = match etcd::Client::connect(cfg).await { + Ok(c) => c, + Err(e) => { + tracing::error!(error = %e, "etcd connect failed in poller"); + tokio::time::sleep(interval).await; + continue; + } + }; + + match poller::poll_and_update(cfg, &mut etcd, &mut last_etag).await { + Ok(Some(sha)) => info!(sha = %sha, "available_flake_rev updated"), + Ok(None) => {} + Err(e) => tracing::error!(error = %e, "poll failed"), + } + + tokio::time::sleep(interval).await; + } +} + +async fn run_executor_loop(cfg: &config::Config) { + let mut last_applied = String::new(); + let interval = Duration::from_secs(10); + + loop { + tokio::time::sleep(interval).await; + + match execute_once(cfg, &last_applied).await { + Ok(Some(rev)) => 
last_applied = rev, + Ok(None) => {} + Err(e) => tracing::error!(error = %e, "executor cycle failed"), + } + } +} + +async fn execute_once(cfg: &config::Config, last_applied: &str) -> anyhow::Result<Option<String>> { + let mut etcd = etcd::Client::connect(cfg).await?; + + if etcd.get(etcd::PAUSED_KEY).await?.as_deref() == Some("true") { + return Ok(None); + } + + let desired = match etcd.get(etcd::DESIRED_FLAKE_REV_KEY).await? { + Some(v) => v, + None => return Ok(None), + }; + + if desired.is_empty() || desired == last_applied { + return Ok(None); + } + + if !is_valid_sha(&desired) { + tracing::warn!(flake_rev = %desired, "rejected invalid flake rev"); + etcd.put(etcd::RESULT_KEY, "failed").await?; + return Ok(Some(desired)); + } + + if !git_mirror::rev_exists(&cfg.infra_repo_mirror_dir, &desired).await? { + tracing::warn!(flake_rev = %desired, "rev not found in mirror, triggering fetch"); + git_mirror::sync(&cfg.infra_repo_mirror_dir, &cfg.infra_repo_mirror_url).await?; + + if !git_mirror::rev_exists(&cfg.infra_repo_mirror_dir, &desired).await? 
{ + tracing::error!(flake_rev = %desired, "rev still not found after fetch"); + etcd.put(etcd::RESULT_KEY, "failed").await?; + return Ok(Some(desired)); + } + } + + info!(flake_rev = %desired, last_applied = %last_applied, "starting update"); + etcd.put(etcd::BUILD_STATUS_KEY, "building").await?; + + let (_cancel_tx, cancel_rx) = watch::channel(false); + + match nix_build::build(&cfg.infra_repo_mirror_dir, &desired, cancel_rx).await { + Ok(()) => {} + Err(e) => { + tracing::error!(error = %e, flake_rev = %desired, "nix build failed"); + etcd.put(etcd::BUILD_STATUS_KEY, "failed").await?; + etcd.put(etcd::RESULT_KEY, "failed").await?; + return Ok(Some(desired)); + } + } + + match updater::switch(cfg, &desired).await { + Ok(()) => { + etcd.put(etcd::BUILD_STATUS_KEY, "ready").await?; + etcd.put(etcd::RESULT_KEY, "success").await?; + info!(flake_rev = %desired, "update complete"); + Ok(Some(desired)) + } + Err(e) => { + tracing::error!(error = %e, flake_rev = %desired, "nixos-rebuild switch failed"); + etcd.put(etcd::BUILD_STATUS_KEY, "failed").await?; + etcd.put(etcd::RESULT_KEY, "failed").await?; + Ok(Some(desired)) + } + } +} + +fn is_valid_sha(rev: &str) -> bool { + rev.len() == 40 && rev.chars().all(|c| c.is_ascii_hexdigit()) +} diff --git a/control-plane/csfx-updater/src/nix_build.rs b/control-plane/csfx-updater/src/nix_build.rs new file mode 100644 index 0000000..bed7e34 --- /dev/null +++ b/control-plane/csfx-updater/src/nix_build.rs @@ -0,0 +1,36 @@ +use anyhow::{bail, Result}; +use tokio::process::Command; +use tokio::sync::watch; +use tracing::info; + +pub async fn build(mirror_dir: &str, rev: &str, mut cancel: watch::Receiver<bool>) -> Result<()> { + let flake_url = format!("git+file://{}?rev={}", mirror_dir, rev); + + info!(flake_rev = %rev, "starting nix build"); + + let mut child = Command::new("nixos-rebuild") + .args(["build", "--flake", &flake_url]) + .spawn()?; + + tokio::select! 
{ + result = child.wait() => { + let status = result?; + if !status.success() { + bail!("nix build failed for rev {}", rev); + } + info!(flake_rev = %rev, "nix build complete"); + Ok(()) + } + _ = cancel.changed() => { + if *cancel.borrow() { + let _ = child.kill().await; + bail!("nix build cancelled for rev {}", rev); + } + let status = child.wait().await?; + if !status.success() { + bail!("nix build failed for rev {}", rev); + } + Ok(()) + } + } +} diff --git a/control-plane/csfx-updater/src/poller.rs b/control-plane/csfx-updater/src/poller.rs new file mode 100644 index 0000000..7fa7601 --- /dev/null +++ b/control-plane/csfx-updater/src/poller.rs @@ -0,0 +1,119 @@ +use anyhow::{bail, Result}; +use reqwest::header::{ETAG, IF_NONE_MATCH}; +use serde::Deserialize; +use tracing::info; + +use crate::config::Config; +use crate::etcd; + +#[derive(Debug, Deserialize)] +struct GitHubTag { + object: GitHubObject, +} + +#[derive(Debug, Deserialize)] +struct GitHubObject { + sha: String, + #[serde(rename = "type")] + kind: String, +} + +#[derive(Debug, Deserialize)] +struct GitHubCommit { + sha: String, +} + +pub async fn poll_and_update( + cfg: &Config, + etcd: &mut etcd::Client, + last_etag: &mut Option<String>, +) -> Result<Option<String>> { + let desired_version = match etcd.get(etcd::DESIRED_VERSION_KEY).await? 
{ + Some(v) if !v.is_empty() => v, + _ => return Ok(None), + }; + + let sha = resolve_version_to_sha(cfg, &desired_version, last_etag).await?; + + let current = etcd.get(etcd::AVAILABLE_FLAKE_REV_KEY).await?; + if current.as_deref() == Some(sha.as_str()) { + return Ok(None); + } + + etcd.put(etcd::AVAILABLE_FLAKE_REV_KEY, &sha).await?; + etcd.put(etcd::DESIRED_FLAKE_REV_KEY, &sha).await?; + info!(version = %desired_version, sha = %sha, "resolved version to flake rev"); + + Ok(Some(sha)) +} + +async fn resolve_version_to_sha( + cfg: &Config, + version: &str, + last_etag: &mut Option<String>, +) -> Result<String> { + let tag = format!("v{}", version.trim_start_matches('v')); + let url = format!( + "https://api.github.com/repos/{}/git/ref/tags/{}", + cfg.infra_repo_github, tag + ); + + let client = reqwest::Client::new(); + let mut req = client + .get(&url) + .header("User-Agent", "csfx-updater") + .header("Accept", "application/vnd.github.v3+json"); + + if let Some(etag) = last_etag.as_deref() { + req = req.header(IF_NONE_MATCH, etag); + } + + let resp = req.send().await?; + + if resp.status() == reqwest::StatusCode::NOT_MODIFIED { + bail!("tag not modified, no new sha available"); + } + + if !resp.status().is_success() { + bail!( + "GitHub API returned {} for tag {}", + resp.status(), + tag + ); + } + + if let Some(etag) = resp.headers().get(ETAG) { + *last_etag = Some(etag.to_str()?.to_string()); + } + + let tag_ref: GitHubTag = resp.json().await?; + + let sha = if tag_ref.object.kind == "tag" { + dereference_tag(cfg, &tag_ref.object.sha).await? 
+ } else { + tag_ref.object.sha + }; + + Ok(sha) +} + +async fn dereference_tag(cfg: &Config, tag_sha: &str) -> Result<String> { + let url = format!( + "https://api.github.com/repos/{}/git/tags/{}", + cfg.infra_repo_github, tag_sha + ); + + let resp = reqwest::Client::new() + .get(&url) + .header("User-Agent", "csfx-updater") + .header("Accept", "application/vnd.github.v3+json") + .send() + .await?; + + if !resp.status().is_success() { + bail!("GitHub API returned {} when dereferencing tag", resp.status()); + } + + let tag: GitHubTag = resp.json().await?; + Ok(tag.object.sha) +} diff --git a/control-plane/csfx-updater/src/updater.rs b/control-plane/csfx-updater/src/updater.rs new file mode 100644 index 0000000..fa0d092 --- /dev/null +++ b/control-plane/csfx-updater/src/updater.rs @@ -0,0 +1,26 @@ +use anyhow::{bail, Result}; +use tokio::process::Command; +use tracing::info; + +use crate::config::Config; + +pub async fn switch(cfg: &Config, flake_rev: &str) -> Result<()> { + let flake_url = format!( + "git+file://{}?rev={}", + cfg.infra_repo_mirror_dir, flake_rev + ); + + info!(flake_rev = %flake_rev, "running nixos-rebuild switch"); + + let status = Command::new("nixos-rebuild") + .args(["switch", "--flake", &flake_url]) + .status() + .await?; + + if !status.success() { + bail!("nixos-rebuild switch failed for rev {}", flake_rev); + } + + info!(flake_rev = %flake_rev, "nixos-rebuild switch complete"); + Ok(()) +} diff --git a/control-plane/failover-controller/src/main.rs b/control-plane/failover-controller/src/main.rs index 6369b46..926e5ed 100644 --- a/control-plane/failover-controller/src/main.rs +++ b/control-plane/failover-controller/src/main.rs @@ -15,7 +15,7 @@ async fn main() -> anyhow::Result<()> { logger::init_logger(); metrics::init(); - log_info!("main", "CSF Failover Controller starting..."); + log_info!("main", "CSFX Failover Controller starting..."); log_info!("main", &format!("Version: {}", env!("CARGO_PKG_VERSION"))); log_info!("main", "Connecting to 
database..."); @@ -34,7 +34,10 @@ async fn main() -> anyhow::Result<()> { .unwrap_or(8004); let addr = SocketAddr::from(([0, 0, 0, 0], port)); - log_info!("main", &format!("Failover Controller listening port={}", port)); + log_info!( + "main", + &format!("Failover Controller listening port={}", port) + ); let listener = tokio::net::TcpListener::bind(addr).await?; diff --git a/control-plane/failover-controller/src/metrics.rs b/control-plane/failover-controller/src/metrics.rs index 3c7cf5a..64d068b 100644 --- a/control-plane/failover-controller/src/metrics.rs +++ b/control-plane/failover-controller/src/metrics.rs @@ -8,20 +8,20 @@ static HTTP_REQUEST_DURATION_SECONDS: OnceLock = OnceLock::new(); pub fn init() { HTTP_REQUESTS_TOTAL.get_or_init(|| { register_counter_vec!( - "csf_http_requests_total", + "csfx_http_requests_total", "Total HTTP requests", &["method", "path", "status"] ) - .expect("failed to register csf_http_requests_total") + .expect("failed to register csfx_http_requests_total") }); HTTP_REQUEST_DURATION_SECONDS.get_or_init(|| { register_histogram_vec!( - "csf_http_request_duration_seconds", + "csfx_http_request_duration_seconds", "HTTP request duration in seconds", &["method", "path"] ) - .expect("failed to register csf_http_request_duration_seconds") + .expect("failed to register csfx_http_request_duration_seconds") }); } diff --git a/control-plane/registry/Cargo.toml b/control-plane/registry/Cargo.toml index c603976..744e278 100644 --- a/control-plane/registry/Cargo.toml +++ b/control-plane/registry/Cargo.toml @@ -38,6 +38,7 @@ chrono = { workspace = true, features = ["serde"] } sea-orm = { workspace = true } reqwest = { workspace = true } +etcd-client = { workspace = true } # Crypto sha2 = { workspace = true } diff --git a/control-plane/registry/README.md b/control-plane/registry/README.md index 9e990ec..e9c3682 100644 --- a/control-plane/registry/README.md +++ b/control-plane/registry/README.md @@ -1,4 +1,4 @@ -# CSF Registry Service +# CSFX 
Registry Service Sicherer Agent Registry Service mit Token-basierter Registrierung und API Key Management. @@ -126,7 +126,7 @@ Response: ```json { "agent_id": "660e8400-e29b-41d4-a716-446655440000", - "api_key": "csf_agent_xyz789...", + "api_key": "csfx_agent_xyz789...", "message": "Agent successfully registered" } ``` @@ -136,7 +136,7 @@ Response: ```bash curl -X POST http://localhost:8000/api/registry/agents/660e8400-e29b-41d4-a716-446655440000/heartbeat \ -H "Content-Type: application/json" \ - -H "X-API-Key: csf_agent_xyz789..." \ + -H "X-API-Key: csfx_agent_xyz789..." \ -d '{ "status": "online" }' @@ -256,7 +256,7 @@ lsof -i :8001 1. **API Key korrekt?** - Verwende den API Key aus der Registrierungs-Response - - Format: `X-API-Key: csf_agent_...` + - Format: `X-API-Key: csfx_agent_...` 2. **Agent ID korrekt?** - URL muss die korrekte Agent ID enthalten diff --git a/control-plane/registry/docker-compose.dev.yml b/control-plane/registry/docker-compose.dev.yml index 1ec29e1..aeef9ba 100644 --- a/control-plane/registry/docker-compose.dev.yml +++ b/control-plane/registry/docker-compose.dev.yml @@ -4,7 +4,7 @@ services: build: context: ../.. 
dockerfile: control-plane/registry/Dockerfile.dev - container_name: csf-registry + container_name: csfx-registry ports: - "8001:8001" environment: @@ -15,7 +15,7 @@ services: - ../../control-plane/registry/src:/app/control-plane/registry/src - ../../control-plane/shared:/app/control-plane/shared networks: - - csf-network + - csfx-network healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8001/health"] interval: 30s @@ -24,5 +24,5 @@ services: start_period: 40s networks: - csf-network: + csfx-network: driver: bridge diff --git a/control-plane/registry/src/handlers/agent.rs b/control-plane/registry/src/handlers/agent.rs index 7727626..23a8378 100644 --- a/control-plane/registry/src/handlers/agent.rs +++ b/control-plane/registry/src/handlers/agent.rs @@ -202,9 +202,15 @@ pub async fn heartbeat( } } + let desired_flake_rev = read_desired_flake_rev(&state.etcd_endpoints).await; + let post_update_heartbeats = + increment_post_update_heartbeats(&state.etcd_endpoints, agent_id).await; + Ok(Json(HeartbeatResponse { success: true, message: "Heartbeat recorded".to_string(), + desired_flake_rev, + post_update_heartbeats, })) } Err(e) => Err(( @@ -216,6 +222,47 @@ } } +async fn read_desired_flake_rev(etcd_endpoints: &str) -> Option<String> { + let mut client = etcd_client::Client::connect([etcd_endpoints], None) + .await + .ok()?; + + let resp = client + .get("/csfx/config/desired_flake_rev", None) + .await + .ok()?; + + resp.kvs() + .first() + .and_then(|kv| std::str::from_utf8(kv.value()).ok()) + .map(|s| s.to_string()) +} + +async fn increment_post_update_heartbeats(etcd_endpoints: &str, agent_id: Uuid) -> Option<u32> { + let key = format!("/csfx/nodes/{}/post_update_heartbeats", agent_id); + + let mut client = etcd_client::Client::connect([etcd_endpoints], None) + .await + .ok()?; + + let current: u32 = client + .get(key.as_str(), None) + .await + .ok() + .and_then(|r| r.kvs().first().map(|kv| kv.value().to_vec())) + .and_then(|v| 
std::str::from_utf8(&v).ok().and_then(|s| s.parse().ok())) + .unwrap_or(0); + + let next = current + 1; + + client + .put(key.as_str(), next.to_string().as_bytes(), None) + .await + .ok()?; + + Some(next) +} + async fn forward_container_statuses( state: &crate::server::AppState, statuses: Vec, diff --git a/control-plane/registry/src/main.rs b/control-plane/registry/src/main.rs index 79d0fc4..86b353c 100644 --- a/control-plane/registry/src/main.rs +++ b/control-plane/registry/src/main.rs @@ -16,7 +16,7 @@ async fn main() -> anyhow::Result<()> { logger::init_logger(); metrics::init(); - log_info!("main", "CSF Registry Service starting..."); + log_info!("main", "CSFX Registry Service starting..."); log_info!("main", &format!("Version: {}", env!("CARGO_PKG_VERSION"))); log_info!("main", "Connecting to database..."); @@ -25,7 +25,7 @@ async fn main() -> anyhow::Result<()> { .expect("Failed to connect to database"); log_info!("main", "Database connection established"); - let cert_ttl_hours: i64 = std::env::var("CSF_CERT_TTL_HOURS") + let cert_ttl_hours: i64 = std::env::var("CSFX_CERT_TTL_HOURS") .ok() .and_then(|v| v.parse().ok()) .unwrap_or(24); @@ -34,7 +34,9 @@ async fn main() -> anyhow::Result<()> { .expect("Failed to initialize PKI service"); let token_manager = Arc::new(services::tokens::TokenManager::new(db_conn.clone())); - let bootstrap_token_manager = Arc::new(services::bootstrap_tokens::BootstrapTokenManager::new(db_conn.clone())); + let bootstrap_token_manager = Arc::new(services::bootstrap_tokens::BootstrapTokenManager::new( + db_conn.clone(), + )); let api_key_manager = Arc::new(services::api_keys::ApiKeyManager::new(db_conn.clone())); let agent_registry = Arc::new(services::registry::AgentRegistry::new(db_conn.clone())); @@ -43,14 +45,17 @@ async fn main() -> anyhow::Result<()> { let scheduler_url = std::env::var("SCHEDULER_SERVICE_URL") .unwrap_or_else(|_| "http://localhost:8002".to_string()); - let gateway_url = std::env::var("API_GATEWAY_URL") - 
.unwrap_or_else(|_| "http://localhost:8000".to_string()); + let gateway_url = + std::env::var("API_GATEWAY_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()); let http_client = reqwest::Client::builder() .timeout(std::time::Duration::from_secs(5)) .build() .expect("Failed to build HTTP client"); + let etcd_endpoints = + std::env::var("ETCD_ENDPOINTS").unwrap_or_else(|_| "http://localhost:2379".to_string()); + let state = server::AppState { token_manager: token_manager.clone(), bootstrap_token_manager: bootstrap_token_manager.clone(), @@ -61,6 +66,7 @@ async fn main() -> anyhow::Result<()> { scheduler_url, gateway_url, http_client, + etcd_endpoints, }; let token_cleanup_handle = { diff --git a/control-plane/registry/src/metrics.rs b/control-plane/registry/src/metrics.rs index 3c7cf5a..64d068b 100644 --- a/control-plane/registry/src/metrics.rs +++ b/control-plane/registry/src/metrics.rs @@ -8,20 +8,20 @@ static HTTP_REQUEST_DURATION_SECONDS: OnceLock = OnceLock::new(); pub fn init() { HTTP_REQUESTS_TOTAL.get_or_init(|| { register_counter_vec!( - "csf_http_requests_total", + "csfx_http_requests_total", "Total HTTP requests", &["method", "path", "status"] ) - .expect("failed to register csf_http_requests_total") + .expect("failed to register csfx_http_requests_total") }); HTTP_REQUEST_DURATION_SECONDS.get_or_init(|| { register_histogram_vec!( - "csf_http_request_duration_seconds", + "csfx_http_request_duration_seconds", "HTTP request duration in seconds", &["method", "path"] ) - .expect("failed to register csf_http_request_duration_seconds") + .expect("failed to register csfx_http_request_duration_seconds") }); } diff --git a/control-plane/registry/src/models/agent.rs b/control-plane/registry/src/models/agent.rs index 4804ec6..7f5e857 100644 --- a/control-plane/registry/src/models/agent.rs +++ b/control-plane/registry/src/models/agent.rs @@ -135,6 +135,8 @@ pub struct HeartbeatRequest { pub struct HeartbeatResponse { pub success: bool, pub message: String, + 
pub desired_flake_rev: Option<String>, + pub post_update_heartbeats: Option<u32>, } #[derive(Debug, Serialize, Deserialize)] diff --git a/control-plane/registry/src/server.rs b/control-plane/registry/src/server.rs index fbdccc6..a84966b 100644 --- a/control-plane/registry/src/server.rs +++ b/control-plane/registry/src/server.rs @@ -31,6 +31,7 @@ pub struct AppState { pub scheduler_url: String, pub gateway_url: String, pub http_client: Client, + pub etcd_endpoints: String, } pub async fn health_check() -> impl IntoResponse { diff --git a/control-plane/registry/src/services/api_keys.rs b/control-plane/registry/src/services/api_keys.rs index 574e3a6..b1001b2 100644 --- a/control-plane/registry/src/services/api_keys.rs +++ b/control-plane/registry/src/services/api_keys.rs @@ -17,7 +17,7 @@ impl ApiKey { pub fn new(agent_id: Uuid) -> Self { Self { id: Uuid::new_v4(), - key: format!("csf_agent_{}", Uuid::new_v4().simple()), + key: format!("csfx_agent_{}", Uuid::new_v4().simple()), agent_id, created_at: Utc::now(), last_used: None, diff --git a/control-plane/registry/src/services/bootstrap_tokens.rs b/control-plane/registry/src/services/bootstrap_tokens.rs index 5190062..14754a1 100644 --- a/control-plane/registry/src/services/bootstrap_tokens.rs +++ b/control-plane/registry/src/services/bootstrap_tokens.rs @@ -3,7 +3,7 @@ use sea_orm::DatabaseConnection; use serde::{Deserialize, Serialize}; use uuid::Uuid; -const TOKEN_PREFIX: &str = "csf-bootstrap."; +const TOKEN_PREFIX: &str = "csfx-bootstrap."; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BootstrapToken { diff --git a/control-plane/registry/src/services/pki.rs b/control-plane/registry/src/services/pki.rs index 680cb09..8f6ba56 100644 --- a/control-plane/registry/src/services/pki.rs +++ b/control-plane/registry/src/services/pki.rs @@ -39,12 +39,11 @@ impl PkiService { fn load_or_generate_ca() -> Result { match ( - std::env::var("CSF_CA_CERT_PEM"), - std::env::var("CSF_CA_KEY_PEM"), + std::env::var("CSFX_CA_CERT_PEM"), 
+ std::env::var("CSFX_CA_KEY_PEM"), ) { (Ok(cert_pem), Ok(key_pem)) => { - KeyPair::from_pem(&key_pem) - .map_err(|e| anyhow!("Failed to load CA key: {}", e))?; + KeyPair::from_pem(&key_pem).map_err(|e| anyhow!("Failed to load CA key: {}", e))?; crate::log_info!("pki", "CA loaded from environment"); @@ -57,7 +56,7 @@ impl PkiService { _ => { crate::log_warn!( "pki", - "CSF_CA_CERT_PEM/CSF_CA_KEY_PEM not set, generating ephemeral CA" + "CSFX_CA_CERT_PEM/CSFX_CA_KEY_PEM not set, generating ephemeral CA" ); Self::generate_ca() } @@ -70,8 +69,12 @@ impl PkiService { let mut params = CertificateParams::default(); params.is_ca = IsCa::Ca(BasicConstraints::Unconstrained); - params.distinguished_name.push(DnType::CommonName, "CSF Internal CA"); - params.distinguished_name.push(DnType::OrganizationName, "CS-Foundry"); + params + .distinguished_name + .push(DnType::CommonName, "CSFX Internal CA"); + params + .distinguished_name + .push(DnType::OrganizationName, "CSFX"); params.key_usages = vec![KeyUsagePurpose::KeyCertSign, KeyUsagePurpose::CrlSign]; params.not_before = rcgen::date_time_ymd(2024, 1, 1); params.not_after = rcgen::date_time_ymd(2035, 1, 1); @@ -154,10 +157,9 @@ impl PkiService { agent_id: Uuid, new_csr_pem: &str, ) -> Result { - if let Some(old_cert) = - crate::db::certificates::get_active_certificate(&self.db, agent_id) - .await - .map_err(|e| anyhow!("DB error: {}", e))? + if let Some(old_cert) = crate::db::certificates::get_active_certificate(&self.db, agent_id) + .await + .map_err(|e| anyhow!("DB error: {}", e))? 
{ crate::db::certificates::revoke_certificate( &self.db, @@ -180,24 +182,15 @@ impl PkiService { self.issue_certificate(agent_id, new_csr_pem).await } - pub async fn revoke_agent_certificate( - &self, - agent_id: Uuid, - reason: String, - ) -> Result<()> { + pub async fn revoke_agent_certificate(&self, agent_id: Uuid, reason: String) -> Result<()> { let cert = crate::db::certificates::get_active_certificate(&self.db, agent_id) .await .map_err(|e| anyhow!("DB error: {}", e))? .ok_or_else(|| anyhow!("No active certificate for agent: {}", agent_id))?; - crate::db::certificates::revoke_certificate( - &self.db, - cert.serial_number, - agent_id, - reason, - ) - .await - .map_err(|e| anyhow!("Failed to revoke certificate: {}", e))?; + crate::db::certificates::revoke_certificate(&self.db, cert.serial_number, agent_id, reason) + .await + .map_err(|e| anyhow!("Failed to revoke certificate: {}", e))?; crate::log_info!( "pki", diff --git a/control-plane/registry/test-registry.sh b/control-plane/registry/test-registry.sh index b45d540..a649876 100755 --- a/control-plane/registry/test-registry.sh +++ b/control-plane/registry/test-registry.sh @@ -1,12 +1,12 @@ #!/bin/bash -# CSF Registry Service Test Script +# CSFX Registry Service Test Script set -e REGISTRY_URL="http://localhost:8001" -echo "🧪 Testing CSF Registry Service" +echo "🧪 Testing CSFX Registry Service" echo "================================" echo "" @@ -137,7 +137,7 @@ echo "" # 11. Deregister Agent echo "1️⃣1️⃣ Deregistering agent..." 
-curl -s -X POST "${REGISTRY_URL}/admin/agents/${AGENT_ID}" +curl -s -X POST "${REGISTRY_URL}/admin/agents/${AGENT_ID}" echo "✅ Agent deregistered" echo "" diff --git a/control-plane/scheduler/src/main.rs b/control-plane/scheduler/src/main.rs index c4572ca..dab7fdf 100644 --- a/control-plane/scheduler/src/main.rs +++ b/control-plane/scheduler/src/main.rs @@ -17,7 +17,7 @@ async fn main() -> anyhow::Result<()> { logger::init_logger(); metrics::init(); - log_info!("main", "CSF Scheduler Service starting..."); + log_info!("main", "CSFX Scheduler Service starting..."); log_info!("main", &format!("Version: {}", env!("CARGO_PKG_VERSION"))); log_info!("main", "Connecting to database..."); @@ -26,18 +26,24 @@ async fn main() -> anyhow::Result<()> { .expect("Failed to connect to database"); log_info!("main", "Database connection established"); - let etcd_endpoints = std::env::var("ETCD_ENDPOINTS") - .unwrap_or_else(|_| "http://localhost:2379".to_string()); + let etcd_endpoints = + std::env::var("ETCD_ENDPOINTS").unwrap_or_else(|_| "http://localhost:2379".to_string()); let etcd_endpoints: Vec<&str> = etcd_endpoints.split(',').collect(); - log_info!("main", &format!("Connecting to etcd endpoints={}", etcd_endpoints.join(","))); + log_info!( + "main", + &format!("Connecting to etcd endpoints={}", etcd_endpoints.join(",")) + ); let etcd = etcd_client::Client::connect(etcd_endpoints, None) .await .expect("Failed to connect to etcd"); log_info!("main", "etcd connection established"); let etcd = Arc::new(Mutex::new(etcd)); - let scheduler = Arc::new(services::scheduler::SchedulerService::new(db.clone(), etcd.clone())); + let scheduler = Arc::new(services::scheduler::SchedulerService::new( + db.clone(), + etcd.clone(), + )); let state = server::AppState { db, diff --git a/control-plane/scheduler/src/metrics.rs b/control-plane/scheduler/src/metrics.rs index 3c7cf5a..64d068b 100644 --- a/control-plane/scheduler/src/metrics.rs +++ b/control-plane/scheduler/src/metrics.rs @@ -8,20 
+8,20 @@ static HTTP_REQUEST_DURATION_SECONDS: OnceLock = OnceLock::new(); pub fn init() { HTTP_REQUESTS_TOTAL.get_or_init(|| { register_counter_vec!( - "csf_http_requests_total", + "csfx_http_requests_total", "Total HTTP requests", &["method", "path", "status"] ) - .expect("failed to register csf_http_requests_total") + .expect("failed to register csfx_http_requests_total") }); HTTP_REQUEST_DURATION_SECONDS.get_or_init(|| { register_histogram_vec!( - "csf_http_request_duration_seconds", + "csfx_http_request_duration_seconds", "HTTP request duration in seconds", &["method", "path"] ) - .expect("failed to register csf_http_request_duration_seconds") + .expect("failed to register csfx_http_request_duration_seconds") }); } diff --git a/control-plane/scheduler/src/services/etcd.rs b/control-plane/scheduler/src/services/etcd.rs index 7e9aa6a..18db45f 100644 --- a/control-plane/scheduler/src/services/etcd.rs +++ b/control-plane/scheduler/src/services/etcd.rs @@ -19,7 +19,7 @@ pub async fn put_placement( etcd: &Arc>, record: &PlacementRecord, ) -> Result<(), String> { - let key = format!("/csf/placements/{}", record.workload_id); + let key = format!("/csfx/placements/{}", record.workload_id); let value = serde_json::to_string(record) .map_err(|e| format!("Failed to serialize placement: {}", e))?; @@ -36,7 +36,7 @@ pub async fn delete_placement( etcd: &Arc>, workload_id: Uuid, ) -> Result<(), String> { - let key = format!("/csf/placements/{}", workload_id); + let key = format!("/csfx/placements/{}", workload_id); etcd.lock() .await diff --git a/control-plane/sdn-controller/src/main.rs b/control-plane/sdn-controller/src/main.rs index 9c13770..47fa396 100644 --- a/control-plane/sdn-controller/src/main.rs +++ b/control-plane/sdn-controller/src/main.rs @@ -15,7 +15,7 @@ async fn main() -> anyhow::Result<()> { logger::init_logger(); metrics::init(); - log_info!("main", "CSF SDN Controller starting..."); + log_info!("main", "CSFX SDN Controller starting..."); log_info!("main", 
&format!("Version: {}", env!("CARGO_PKG_VERSION"))); log_info!("main", "Connecting to database..."); @@ -24,7 +24,8 @@ async fn main() -> anyhow::Result<()> { .expect("Failed to connect to database"); log_info!("main", "Database connection established"); - let etcd_url = std::env::var("ETCD_URL").unwrap_or_else(|_| "http://localhost:2379".to_string()); + let etcd_url = + std::env::var("ETCD_URL").unwrap_or_else(|_| "http://localhost:2379".to_string()); log_info!("main", &format!("Connecting to etcd url={}", etcd_url)); let etcd = etcd_client::Client::connect([etcd_url.as_str()], None) .await diff --git a/control-plane/sdn-controller/src/metrics.rs b/control-plane/sdn-controller/src/metrics.rs index 3c7cf5a..64d068b 100644 --- a/control-plane/sdn-controller/src/metrics.rs +++ b/control-plane/sdn-controller/src/metrics.rs @@ -8,20 +8,20 @@ static HTTP_REQUEST_DURATION_SECONDS: OnceLock = OnceLock::new(); pub fn init() { HTTP_REQUESTS_TOTAL.get_or_init(|| { register_counter_vec!( - "csf_http_requests_total", + "csfx_http_requests_total", "Total HTTP requests", &["method", "path", "status"] ) - .expect("failed to register csf_http_requests_total") + .expect("failed to register csfx_http_requests_total") }); HTTP_REQUEST_DURATION_SECONDS.get_or_init(|| { register_histogram_vec!( - "csf_http_request_duration_seconds", + "csfx_http_request_duration_seconds", "HTTP request duration in seconds", &["method", "path"] ) - .expect("failed to register csf_http_request_duration_seconds") + .expect("failed to register csfx_http_request_duration_seconds") }); } diff --git a/control-plane/sdn-controller/src/services/ipam.rs b/control-plane/sdn-controller/src/services/ipam.rs index 5e38a2e..f1753bc 100644 --- a/control-plane/sdn-controller/src/services/ipam.rs +++ b/control-plane/sdn-controller/src/services/ipam.rs @@ -3,7 +3,7 @@ use etcd_client::Client; use std::net::Ipv4Addr; use uuid::Uuid; -const IPAM_PREFIX: &str = "/csf/ipam/"; +const IPAM_PREFIX: &str = "/csfx/ipam/"; 
#[derive(Clone)] pub struct IpamService { @@ -58,7 +58,7 @@ impl IpamService { overlay_ip: &str, public_key: Option<&str>, ) -> Result<()> { - let base_key = format!("/csf/peers/{}/{}", network_id, node_id); + let base_key = format!("/csfx/peers/{}/{}", network_id, node_id); self.etcd .put(format!("{}/overlay_ip", base_key).as_str(), overlay_ip, None) .await @@ -75,7 +75,7 @@ impl IpamService { } pub async fn remove_peer(&mut self, network_id: Uuid, node_id: &str) -> Result<()> { - let prefix = format!("/csf/peers/{}/{}", network_id, node_id); + let prefix = format!("/csfx/peers/{}/{}", network_id, node_id); self.etcd .delete( prefix.as_str(), diff --git a/control-plane/shared/Cargo.toml b/control-plane/shared/Cargo.toml index 072d6a6..4469933 100644 --- a/control-plane/shared/Cargo.toml +++ b/control-plane/shared/Cargo.toml @@ -11,4 +11,4 @@ version = "0.2.2" edition = "2021" authors = ["CS-Foundry"] license = "SEE LICENSE IN LICENSE" -repository = "https://github.com/CS-Foundry/CSF-Core" +repository = "https://github.com/CS-Foundry/CSFX-Core" diff --git a/control-plane/volume-manager/cleanup-etcd.sh b/control-plane/volume-manager/cleanup-etcd.sh index 93eb3aa..6aef160 100755 --- a/control-plane/volume-manager/cleanup-etcd.sh +++ b/control-plane/volume-manager/cleanup-etcd.sh @@ -9,19 +9,19 @@ COLOR_RESET='\033[0m' echo -e "${COLOR_YELLOW}🧹 Cleaning etcd data...${COLOR_RESET}" # Lösche alle alten Daten -echo "Deleting all keys under /csf/volume-manager/..." +echo "Deleting all keys under /csfx/volume-manager/..." 
# Nodes -ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 del /csf/volume-manager/nodes/ --prefix +ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 del /csfx/volume-manager/nodes/ --prefix # Leader -ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 del /csf/volume-manager/election/ --prefix +ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 del /csfx/volume-manager/election/ --prefix # Volumes -ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 del /csf/volume-manager/volumes/ --prefix +ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 del /csfx/volume-manager/volumes/ --prefix # Snapshots -ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 del /csf/volume-manager/snapshots/ --prefix +ETCDCTL_API=3 etcdctl --endpoints=localhost:2379 del /csfx/volume-manager/snapshots/ --prefix echo -e "${COLOR_GREEN}✅ etcd cleaned!${COLOR_RESET}" echo "" diff --git a/control-plane/volume-manager/docker-compose.dev.yml b/control-plane/volume-manager/docker-compose.dev.yml index 5b40d3a..c688094 100644 --- a/control-plane/volume-manager/docker-compose.dev.yml +++ b/control-plane/volume-manager/docker-compose.dev.yml @@ -20,7 +20,7 @@ services: - ceph-mon1-data:/var/lib/ceph - ./ceph-config/ceph.conf:/etc/ceph/ceph.conf:ro networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.21 cap_add: - ALL @@ -50,7 +50,7 @@ services: - ceph-mon2-data:/var/lib/ceph - ./ceph-config/ceph.conf:/etc/ceph/ceph.conf:ro networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.22 cap_add: - ALL @@ -82,7 +82,7 @@ services: - ceph-mon3-data:/var/lib/ceph - ./ceph-config/ceph.conf:/etc/ceph/ceph.conf:ro networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.23 cap_add: - ALL @@ -112,7 +112,7 @@ services: - ceph-osd1-data:/var/lib/ceph/osd - ./ceph-config/ceph.conf:/etc/ceph/ceph.conf:ro networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.31 cap_add: - ALL @@ -138,7 +138,7 @@ services: - ceph-osd2-data:/var/lib/ceph/osd - ./ceph-config/ceph.conf:/etc/ceph/ceph.conf:ro networks: - csf-test: + csfx-test: 
ipv4_address: 172.20.0.32 cap_add: - ALL @@ -164,7 +164,7 @@ services: - ceph-osd3-data:/var/lib/ceph/osd - ./ceph-config/ceph.conf:/etc/ceph/ceph.conf:ro networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.33 cap_add: - ALL @@ -190,14 +190,14 @@ services: - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380 - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 - ETCD_ADVERTISE_CLIENT_URLS=http://etcd1:2379 - - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster-csf + - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster-csfx - ETCD_INITIAL_CLUSTER=etcd1=http://etcd1:2380,etcd2=http://etcd2:2380,etcd3=http://etcd3:2380 - ETCD_INITIAL_CLUSTER_STATE=new ports: - "2379:2379" - "2380:2380" networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.11 volumes: - etcd1-data:/etcd-data @@ -214,14 +214,14 @@ services: - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380 - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 - ETCD_ADVERTISE_CLIENT_URLS=http://etcd2:2379 - - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster-csf + - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster-csfx - ETCD_INITIAL_CLUSTER=etcd1=http://etcd1:2380,etcd2=http://etcd2:2380,etcd3=http://etcd3:2380 - ETCD_INITIAL_CLUSTER_STATE=new ports: - "2479:2379" - "2480:2380" networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.12 volumes: - etcd2-data:/etcd-data @@ -238,14 +238,14 @@ services: - ETCD_LISTEN_PEER_URLS=http://0.0.0.0:2380 - ETCD_LISTEN_CLIENT_URLS=http://0.0.0.0:2379 - ETCD_ADVERTISE_CLIENT_URLS=http://etcd3:2379 - - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster-csf + - ETCD_INITIAL_CLUSTER_TOKEN=etcd-cluster-csfx - ETCD_INITIAL_CLUSTER=etcd1=http://etcd1:2380,etcd2=http://etcd2:2380,etcd3=http://etcd3:2380 - ETCD_INITIAL_CLUSTER_STATE=new ports: - "2579:2379" - "2580:2380" networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.13 volumes: - etcd3-data:/etcd-data @@ -262,7 +262,7 @@ services: environment: - PATRONI_NAME=patroni1 - ETCD_HOST=etcd1:2379 - - PATRONI_SCOPE=postgres-csf + - PATRONI_SCOPE=postgres-csfx - 
PATRONI_ETCD3_HOSTS=etcd1:2379,etcd2:2379,etcd3:2379 - PATRONI_ETCD3_PROTOCOL=http @@ -281,9 +281,9 @@ services: # Application User - PATRONI_POSTGRESQL_PGPASS=/tmp/pgpass - - POSTGRES_USER=csf - - POSTGRES_PASSWORD=csfpassword - - POSTGRES_DB=csf_core + - POSTGRES_USER=csfx + - POSTGRES_PASSWORD=csfxpassword + - POSTGRES_DB=csfx_core # REST API - PATRONI_RESTAPI_LISTEN=0.0.0.0:8008 @@ -307,7 +307,7 @@ services: volumes: - patroni1-data:/home/postgres/pgdata networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.41 ports: - "5441:5432" @@ -334,7 +334,7 @@ services: environment: - ETCD_HOST=etcd1:2379 - PATRONI_NAME=patroni2 - - PATRONI_SCOPE=postgres-csf + - PATRONI_SCOPE=postgres-csfx - PATRONI_ETCD3_HOSTS=etcd1:2379,etcd2:2379,etcd3:2379 - PATRONI_ETCD3_PROTOCOL=http @@ -349,9 +349,9 @@ services: - PATRONI_SUPERUSER_PASSWORD=postgrespass - PATRONI_POSTGRESQL_PGPASS=/tmp/pgpass - - POSTGRES_USER=csf - - POSTGRES_PASSWORD=csfpassword - - POSTGRES_DB=csf_core + - POSTGRES_USER=csfx + - POSTGRES_PASSWORD=csfxpassword + - POSTGRES_DB=csfx_core - PATRONI_RESTAPI_LISTEN=0.0.0.0:8008 - PATRONI_RESTAPI_CONNECT_ADDRESS=patroni2:8008 @@ -372,7 +372,7 @@ services: volumes: - patroni2-data:/home/postgres/pgdata networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.42 ports: - "5442:5432" @@ -399,7 +399,7 @@ services: environment: - ETCD_HOST=etcd1:2379 - PATRONI_NAME=patroni3 - - PATRONI_SCOPE=postgres-csf + - PATRONI_SCOPE=postgres-csfx - PATRONI_ETCD3_HOSTS=etcd1:2379,etcd2:2379,etcd3:2379 - PATRONI_ETCD3_PROTOCOL=http @@ -414,9 +414,9 @@ services: - PATRONI_SUPERUSER_PASSWORD=postgrespass - PATRONI_POSTGRESQL_PGPASS=/tmp/pgpass - - POSTGRES_USER=csf - - POSTGRES_PASSWORD=csfpassword - - POSTGRES_DB=csf_core + - POSTGRES_USER=csfx + - POSTGRES_PASSWORD=csfxpassword + - POSTGRES_DB=csfx_core - PATRONI_RESTAPI_LISTEN=0.0.0.0:8008 - PATRONI_RESTAPI_CONNECT_ADDRESS=patroni3:8008 @@ -437,7 +437,7 @@ services: volumes: - patroni3-data:/home/postgres/pgdata networks: - 
csf-test: + csfx-test: ipv4_address: 172.20.0.43 ports: - "5443:5432" @@ -471,7 +471,7 @@ services: - "5433:5433" # Read Port (Replicas) - "8000:8000" # HAProxy Stats networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.40 depends_on: - patroni1 @@ -500,13 +500,13 @@ services: - ETCD_ENDPOINTS=http://etcd1:2379,http://etcd2:2379,http://etcd3:2379 - NODE_ID=vm-1 - CEPH_MON_HOSTS=ceph-mon1:6789,ceph-mon2:6789,ceph-mon3:6789 - - CEPH_DEFAULT_POOL=csf-data + - CEPH_DEFAULT_POOL=csfx-data - CEPH_PG_NUM=128 - CEPH_DEFAULT_REPLICATION=3 - - PATRONI_SCOPE=postgres-csf + - PATRONI_SCOPE=postgres-csfx - PATRONI_NODES=patroni1:8008,patroni2:8008,patroni3:8008 networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.51 depends_on: - etcd1 @@ -532,13 +532,13 @@ services: - ETCD_ENDPOINTS=http://etcd1:2379,http://etcd2:2379,http://etcd3:2379 - NODE_ID=vm-2 - CEPH_MON_HOSTS=ceph-mon1:6789,ceph-mon2:6789,ceph-mon3:6789 - - CEPH_DEFAULT_POOL=csf-data + - CEPH_DEFAULT_POOL=csfx-data - CEPH_PG_NUM=128 - CEPH_DEFAULT_REPLICATION=3 - - PATRONI_SCOPE=postgres-csf + - PATRONI_SCOPE=postgres-csfx - PATRONI_NODES=patroni1:8008,patroni2:8008,patroni3:8008 networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.52 depends_on: - etcd1 @@ -564,13 +564,13 @@ services: - ETCD_ENDPOINTS=http://etcd1:2379,http://etcd2:2379,http://etcd3:2379 - NODE_ID=vm-3 - CEPH_MON_HOSTS=ceph-mon1:6789,ceph-mon2:6789,ceph-mon3:6789 - - CEPH_DEFAULT_POOL=csf-data + - CEPH_DEFAULT_POOL=csfx-data - CEPH_PG_NUM=128 - CEPH_DEFAULT_REPLICATION=3 - - PATRONI_SCOPE=postgres-csf + - PATRONI_SCOPE=postgres-csfx - PATRONI_NODES=patroni1:8008,patroni2:8008,patroni3:8008 networks: - csf-test: + csfx-test: ipv4_address: 172.20.0.53 depends_on: - etcd1 @@ -585,7 +585,7 @@ services: restart: unless-stopped networks: - csf-test: + csfx-test: driver: bridge ipam: config: diff --git a/control-plane/volume-manager/src/ceph/core/config.rs b/control-plane/volume-manager/src/ceph/core/config.rs index 7bcf4f7..735ae1f 100644 
--- a/control-plane/volume-manager/src/ceph/core/config.rs +++ b/control-plane/volume-manager/src/ceph/core/config.rs @@ -24,7 +24,7 @@ impl CephConfig { keyring_path: env::var("CEPH_KEYRING").ok(), client_name: env::var("CEPH_CLIENT_NAME").unwrap_or_else(|_| "admin".to_string()), default_pool: env::var("CEPH_DEFAULT_POOL") - .unwrap_or_else(|_| "csf-volumes".to_string()), + .unwrap_or_else(|_| "csfx-volumes".to_string()), default_pg_num: env::var("CEPH_PG_NUM") .ok() .and_then(|s| s.parse().ok()) @@ -72,7 +72,7 @@ impl Default for CephConfig { ], keyring_path: None, client_name: "admin".to_string(), - default_pool: "csf-volumes".to_string(), + default_pool: "csfx-volumes".to_string(), default_pg_num: 128, default_replication: 3, } diff --git a/control-plane/volume-manager/src/ceph/ops/init.rs b/control-plane/volume-manager/src/ceph/ops/init.rs index d0015a8..405be9f 100644 --- a/control-plane/volume-manager/src/ceph/ops/init.rs +++ b/control-plane/volume-manager/src/ceph/ops/init.rs @@ -54,14 +54,14 @@ pub async fn init_ceph() -> Result { min_size: 2, }, CephPool { - name: "csf-postgres".to_string(), + name: "csfx-postgres".to_string(), pg_num: 64, pgp_num: 64, size: config.default_replication, min_size: 2, }, CephPool { - name: "csf-metadata".to_string(), + name: "csfx-metadata".to_string(), pg_num: 32, pgp_num: 32, size: config.default_replication, @@ -108,7 +108,7 @@ pub async fn create_postgres_volumes(ceph: &CephManager, node_count: u32) -> Res let volume = crate::ceph::storage::types::CephVolume { name: volume_name.clone(), - pool: "csf-postgres".to_string(), + pool: "csfx-postgres".to_string(), size_mb: 10240, // 10 GB features: vec!["layering".to_string(), "exclusive-lock".to_string()], encrypted: false, diff --git a/control-plane/volume-manager/src/db/volumes.rs b/control-plane/volume-manager/src/db/volumes.rs index 59592cf..44a3d2a 100644 --- a/control-plane/volume-manager/src/db/volumes.rs +++ b/control-plane/volume-manager/src/db/volumes.rs @@ -14,7 
+14,7 @@ pub async fn create( db: &DatabaseConnection, req: &CreateVolumeRequest, ) -> Result { - let pool = req.pool.clone().unwrap_or_else(|| "csf-volumes".to_string()); + let pool = req.pool.clone().unwrap_or_else(|| "csfx-volumes".to_string()); let image_name = format!("{}-{}", req.name, Uuid::new_v4()); let model = volumes::ActiveModel { diff --git a/control-plane/volume-manager/src/etcd/core/config.rs b/control-plane/volume-manager/src/etcd/core/config.rs index 9f9e23c..40091dc 100644 --- a/control-plane/volume-manager/src/etcd/core/config.rs +++ b/control-plane/volume-manager/src/etcd/core/config.rs @@ -36,7 +36,7 @@ impl Default for EtcdConfig { request_timeout: Duration::from_secs(10), keepalive_interval: Duration::from_secs(30), keepalive_timeout: Duration::from_secs(10), - namespace: "/csf/volume-manager".to_string(), + namespace: "/csfx/volume-manager".to_string(), username: None, password: None, } @@ -53,7 +53,7 @@ impl EtcdConfig { .collect(); let namespace = - std::env::var("ETCD_NAMESPACE").unwrap_or_else(|_| "/csf/volume-manager".to_string()); + std::env::var("ETCD_NAMESPACE").unwrap_or_else(|_| "/csfx/volume-manager".to_string()); let username = std::env::var("ETCD_USERNAME").ok(); let password = std::env::var("ETCD_PASSWORD").ok(); diff --git a/control-plane/volume-manager/src/main.rs b/control-plane/volume-manager/src/main.rs index 148ab15..a411c5d 100644 --- a/control-plane/volume-manager/src/main.rs +++ b/control-plane/volume-manager/src/main.rs @@ -19,7 +19,7 @@ async fn main() -> anyhow::Result<()> { logger::init_logger(); metrics::init(); - log_info!("main", "CSF Volume Manager starting"); + log_info!("main", "CSFX Volume Manager starting"); log_info!("main", &format!("Version: {}", env!("CARGO_PKG_VERSION"))); let db = shared::establish_connection() @@ -27,11 +27,14 @@ async fn main() -> anyhow::Result<()> { .expect("Failed to connect to database"); log_info!("main", "Database connection established"); - let etcd_endpoints = 
std::env::var("ETCD_ENDPOINTS") - .unwrap_or_else(|_| "http://localhost:2379".to_string()); + let etcd_endpoints = + std::env::var("ETCD_ENDPOINTS").unwrap_or_else(|_| "http://localhost:2379".to_string()); let etcd_endpoints: Vec<&str> = etcd_endpoints.split(',').collect(); - log_info!("main", &format!("Connecting to etcd endpoints={}", etcd_endpoints.join(","))); + log_info!( + "main", + &format!("Connecting to etcd endpoints={}", etcd_endpoints.join(",")) + ); let etcd = etcd_client::Client::connect(etcd_endpoints, None) .await .expect("Failed to connect to etcd"); @@ -45,7 +48,10 @@ async fn main() -> anyhow::Result<()> { Some(Arc::new(manager)) } Err(e) => { - log_warn!("main", &format!("Ceph not available (continuing without): {}", e)); + log_warn!( + "main", + &format!("Ceph not available (continuing without): {}", e) + ); None } }; diff --git a/control-plane/volume-manager/src/metrics.rs b/control-plane/volume-manager/src/metrics.rs index 3c7cf5a..64d068b 100644 --- a/control-plane/volume-manager/src/metrics.rs +++ b/control-plane/volume-manager/src/metrics.rs @@ -8,20 +8,20 @@ static HTTP_REQUEST_DURATION_SECONDS: OnceLock = OnceLock::new(); pub fn init() { HTTP_REQUESTS_TOTAL.get_or_init(|| { register_counter_vec!( - "csf_http_requests_total", + "csfx_http_requests_total", "Total HTTP requests", &["method", "path", "status"] ) - .expect("failed to register csf_http_requests_total") + .expect("failed to register csfx_http_requests_total") }); HTTP_REQUEST_DURATION_SECONDS.get_or_init(|| { register_histogram_vec!( - "csf_http_request_duration_seconds", + "csfx_http_request_duration_seconds", "HTTP request duration in seconds", &["method", "path"] ) - .expect("failed to register csf_http_request_duration_seconds") + .expect("failed to register csfx_http_request_duration_seconds") }); } diff --git a/control-plane/volume-manager/test-hybrid-system.sh b/control-plane/volume-manager/test-hybrid-system.sh index 2a9e5a6..e4034c5 100755 --- 
a/control-plane/volume-manager/test-hybrid-system.sh +++ b/control-plane/volume-manager/test-hybrid-system.sh @@ -157,13 +157,13 @@ check_patroni_health() { check_volume_manager_health() { log_step "Checking Volume Manager..." - local leader=$(etcdctl --endpoints=localhost:2379 get /csf/volume-manager/election/leader --print-value-only 2>/dev/null) + local leader=$(etcdctl --endpoints=localhost:2379 get /csfx/volume-manager/election/leader --print-value-only 2>/dev/null) if [ -n "$leader" ]; then log_success "Volume Manager leader: $leader" # Count nodes - local node_count=$(etcdctl --endpoints=localhost:2379 get /csf/volume-manager/nodes/ --prefix --keys-only 2>/dev/null | grep -c "/csf/volume-manager/nodes/" || echo "0") + local node_count=$(etcdctl --endpoints=localhost:2379 get /csfx/volume-manager/nodes/ --prefix --keys-only 2>/dev/null | grep -c "/csfx/volume-manager/nodes/" || echo "0") log_info "Registered nodes: $node_count" return 0 else @@ -204,18 +204,18 @@ test_data_replication() { local test_data="hybrid_test_$(date +%s)" log_step "Creating test table..." - docker exec patroni1 psql -U csf -d csf_core -c \ + docker exec patroni1 psql -U csfx -d csfx_core -c \ "CREATE TABLE IF NOT EXISTS hybrid_test (id SERIAL PRIMARY KEY, data TEXT, created_at TIMESTAMP DEFAULT NOW());" &>/dev/null log_step "Writing test data to primary..." - docker exec patroni1 psql -U csf -d csf_core -c \ + docker exec patroni1 psql -U csfx -d csfx_core -c \ "INSERT INTO hybrid_test (data) VALUES ('$test_data');" &>/dev/null # Wait for replication sleep 2 log_step "Verifying data on replica..." 
- local result=$(docker exec patroni2 psql -U csf -d csf_core -t -c \ + local result=$(docker exec patroni2 psql -U csfx -d csfx_core -t -c \ "SELECT data FROM hybrid_test WHERE data='$test_data';" 2>/dev/null | xargs) if [ "$result" == "$test_data" ]; then @@ -296,7 +296,7 @@ test_postgres_failover() { # Test connectivity sleep 2 - if docker exec $new_primary psql -U csf -d csf_core -c "SELECT 1;" &>/dev/null; then + if docker exec $new_primary psql -U csfx -d csfx_core -c "SELECT 1;" &>/dev/null; then log_success "New primary accepting connections" fi @@ -348,7 +348,7 @@ test_ceph_failover() { echo "" log_step "Testing PostgreSQL availability..." - if docker exec patroni1 psql -U csf -d csf_core -c "SELECT version();" &>/dev/null; then + if docker exec patroni1 psql -U csfx -d csfx_core -c "SELECT version();" &>/dev/null; then log_success "PostgreSQL still fully operational (Ceph has 2 remaining replicas)" else log_error "PostgreSQL affected by OSD failure" @@ -375,7 +375,7 @@ test_ceph_failover() { test_volume_manager_failover() { log_header "Test 5: Volume Manager Failover" - local current_leader=$(etcdctl --endpoints=localhost:2379 get /csf/volume-manager/election/leader --print-value-only 2>/dev/null) + local current_leader=$(etcdctl --endpoints=localhost:2379 get /csfx/volume-manager/election/leader --print-value-only 2>/dev/null) if [ -z "$current_leader" ]; then log_error "No leader found" @@ -398,7 +398,7 @@ test_volume_manager_failover() { log_step "Waiting for leader re-election (10s)..." sleep 10 - local new_leader=$(etcdctl --endpoints=localhost:2379 get /csf/volume-manager/election/leader --print-value-only 2>/dev/null) + local new_leader=$(etcdctl --endpoints=localhost:2379 get /csfx/volume-manager/election/leader --print-value-only 2>/dev/null) if [ -n "$new_leader" ] && [ "$new_leader" != "$current_leader" ]; then log_success "New leader elected: $new_leader" @@ -457,11 +457,11 @@ test_e2e_integration() { log_step "Testing complete data flow..." 
local test_val="e2e_test_$(date +%s)" - if docker exec patroni1 psql -U csf -d csf_core -c \ + if docker exec patroni1 psql -U csfx -d csfx_core -c \ "CREATE TABLE IF NOT EXISTS e2e_test (val TEXT); INSERT INTO e2e_test VALUES ('$test_val');" &>/dev/null; then sleep 2 - local result=$(docker exec patroni2 psql -U csf -d csf_core -t -c \ + local result=$(docker exec patroni2 psql -U csfx -d csfx_core -t -c \ "SELECT val FROM e2e_test WHERE val='$test_val';" 2>/dev/null | xargs) if [ "$result" == "$test_val" ]; then @@ -488,7 +488,7 @@ test_performance_metrics() { echo "" # PostgreSQL connections - local pg_connections=$(docker exec patroni1 psql -U csf -d csf_core -t -c \ + local pg_connections=$(docker exec patroni1 psql -U csfx -d csfx_core -t -c \ "SELECT count(*) FROM pg_stat_activity;" 2>/dev/null | xargs) echo -e "${CYAN}PostgreSQL Connections:${NC} $pg_connections" @@ -516,7 +516,7 @@ test_live_monitoring() { # etcd echo -e "${CYAN}🔑 etcd Leader:${NC}" - etcdctl --endpoints=localhost:2379 get /csf/volume-manager/election/leader --print-value-only 2>/dev/null || echo "none" + etcdctl --endpoints=localhost:2379 get /csfx/volume-manager/election/leader --print-value-only 2>/dev/null || echo "none" echo "" # Ceph @@ -595,7 +595,7 @@ test_chaos() { # Scenario 3: Kill Volume Manager leader log_info "🔥 Scenario 3: Killing Volume Manager leader..." - local leader=$(etcdctl --endpoints=localhost:2379 get /csf/volume-manager/election/leader --print-value-only 2>/dev/null) + local leader=$(etcdctl --endpoints=localhost:2379 get /csfx/volume-manager/election/leader --print-value-only 2>/dev/null) if [ -n "$leader" ]; then docker-compose -f docker-compose.patroni.yml stop $leader &>/dev/null log_warn "$leader stopped" @@ -609,7 +609,7 @@ test_chaos() { # Check if system is still functional log_step "Testing system functionality under stress..." 
- if docker exec patroni2 psql -U csf -d csf_core -c "SELECT 1;" &>/dev/null; then + if docker exec patroni2 psql -U csfx -d csfx_core -c "SELECT 1;" &>/dev/null; then log_success "✅ Database still accessible!" else log_error "Database not accessible" diff --git a/control-plane/volume-manager/test-patroni-ha.sh b/control-plane/volume-manager/test-patroni-ha.sh index 55931d2..ac1eef6 100755 --- a/control-plane/volume-manager/test-patroni-ha.sh +++ b/control-plane/volume-manager/test-patroni-ha.sh @@ -76,15 +76,15 @@ test_write() { if [ "$primary" == "patroni1" ]; then replica="patroni2"; else replica="patroni1"; fi - docker exec $primary psql -U csf -d csf_core -c \ + docker exec $primary psql -U csfx -d csfx_core -c \ "CREATE TABLE IF NOT EXISTS failover_test (id SERIAL PRIMARY KEY, data TEXT, created_at TIMESTAMP DEFAULT NOW());" &>/dev/null - docker exec $primary psql -U csf -d csf_core -c \ + docker exec $primary psql -U csfx -d csfx_core -c \ "INSERT INTO failover_test (data) VALUES ('$test_data');" &>/dev/null # Verify on replica sleep 2 - local result=$(docker exec $replica psql -U csf -d csf_core -t -c \ + local result=$(docker exec $replica psql -U csfx -d csfx_core -t -c \ "SELECT data FROM failover_test WHERE data='$test_data';" 2>/dev/null | xargs) if [ "$result" == "$test_data" ]; then @@ -212,7 +212,7 @@ test_postgres_failover() { # Test connection to new primary echo "Testing connection to new primary..." 
sleep 3 - if docker exec $new_primary psql -U csf -d csf_core -c "SELECT 1;" &>/dev/null; then + if docker exec $new_primary psql -U csfx -d csfx_core -c "SELECT 1;" &>/dev/null; then echo -e "${GREEN}✅ New primary is accepting connections${NC}" else echo -e "${RED}❌ New primary not ready${NC}" @@ -257,7 +257,7 @@ test_ceph_failure() { echo "" echo -e "${YELLOW}Testing if PostgreSQL still works...${NC}" - if docker exec patroni1 psql -U csf -d csf_core -c "SELECT version();" &>/dev/null; then + if docker exec patroni1 psql -U csfx -d csfx_core -c "SELECT version();" &>/dev/null; then echo -e "${GREEN}✅ PostgreSQL still working (Ceph has 2 replicas)${NC}" else echo -e "${RED}❌ PostgreSQL affected${NC}" diff --git a/deployments/docker/patroni/Dockerfile b/deployments/docker/patroni/Dockerfile new file mode 100644 index 0000000..7c6fd76 --- /dev/null +++ b/deployments/docker/patroni/Dockerfile @@ -0,0 +1,23 @@ +FROM postgres:16-alpine + +RUN apk add --no-cache python3 py3-pip curl && \ + pip3 install --no-cache-dir --break-system-packages patroni[etcd3]==3.3.2 psycopg2-binary + +RUN mkdir -p /etc/patroni /data && \ + addgroup -S patroni && \ + adduser -S -G patroni -h /data patroni && \ + chown -R patroni:patroni /data /etc/patroni + +COPY config.yml /etc/patroni/config.yml.tpl +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh && chown patroni:patroni /etc/patroni/config.yml.tpl + +ENV PATRONI_POSTGRESQL_DATA_DIR=/data/pgdata \ + PATRONI_POSTGRESQL_BIN_DIR=/usr/local/bin \ + PATH="/usr/local/bin:$PATH" + +USER patroni + +EXPOSE 5432 8008 + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/deployments/docker/patroni/config.yml b/deployments/docker/patroni/config.yml new file mode 100644 index 0000000..e15f38c --- /dev/null +++ b/deployments/docker/patroni/config.yml @@ -0,0 +1,51 @@ +scope: csfx +name: ${PATRONI_NAME} + +restapi: + listen: 0.0.0.0:8008 + connect_address: ${PATRONI_RESTAPI_CONNECT_ADDRESS} + +etcd3: + hosts: ${PATRONI_ETCD3_HOSTS} + +bootstrap: 
+ dcs: + ttl: 30 + loop_wait: 10 + retry_timeout: 10 + maximum_lag_on_failover: 1048576 + postgresql: + use_pg_rewind: true + parameters: + wal_level: replica + hot_standby: "on" + max_wal_senders: 5 + max_replication_slots: 5 + initdb: + - encoding: UTF8 + - data-checksums + pg_hba: + - host replication replicator 0.0.0.0/0 md5 + - host all all 0.0.0.0/0 md5 + +postgresql: + listen: 0.0.0.0:5432 + connect_address: ${PATRONI_POSTGRESQL_CONNECT_ADDRESS} + data_dir: /data/pgdata + bin_dir: /usr/local/bin + authentication: + replication: + username: replicator + password: ${PATRONI_REPLICATION_PASSWORD} + superuser: + username: postgres + password: ${PATRONI_SUPERUSER_PASSWORD} + rewind: + username: rewind_user + password: ${PATRONI_SUPERUSER_PASSWORD} + +tags: + nofailover: false + noloadbalance: false + clonefrom: false + nosync: false diff --git a/deployments/docker/patroni/docker-compose.test.yml b/deployments/docker/patroni/docker-compose.test.yml new file mode 100644 index 0000000..132c343 --- /dev/null +++ b/deployments/docker/patroni/docker-compose.test.yml @@ -0,0 +1,30 @@ +services: + etcd: + image: quay.io/coreos/etcd:v3.5.12 + network_mode: host + command: + - etcd + - --data-dir=/data + - --listen-client-urls=http://0.0.0.0:2379 + - --advertise-client-urls=http://127.0.0.1:2379 + volumes: + - etcd-test:/data + + patroni: + image: csfx-ce-patroni:local + network_mode: host + depends_on: + - etcd + environment: + PATRONI_NAME: test-local + PATRONI_RESTAPI_CONNECT_ADDRESS: 127.0.0.1:8008 + PATRONI_ETCD3_HOSTS: 127.0.0.1:2379 + PATRONI_POSTGRESQL_CONNECT_ADDRESS: 127.0.0.1:5432 + PATRONI_SUPERUSER_PASSWORD: postgres + PATRONI_REPLICATION_PASSWORD: replicator + volumes: + - patroni-test:/data + +volumes: + etcd-test: + patroni-test: diff --git a/deployments/docker/patroni/entrypoint.sh b/deployments/docker/patroni/entrypoint.sh new file mode 100644 index 0000000..71d0d6e --- /dev/null +++ b/deployments/docker/patroni/entrypoint.sh @@ -0,0 +1,15 @@ +#!/bin/sh 
+set -eu + +mkdir -p /etc/patroni + +sed \ + -e "s|\${PATRONI_NAME}|${PATRONI_NAME}|g" \ + -e "s|\${PATRONI_RESTAPI_CONNECT_ADDRESS}|${PATRONI_RESTAPI_CONNECT_ADDRESS}|g" \ + -e "s|\${PATRONI_ETCD3_HOSTS}|${PATRONI_ETCD3_HOSTS}|g" \ + -e "s|\${PATRONI_POSTGRESQL_CONNECT_ADDRESS}|${PATRONI_POSTGRESQL_CONNECT_ADDRESS}|g" \ + -e "s|\${PATRONI_REPLICATION_PASSWORD}|${PATRONI_REPLICATION_PASSWORD}|g" \ + -e "s|\${PATRONI_SUPERUSER_PASSWORD}|${PATRONI_SUPERUSER_PASSWORD}|g" \ + /etc/patroni/config.yml.tpl > /etc/patroni/config.yml + +exec patroni /etc/patroni/config.yml diff --git a/deployments/systemd/csf-updater.service b/deployments/systemd/csfx-updater.service similarity index 62% rename from deployments/systemd/csf-updater.service rename to deployments/systemd/csfx-updater.service index d25f07e..544f4f0 100644 --- a/deployments/systemd/csf-updater.service +++ b/deployments/systemd/csfx-updater.service @@ -1,24 +1,24 @@ [Unit] -Description=CSF Control Plane Updater +Description=CSFX Control Plane Updater After=network.target docker.service Requires=docker.service [Service] Type=simple -User=csf-updater +User=csfx-updater Group=docker -EnvironmentFile=/opt/csf/.env +EnvironmentFile=/opt/csfx/.env Environment=ETCD_ENDPOINT=http://localhost:2379 Environment=ETCD_USERNAME=csf Environment=GHCR_ORG=csfx-cloud -Environment=COMPOSE_FILE=/opt/csf/docker-compose.prod.yml +Environment=COMPOSE_FILE=/opt/csfx/docker-compose.prod.yml Environment=POLL_INTERVAL=30 -ExecStart=/opt/csf/csf-updater.sh +ExecStart=/opt/csfx/csfx-updater.sh Restart=always RestartSec=10 StandardOutput=journal StandardError=journal -SyslogIdentifier=csf-updater +SyslogIdentifier=csfx-updater [Install] WantedBy=multi-user.target diff --git a/deployments/systemd/csf-updater.sh b/deployments/systemd/csfx-updater.sh similarity index 88% rename from deployments/systemd/csf-updater.sh rename to deployments/systemd/csfx-updater.sh index 6c08ff7..c0fb533 100755 --- a/deployments/systemd/csf-updater.sh +++ 
b/deployments/systemd/csfx-updater.sh @@ -2,15 +2,15 @@ set -euo pipefail ETCD_ENDPOINT="${ETCD_ENDPOINT:-http://localhost:2379}" -ETCD_USERNAME="${ETCD_USERNAME:-csf}" +ETCD_USERNAME="${ETCD_USERNAME:-csfx}" ETCD_PASSWORD="${ETCD_PASSWORD:?ETCD_PASSWORD must be set}" -COMPOSE_FILE="${COMPOSE_FILE:-/opt/csf/docker-compose.prod.yml}" +COMPOSE_FILE="${COMPOSE_FILE:-/opt/csfx/docker-compose.prod.yml}" GHCR_ORG="${GHCR_ORG:-csfx-cloud}" POLL_INTERVAL="${POLL_INTERVAL:-30}" GHCR_TOKEN="${GHCR_TOKEN:?GHCR_TOKEN must be set}" -ETCD_DESIRED_KEY="/csf/config/desired_cp_version" -ETCD_RESULT_KEY="/csf/config/last_update_result" +ETCD_DESIRED_KEY="/csfx/config/desired_cp_version" +ETCD_RESULT_KEY="/csfx/config/last_update_result" SERVICES=(api-gateway registry scheduler volume-manager failover-controller sdn-controller) @@ -75,9 +75,9 @@ verify_images() { local version="$1" log "verifying image digests against GHCR" for svc in "${SERVICES[@]}"; do - local image="ghcr.io/${GHCR_ORG}/csf-ce-${svc}" + local image="ghcr.io/${GHCR_ORG}/csfx-ce-${svc}" local remote_digest local_dig - remote_digest="$(ghcr_digest "${GHCR_ORG}/csf-ce-${svc}" "${version}")" + remote_digest="$(ghcr_digest "${GHCR_ORG}/csfx-ce-${svc}" "${version}")" local_dig="$(local_digest "${image}:${version}")" if [[ -z "$remote_digest" ]]; then @@ -99,7 +99,7 @@ run_update() { etcd_put "$ETCD_RESULT_KEY" "in_progress" log "pulling images" - if ! GHCR_ORG="$GHCR_ORG" CSF_VERSION="$version" \ + if ! GHCR_ORG="$GHCR_ORG" CSFX_VERSION="$version" \ docker compose -f "$COMPOSE_FILE" pull; then log "pull failed" etcd_put "$ETCD_RESULT_KEY" "failed" @@ -113,7 +113,7 @@ run_update() { fi log "restarting services" - if ! GHCR_ORG="$GHCR_ORG" CSF_VERSION="$version" \ + if ! GHCR_ORG="$GHCR_ORG" CSFX_VERSION="$version" \ docker compose -f "$COMPOSE_FILE" up -d; then log "up failed" etcd_put "$ETCD_RESULT_KEY" "failed" @@ -122,7 +122,7 @@ run_update() { log "waiting for health checks" sleep 15 - if ! 
GHCR_ORG="$GHCR_ORG" CSF_VERSION="$version" \ + if ! GHCR_ORG="$GHCR_ORG" CSFX_VERSION="$version" \ docker compose -f "$COMPOSE_FILE" ps --format json \ | jq -e '[.[] | select(.Health == "unhealthy")] | length == 0' > /dev/null 2>&1; then log "health check failed" @@ -138,7 +138,7 @@ is_valid_version() { [[ "$1" =~ ^v?[0-9]+\.[0-9]+\.[0-9]+(-[a-zA-Z0-9._-]+)?$ ]] } -log "csf-updater started, polling etcd every ${POLL_INTERVAL}s" +log "csfx-updater started, polling etcd every ${POLL_INTERVAL}s" last_applied="" diff --git a/deployments/systemd/install.sh b/deployments/systemd/install.sh index 96ee1d9..57c447e 100755 --- a/deployments/systemd/install.sh +++ b/deployments/systemd/install.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash set -euo pipefail -CSF_DIR="/opt/csf" +CSFX_DIR="/opt/csfx" if [[ "$EUID" -ne 0 ]]; then echo "run as root" @@ -11,28 +11,28 @@ fi SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" -if ! id csf-updater &>/dev/null; then - useradd --system --no-create-home --shell /usr/sbin/nologin csf-updater - usermod -aG docker csf-updater - echo "created csf-updater system user" +if ! id csfx-updater &>/dev/null; then - useradd --system --no-create-home --shell /usr/sbin/nologin csfx-updater
-f "${CSF_DIR}/.env" ]]; then - cp "${REPO_ROOT}/.env.example" "${CSF_DIR}/.env" - chmod 640 "${CSF_DIR}/.env" - chown csf-updater:docker "${CSF_DIR}/.env" - echo "created ${CSF_DIR}/.env — fill in values before starting" +if [[ ! -f "${CSFX_DIR}/.env" ]]; then + cp "${REPO_ROOT}/.env.example" "${CSFX_DIR}/.env" + chmod 640 "${CSFX_DIR}/.env" + chown csfx-updater:docker "${CSFX_DIR}/.env" + echo "created ${CSFX_DIR}/.env — fill in values before starting" fi -cp "${SCRIPT_DIR}/csf-updater.service" /etc/systemd/system/csf-updater.service +cp "${SCRIPT_DIR}/csfx-updater.service" /etc/systemd/system/csfx-updater.service if command -v ufw &>/dev/null; then ufw deny in 2379/tcp comment "etcd - internal only" @@ -46,8 +46,8 @@ elif command -v firewall-cmd &>/dev/null; then fi systemctl daemon-reload -systemctl enable csf-updater -systemctl start csf-updater +systemctl enable csfx-updater +systemctl start csfx-updater -echo "csf-updater installed and started" -echo "logs: journalctl -u csf-updater -f" +echo "csfx-updater installed and started" +echo "logs: journalctl -u csfx-updater -f" diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 0408396..b37a5b0 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -9,32 +9,32 @@ x-rust-service: &rust-service - cargo_cache:/usr/local/cargo/registry - cargo_git:/usr/local/cargo/git networks: - - csf-network + - csfx-network restart: unless-stopped services: postgres: image: postgres:16-alpine - container_name: csf-postgres-dev + container_name: csfx-postgres-dev environment: - POSTGRES_USER: ${POSTGRES_USER:-csf_user} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-csf_password} - POSTGRES_DB: ${POSTGRES_DB:-csf_core} + POSTGRES_USER: ${POSTGRES_USER:-csfx_user} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-csfx_password} + POSTGRES_DB: ${POSTGRES_DB:-csfx_core} ports: - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data healthcheck: - test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-csf_user} -d 
${POSTGRES_DB:-csf_core}"] + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-csfx_user} -d ${POSTGRES_DB:-csfx_core}"] interval: 10s timeout: 5s retries: 5 networks: - - csf-network + - csfx-network etcd: image: gcr.io/etcd-development/etcd:v3.5.21 - container_name: csf-etcd-dev + container_name: csfx-etcd-dev command: - etcd - --advertise-client-urls=http://etcd:2379 @@ -42,13 +42,13 @@ services: ports: - "2379:2379" networks: - - csf-network + - csfx-network api-gateway: <<: *rust-service - container_name: csf-api-gateway-dev + container_name: csfx-api-gateway-dev environment: - DATABASE_URL: postgres://${POSTGRES_USER:-csf_user}:${POSTGRES_PASSWORD:-csf_password}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER:-csfx_user}:${POSTGRES_PASSWORD:-csfx_password}@postgres:5432/${POSTGRES_DB:-csfx_core} RUST_LOG: ${RUST_LOG:-debug} JWT_SECRET: ${JWT_SECRET:-dev_jwt_secret_change_in_production} RSA_KEY_SIZE: "2048" @@ -69,9 +69,9 @@ services: registry: <<: *rust-service - container_name: csf-registry-dev + container_name: csfx-registry-dev environment: - DATABASE_URL: postgres://${POSTGRES_USER:-csf_user}:${POSTGRES_PASSWORD:-csf_password}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER:-csfx_user}:${POSTGRES_PASSWORD:-csfx_password}@postgres:5432/${POSTGRES_DB:-csfx_core} ETCD_ENDPOINTS: http://etcd:2379 REGISTRY_PORT: "8001" RUST_LOG: ${RUST_LOG:-debug} @@ -95,9 +95,9 @@ services: scheduler: <<: *rust-service - container_name: csf-scheduler-dev + container_name: csfx-scheduler-dev environment: - DATABASE_URL: postgres://${POSTGRES_USER:-csf_user}:${POSTGRES_PASSWORD:-csf_password}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER:-csfx_user}:${POSTGRES_PASSWORD:-csfx_password}@postgres:5432/${POSTGRES_DB:-csfx_core} ETCD_ENDPOINTS: http://etcd:2379 SCHEDULER_PORT: "8002" RUST_LOG: ${RUST_LOG:-debug} @@ -115,9 +115,9 @@ services: volume-manager: <<: *rust-service - 
container_name: csf-volume-manager-dev + container_name: csfx-volume-manager-dev environment: - DATABASE_URL: postgres://${POSTGRES_USER:-csf_user}:${POSTGRES_PASSWORD:-csf_password}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER:-csfx_user}:${POSTGRES_PASSWORD:-csfx_password}@postgres:5432/${POSTGRES_DB:-csfx_core} ETCD_ENDPOINTS: http://etcd:2379 VOLUME_MANAGER_PORT: "8003" RUST_LOG: ${RUST_LOG:-debug} @@ -135,9 +135,9 @@ services: failover-controller: <<: *rust-service - container_name: csf-failover-controller-dev + container_name: csfx-failover-controller-dev environment: - DATABASE_URL: postgres://${POSTGRES_USER:-csf_user}:${POSTGRES_PASSWORD:-csf_password}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER:-csfx_user}:${POSTGRES_PASSWORD:-csfx_password}@postgres:5432/${POSTGRES_DB:-csfx_core} FAILOVER_CONTROLLER_PORT: "8004" SCHEDULER_SERVICE_URL: http://scheduler:8002 VOLUME_MANAGER_URL: http://volume-manager:8003 @@ -158,9 +158,9 @@ services: sdn-controller: <<: *rust-service - container_name: csf-sdn-controller-dev + container_name: csfx-sdn-controller-dev environment: - DATABASE_URL: postgres://${POSTGRES_USER:-csf_user}:${POSTGRES_PASSWORD:-csf_password}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER:-csfx_user}:${POSTGRES_PASSWORD:-csfx_password}@postgres:5432/${POSTGRES_DB:-csfx_core} ETCD_URL: http://etcd:2379 SDN_CONTROLLER_PORT: "8005" RUST_LOG: ${RUST_LOG:-debug} @@ -193,5 +193,5 @@ volumes: cargo_target_sdn_controller: networks: - csf-network: + csfx-network: driver: bridge diff --git a/docker-compose.failover-controller.yml b/docker-compose.failover-controller.yml index 413c7c4..b2a5a8b 100644 --- a/docker-compose.failover-controller.yml +++ b/docker-compose.failover-controller.yml @@ -9,32 +9,32 @@ x-rust-common: &rust-common - cargo_cache:/usr/local/cargo/registry - cargo_git:/usr/local/cargo/git networks: - - csf-network + - csfx-network 
restart: unless-stopped services: postgres: image: postgres:16-alpine - container_name: csf-postgres-failover + container_name: csfx-postgres-failover environment: - POSTGRES_USER: csf_user - POSTGRES_PASSWORD: csf_password - POSTGRES_DB: csf_core + POSTGRES_USER: csfx_user + POSTGRES_PASSWORD: csfx_password + POSTGRES_DB: csfx_core ports: - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data healthcheck: - test: ["CMD-SHELL", "pg_isready -U csf_user -d csf_core"] + test: ["CMD-SHELL", "pg_isready -U csfx_user -d csfx_core"] interval: 10s timeout: 5s retries: 5 networks: - - csf-network + - csfx-network etcd: image: gcr.io/etcd-development/etcd:v3.5.21 - container_name: csf-etcd-failover + container_name: csfx-etcd-failover command: - etcd - --advertise-client-urls=http://etcd:2379 @@ -42,13 +42,13 @@ services: ports: - "2379:2379" networks: - - csf-network + - csfx-network api-gateway: <<: *rust-common - container_name: csf-api-gateway-failover + container_name: csfx-api-gateway-failover environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core RUST_LOG: debug JWT_SECRET: ${JWT_SECRET:-dev_jwt_secret_change_in_production} RSA_KEY_SIZE: "2048" @@ -73,9 +73,9 @@ services: registry: <<: *rust-common - container_name: csf-registry-failover + container_name: csfx-registry-failover environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_ENDPOINTS: http://etcd:2379 REGISTRY_PORT: "8001" RUST_LOG: debug @@ -99,9 +99,9 @@ services: scheduler: <<: *rust-common - container_name: csf-scheduler-failover + container_name: csfx-scheduler-failover environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_ENDPOINTS: http://etcd:2379 SCHEDULER_PORT: "8002" 
RUST_LOG: debug @@ -124,9 +124,9 @@ services: volume-manager: <<: *rust-common - container_name: csf-volume-manager-failover + container_name: csfx-volume-manager-failover environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_ENDPOINTS: http://etcd:2379 VOLUME_MANAGER_PORT: "8003" RUST_LOG: debug @@ -149,9 +149,9 @@ services: failover-controller: <<: *rust-common - container_name: csf-failover-controller + container_name: csfx-failover-controller environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core FAILOVER_CONTROLLER_PORT: "8004" SCHEDULER_SERVICE_URL: http://scheduler:8002 VOLUME_MANAGER_URL: http://volume-manager:8003 @@ -199,5 +199,5 @@ volumes: driver: local networks: - csf-network: + csfx-network: driver: bridge diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml index 6dcdabc..fad5805 100644 --- a/docker-compose.prod.yml +++ b/docker-compose.prod.yml @@ -1,25 +1,25 @@ services: postgres: image: postgres:16-alpine - container_name: csf-postgres + container_name: csfx-postgres environment: POSTGRES_USER: ${POSTGRES_USER} POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - POSTGRES_DB: ${POSTGRES_DB:-csf_core} + POSTGRES_DB: ${POSTGRES_DB:-csfx_core} volumes: - postgres_data:/var/lib/postgresql/data healthcheck: - test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB:-csf_core}"] + test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB:-csfx_core}"] interval: 10s timeout: 5s retries: 5 networks: - - csf-internal + - csfx-internal restart: unless-stopped etcd: image: gcr.io/etcd-development/etcd:v3.5.21 - container_name: csf-etcd + container_name: csfx-etcd command: - etcd - --advertise-client-urls=http://etcd:2379 @@ -28,7 +28,7 @@ services: volumes: - etcd_data:/etcd-data networks: - - csf-internal + - csfx-internal 
restart: unless-stopped healthcheck: test: ["CMD", "etcdctl", "endpoint", "health"] @@ -37,23 +37,23 @@ services: retries: 5 migrate: - image: ghcr.io/${GHCR_ORG}/csf-ce-api-gateway:${CSF_VERSION:-latest} - container_name: csf-migrate - command: ["/csf-migrate"] + image: ghcr.io/${GHCR_ORG}/csfx-ce-api-gateway:${CSFX_VERSION:-latest} + container_name: csfx-migrate + command: ["/csfx-migrate"] environment: - DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csfx_core} depends_on: postgres: condition: service_healthy networks: - - csf-internal + - csfx-internal restart: "no" api-gateway: - image: ghcr.io/${GHCR_ORG}/csf-ce-api-gateway:${CSF_VERSION:-latest} - container_name: csf-api-gateway + image: ghcr.io/${GHCR_ORG}/csfx-ce-api-gateway:${CSFX_VERSION:-latest} + container_name: csfx-api-gateway environment: - DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csfx_core} RUST_LOG: ${RUST_LOG:-info} JWT_SECRET: ${JWT_SECRET} RSA_KEY_SIZE: "4096" @@ -71,7 +71,7 @@ services: migrate: condition: service_completed_successfully networks: - - csf-internal + - csfx-internal restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8000/api/system/health"] @@ -81,10 +81,10 @@ services: start_period: 15s registry: - image: ghcr.io/${GHCR_ORG}/csf-ce-registry:${CSF_VERSION:-latest} - container_name: csf-registry + image: ghcr.io/${GHCR_ORG}/csfx-ce-registry:${CSFX_VERSION:-latest} + container_name: csfx-registry environment: - DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csfx_core} ETCD_ENDPOINTS: http://etcd:2379 
REGISTRY_PORT: "8001" RUST_LOG: ${RUST_LOG:-info} @@ -95,7 +95,7 @@ services: migrate: condition: service_completed_successfully networks: - - csf-internal + - csfx-internal restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8001/health"] @@ -105,10 +105,10 @@ services: start_period: 15s scheduler: - image: ghcr.io/${GHCR_ORG}/csf-ce-scheduler:${CSF_VERSION:-latest} - container_name: csf-scheduler + image: ghcr.io/${GHCR_ORG}/csfx-ce-scheduler:${CSFX_VERSION:-latest} + container_name: csfx-scheduler environment: - DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csfx_core} ETCD_ENDPOINTS: http://etcd:2379 SCHEDULER_PORT: "8002" RUST_LOG: ${RUST_LOG:-info} @@ -118,14 +118,14 @@ services: migrate: condition: service_completed_successfully networks: - - csf-internal + - csfx-internal restart: unless-stopped volume-manager: - image: ghcr.io/${GHCR_ORG}/csf-ce-volume-manager:${CSF_VERSION:-latest} - container_name: csf-volume-manager + image: ghcr.io/${GHCR_ORG}/csfx-ce-volume-manager:${CSFX_VERSION:-latest} + container_name: csfx-volume-manager environment: - DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csfx_core} ETCD_ENDPOINTS: http://etcd:2379 VOLUME_MANAGER_PORT: "8003" RUST_LOG: ${RUST_LOG:-info} @@ -135,14 +135,14 @@ services: migrate: condition: service_completed_successfully networks: - - csf-internal + - csfx-internal restart: unless-stopped failover-controller: - image: ghcr.io/${GHCR_ORG}/csf-ce-failover-controller:${CSF_VERSION:-latest} - container_name: csf-failover-controller + image: ghcr.io/${GHCR_ORG}/csfx-ce-failover-controller:${CSFX_VERSION:-latest} + container_name: csfx-failover-controller environment: - DATABASE_URL: 
postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csfx_core} FAILOVER_CONTROLLER_PORT: "8004" SCHEDULER_SERVICE_URL: http://scheduler:8002 VOLUME_MANAGER_URL: http://volume-manager:8003 @@ -157,14 +157,14 @@ services: migrate: condition: service_completed_successfully networks: - - csf-internal + - csfx-internal restart: unless-stopped sdn-controller: - image: ghcr.io/${GHCR_ORG}/csf-ce-sdn-controller:${CSF_VERSION:-latest} - container_name: csf-sdn-controller + image: ghcr.io/${GHCR_ORG}/csfx-ce-sdn-controller:${CSFX_VERSION:-latest} + container_name: csfx-sdn-controller environment: - DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csf_core} + DATABASE_URL: postgres://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB:-csfx_core} ETCD_URL: http://etcd:2379 SDN_CONTROLLER_PORT: "8005" RUST_LOG: ${RUST_LOG:-info} @@ -174,7 +174,7 @@ services: migrate: condition: service_completed_successfully networks: - - csf-internal + - csfx-internal restart: unless-stopped healthcheck: test: ["CMD", "curl", "-f", "http://localhost:8005/health"] @@ -188,5 +188,5 @@ volumes: etcd_data: networks: - csf-internal: + csfx-internal: driver: bridge diff --git a/docker-compose.registry.yml b/docker-compose.registry.yml index 6fd7c7b..e07b4c8 100644 --- a/docker-compose.registry.yml +++ b/docker-compose.registry.yml @@ -3,7 +3,7 @@ x-rust-common: &rust-common context: . 
dockerfile: control-plane/Dockerfile.dev.shared cache_from: - - type=registry,ref=csf-core-rust-base:latest + - type=registry,ref=csfx-core-rust-base:latest volumes: - ./control-plane/shared:/app/control-plane/shared - ./Cargo.toml:/app/Cargo.toml @@ -11,34 +11,34 @@ x-rust-common: &rust-common - cargo_cache:/usr/local/cargo/registry - cargo_git:/usr/local/cargo/git networks: - - csf-network + - csfx-network restart: unless-stopped services: postgres: image: postgres:16-alpine - container_name: csf-postgres-registry + container_name: csfx-postgres-registry environment: - POSTGRES_USER: csf_user - POSTGRES_PASSWORD: csf_password - POSTGRES_DB: csf_core + POSTGRES_USER: csfx_user + POSTGRES_PASSWORD: csfx_password + POSTGRES_DB: csfx_core ports: - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data healthcheck: - test: ["CMD-SHELL", "pg_isready -U csf_user -d csf_core"] + test: ["CMD-SHELL", "pg_isready -U csfx_user -d csfx_core"] interval: 10s timeout: 5s retries: 5 networks: - - csf-network + - csfx-network api-gateway: <<: *rust-common - container_name: csf-api-gateway-registry + container_name: csfx-api-gateway-registry environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core RUST_LOG: debug JWT_SECRET: ${JWT_SECRET:-dev_jwt_secret_change_in_production} RSA_KEY_SIZE: "2048" @@ -60,9 +60,9 @@ services: registry: <<: *rust-common - container_name: csf-registry-service + container_name: csfx-registry-service environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core RUST_LOG: debug REGISTRY_PORT: 8001 ports: @@ -89,11 +89,11 @@ services: agent: <<: *rust-common - container_name: csf-agent-test + container_name: csfx-agent-test environment: - CSF_GATEWAY_URL: http://api-gateway:8000 - CSF_REGISTRATION_TOKEN: ${CSF_REGISTRATION_TOKEN} - 
CSF_HEARTBEAT_INTERVAL: "30" + CSFX_GATEWAY_URL: http://api-gateway:8000 + CSFX_REGISTRATION_TOKEN: ${CSFX_REGISTRATION_TOKEN} + CSFX_HEARTBEAT_INTERVAL: "30" RUST_LOG: debug depends_on: registry: @@ -106,8 +106,8 @@ services: - cargo_cache:/usr/local/cargo/registry - cargo_git:/usr/local/cargo/git - cargo_target_agent:/app/target - - agent_state:/var/lib/csf-daemon - command: cargo run -p csf-agent + - agent_state:/var/lib/csfx-daemon + command: cargo run -p csfx-agent restart: "no" volumes: @@ -127,5 +127,5 @@ volumes: driver: local networks: - csf-network: + csfx-network: driver: bridge diff --git a/docker-compose.scheduler.yml b/docker-compose.scheduler.yml index 7f7eae1..da73b4d 100644 --- a/docker-compose.scheduler.yml +++ b/docker-compose.scheduler.yml @@ -9,32 +9,32 @@ x-rust-common: &rust-common - cargo_cache:/usr/local/cargo/registry - cargo_git:/usr/local/cargo/git networks: - - csf-network + - csfx-network restart: unless-stopped services: postgres: image: postgres:16-alpine - container_name: csf-postgres-scheduler + container_name: csfx-postgres-scheduler environment: - POSTGRES_USER: csf_user - POSTGRES_PASSWORD: csf_password - POSTGRES_DB: csf_core + POSTGRES_USER: csfx_user + POSTGRES_PASSWORD: csfx_password + POSTGRES_DB: csfx_core ports: - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data healthcheck: - test: ["CMD-SHELL", "pg_isready -U csf_user -d csf_core"] + test: ["CMD-SHELL", "pg_isready -U csfx_user -d csfx_core"] interval: 10s timeout: 5s retries: 5 networks: - - csf-network + - csfx-network etcd: image: gcr.io/etcd-development/etcd:v3.5.21 - container_name: csf-etcd-scheduler + container_name: csfx-etcd-scheduler command: - etcd - --advertise-client-urls=http://etcd:2379 @@ -42,13 +42,13 @@ services: ports: - "2379:2379" networks: - - csf-network + - csfx-network api-gateway: <<: *rust-common - container_name: csf-api-gateway-scheduler + container_name: csfx-api-gateway-scheduler environment: - DATABASE_URL: 
postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core RUST_LOG: debug JWT_SECRET: ${JWT_SECRET:-dev_jwt_secret_change_in_production} RSA_KEY_SIZE: "2048" @@ -71,9 +71,9 @@ services: scheduler: <<: *rust-common - container_name: csf-scheduler + container_name: csfx-scheduler environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_ENDPOINTS: http://etcd:2379 SCHEDULER_PORT: "8002" RUST_LOG: debug @@ -114,5 +114,5 @@ volumes: driver: local networks: - csf-network: + csfx-network: driver: bridge diff --git a/docker-compose.sdn-controller.yml b/docker-compose.sdn-controller.yml index 79db47a..a4fc790 100644 --- a/docker-compose.sdn-controller.yml +++ b/docker-compose.sdn-controller.yml @@ -9,32 +9,32 @@ x-rust-common: &rust-common - cargo_cache:/usr/local/cargo/registry - cargo_git:/usr/local/cargo/git networks: - - csf-network + - csfx-network restart: unless-stopped services: postgres: image: postgres:16-alpine - container_name: csf-postgres-sdn + container_name: csfx-postgres-sdn environment: - POSTGRES_USER: csf_user - POSTGRES_PASSWORD: csf_password - POSTGRES_DB: csf_core + POSTGRES_USER: csfx_user + POSTGRES_PASSWORD: csfx_password + POSTGRES_DB: csfx_core ports: - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data healthcheck: - test: ["CMD-SHELL", "pg_isready -U csf_user -d csf_core"] + test: ["CMD-SHELL", "pg_isready -U csfx_user -d csfx_core"] interval: 10s timeout: 5s retries: 5 networks: - - csf-network + - csfx-network etcd: image: gcr.io/etcd-development/etcd:v3.5.21 - container_name: csf-etcd-sdn + container_name: csfx-etcd-sdn command: - etcd - --advertise-client-urls=http://etcd:2379 @@ -42,13 +42,13 @@ services: ports: - "2379:2379" networks: - - csf-network + - csfx-network api-gateway: <<: *rust-common - container_name: csf-api-gateway-sdn + 
container_name: csfx-api-gateway-sdn environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core RUST_LOG: debug JWT_SECRET: ${JWT_SECRET:-dev_jwt_secret_change_in_production} RSA_KEY_SIZE: "2048" @@ -74,9 +74,9 @@ services: registry: <<: *rust-common - container_name: csf-registry-sdn + container_name: csfx-registry-sdn environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_ENDPOINTS: http://etcd:2379 REGISTRY_PORT: "8001" RUST_LOG: debug @@ -100,9 +100,9 @@ services: scheduler: <<: *rust-common - container_name: csf-scheduler-sdn + container_name: csfx-scheduler-sdn environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_ENDPOINTS: http://etcd:2379 SCHEDULER_PORT: "8002" RUST_LOG: debug @@ -125,9 +125,9 @@ services: volume-manager: <<: *rust-common - container_name: csf-volume-manager-sdn + container_name: csfx-volume-manager-sdn environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_ENDPOINTS: http://etcd:2379 VOLUME_MANAGER_PORT: "8003" RUST_LOG: debug @@ -150,9 +150,9 @@ services: failover-controller: <<: *rust-common - container_name: csf-failover-controller-sdn + container_name: csfx-failover-controller-sdn environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core FAILOVER_CONTROLLER_PORT: "8004" SCHEDULER_SERVICE_URL: http://scheduler:8002 VOLUME_MANAGER_URL: http://volume-manager:8003 @@ -178,9 +178,9 @@ services: sdn-controller: <<: *rust-common - container_name: csf-sdn-controller + container_name: csfx-sdn-controller environment: - 
DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_URL: http://etcd:2379 SDN_CONTROLLER_PORT: "8005" RUST_LOG: debug @@ -227,5 +227,5 @@ volumes: driver: local networks: - csf-network: + csfx-network: driver: bridge diff --git a/docker-compose.volume-manager.yml b/docker-compose.volume-manager.yml index 8838a32..ce7312f 100644 --- a/docker-compose.volume-manager.yml +++ b/docker-compose.volume-manager.yml @@ -9,32 +9,32 @@ x-rust-common: &rust-common - cargo_cache:/usr/local/cargo/registry - cargo_git:/usr/local/cargo/git networks: - - csf-network + - csfx-network restart: unless-stopped services: postgres: image: postgres:16-alpine - container_name: csf-postgres-volumes + container_name: csfx-postgres-volumes environment: - POSTGRES_USER: csf_user - POSTGRES_PASSWORD: csf_password - POSTGRES_DB: csf_core + POSTGRES_USER: csfx_user + POSTGRES_PASSWORD: csfx_password + POSTGRES_DB: csfx_core ports: - "5432:5432" volumes: - postgres_data:/var/lib/postgresql/data healthcheck: - test: ["CMD-SHELL", "pg_isready -U csf_user -d csf_core"] + test: ["CMD-SHELL", "pg_isready -U csfx_user -d csfx_core"] interval: 10s timeout: 5s retries: 5 networks: - - csf-network + - csfx-network etcd: image: gcr.io/etcd-development/etcd:v3.5.21 - container_name: csf-etcd-volumes + container_name: csfx-etcd-volumes command: - etcd - --advertise-client-urls=http://etcd:2379 @@ -42,13 +42,13 @@ services: ports: - "2379:2379" networks: - - csf-network + - csfx-network api-gateway: <<: *rust-common - container_name: csf-api-gateway-volumes + container_name: csfx-api-gateway-volumes environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core RUST_LOG: debug JWT_SECRET: ${JWT_SECRET:-dev_jwt_secret_change_in_production} RSA_KEY_SIZE: "2048" @@ -72,9 +72,9 @@ services: volume-manager: <<: *rust-common 
- container_name: csf-volume-manager + container_name: csfx-volume-manager environment: - DATABASE_URL: postgres://csf_user:csf_password@postgres:5432/csf_core + DATABASE_URL: postgres://csfx_user:csfx_password@postgres:5432/csfx_core ETCD_ENDPOINTS: http://etcd:2379 VOLUME_MANAGER_PORT: "8003" RUST_LOG: debug @@ -115,5 +115,5 @@ volumes: driver: local networks: - csf-network: + csfx-network: driver: bridge diff --git a/docs/UPDATER_PLAN.md b/docs/UPDATER_PLAN.md new file mode 100644 index 0000000..d907c73 --- /dev/null +++ b/docs/UPDATER_PLAN.md @@ -0,0 +1,305 @@ +# CSFX Updater — Architekturplan + +## Aktueller Stand (vollständig analysiert) + +### CI/CD Pipeline + +**GitHub Actions Workflows:** +- `release-please.yml`: Läuft auf `main` — erstellt automatisch GitHub Releases via Conventional Commits, bumped `Cargo.toml` workspace version, aktuell bei `0.2.2` +- `docker-build.yml`: Triggert nach erfolgreichem Release-Please-Run **oder** `workflow_dispatch` **oder** `push` auf `develop` + - Matrix-Build: 6 Services × 2 Architekturen (amd64 + arm64) via native GitHub Runners (`ubuntu-latest` + `ubuntu-24.04-arm`) + - Build-Strategie: `push-by-digest` → separater `manifest`-Job erstellt Multi-Arch-Manifest + - Images landen auf `ghcr.io/<org>/csfx-ce-<service>:<version>` + `:latest` + - Dockerfile: `control-plane/Dockerfile.prod.shared` mit `cargo-chef` für Layer-Caching + - `build-binaries`-Job: baut `csfx-updater` und `csfx-agent` als statische musl-Binaries (amd64 + arm64) + - `attach-binaries-release`-Job: uploaded Binaries + SHA256-Dateien zum GitHub Release +- `prerelease.yml`: Identischer Flow für `develop`-Branch → Pre-release mit `-alpha.<n>` Tag +- `lint.yml`: `cargo clippy -D warnings` + `cargo fmt --check` + `cargo audit` auf PRs und `main` +- `renovate.yml`: automatische Dependency-Updates (vermutlich) + +**Dockerfile-Struktur (`Dockerfile.prod.shared`):** +- Stage 1 (`planner`): `cargo chef prepare` — generiert `recipe.json` +- Stage 2 (`builder`): `cargo chef cook` 
(Dependency-Cache) + `cargo build --profile docker-release --bin <service> --bin csfx-migrate` + Stage 3 (`runtime`): `debian:bookworm-slim`, beide Binaries (`/app/service` + `/csfx-migrate`) kopiert + Build-Arg `CSFX_BUILD_VERSION` wird an den Build übergeben (für `build.rs`) + +**`Dockerfile.csfx-updater`:** +- Separates Dockerfile nur für `csfx-updater`, exportiert Binary via `FROM scratch AS export` +- Wird nicht vom CI verwendet — CI baut `csfx-updater` als musl-Binary direkt via `cargo build` +- Dieses Dockerfile ist ein totes Deployment-Artefakt, das nicht mehr zum CI-Flow passt + +### Runtime-Komponenten + +**`csfx-updater` Binary** (`control-plane/csfx-updater/`): +- Pollt etcd alle N Sekunden auf `/csfx/config/desired_cp_version` +- Validiert Semver-Format, setzt `/csfx/config/last_update_result` als Statusindikator +- Lädt GHCR-Token verschlüsselt aus etcd (AES-256-GCM via `secret.rs`) +- Führt `docker compose pull` → Digest-Verify → `docker compose up -d` aus +- Digest-Verify: GHCR Registry API (remote) vs. 
`docker image inspect` (lokal) — aber `local_digest()` macht intern nochmal `docker pull` +- Wartet 15s pauschal, prüft dann `docker compose ps` auf unhealthy Services +- Downloadet `csfx-agent` und `csfx-updater` Binaries von GitHub Releases, verifiziert SHA256, swappt atomar via `rename(2)` +- Startet Units via `sudo systemctl restart ` + +**Shell-Fallback** (`deployments/systemd/csfx-updater.sh`): +- Identische Logik in Bash: etcd-Poll via curl + jq, docker-compose-Flow, Digest-Verify +- Kein Binary-Download, kein Self-Update +- Kein Health-Check nach up (nur `sleep 15` + `jq`-Filter) + +**Systemd-Unit** (`deployments/systemd/csfx-updater.service`): +- `ExecStart` zeigt auf `csfx-updater.sh` (Shell-Script), nicht auf das Rust-Binary +- Fehlende Env-Var: `SECRET_ENCRYPTION_KEY` (vom Rust-Binary required, im Shell-Script nicht gebraucht) +- `ETCD_ENDPOINT` (Singular) statt `ETCD_ENDPOINTS` (Liste, wie Config erwartet) +- Kein Hardening: kein `ProtectSystem`, kein `NoNewPrivileges`, kein `CapabilityBoundingSet` +- User `csfx-updater` ist in Gruppe `docker` — kann alle Container auf dem Host steuern + +--- + +## Probleme und Schwachstellen + +### P1 — systemd-Unit startet Shell-Script statt Rust-Binary +`ExecStart=/opt/csfx/csfx-updater.sh` — das Rust-Binary wird gebaut, deployed, aber nie gestartet. +Das Secret-Handling (AES-256-GCM), das persistente etcd-RESULT_KEY-Schreiben und die SHA256-Verify laufen damit in Prod nie. Die Shell-Version hat keine Verschlüsselung und kein Binary-Download. + +### P2 — sudo ohne sudoers-Regel bricht in Prod +`restart_unit()` ruft `sudo systemctl restart ` auf. Der User `csfx-updater` hat keine sudoers-Regel — jeder Update-Cycle schlägt beim systemctl-Call fehl, ohne Rollback. + +### P3 — Kein Rollback +Wenn `health_check()` einen unhealthy Service meldet, wird `RESULT_KEY` auf `failed` gesetzt und der Cycle endet. Die Services laufen weiterhin mit dem neuen (kaputten) Image. Kein `docker compose up -d` mit dem vorherigen Tag. 
+ +### P4 — Self-Update-Race +`update_self_binary()` downloaded das neue Binary und macht `systemctl restart csfx-updater`. Der eigene Prozess wird gekillt bevor er `RESULT_KEY = success` schreiben kann — jeder Self-Update-Cycle hinterlässt `in_progress` in etcd. + +### P5 — `last_applied` nur im RAM +Nach Crash oder Restart versucht der Updater sofort wieder dieselbe Version zu applyen. Bei einem kaputten Setup → endloser Retry-Loop. + +### P6 — 15s Sleep ist nicht deterministisch +`health_check()` wartet pauschal 15 Sekunden. Bei großen Images oder langsamen Nodes reicht das nicht. Bei schnellen Nodes ist es Verschwendung. + +### P7 — Kein Distributed Lock +Wenn zwei Master-Nodes gleichzeitig denselben `desired_cp_version`-Key sehen, laufen beide gleichzeitig `docker compose up -d`. Kein Lock in etcd. + +### P8 — Reines Polling, keine etcd-Watches +Der Updater reconnected zu etcd jede Poll-Iteration und macht ein synchrones GET. Ein etcd-Watch wäre reaktiver und ressourcenschonender. + +### P9 — `local_digest()` macht internen zweiten `docker pull` +In `verify_images()` wird `docker pull --quiet` in `local_digest()` aufgerufen — obwohl `pull()` das Image bereits wenige Sekunden vorher gezogen hat. Verdoppelt die Download-Zeit. + +### P10 — Agent-Binary-Update inkompatibel mit NixOS +`update_agent_binary()` schreibt nach `/usr/local/bin/csfx-agent` und startet `csfx-daemon` neu. Auf NixOS überlebt das Binary keinen `nixos-rebuild switch` — die systemd-Unit zeigt auf einen Nix-Store-Pfad, nicht auf `/usr/local/bin`. Der Ansatz funktioniert nur auf nicht-NixOS-Systemen. + +### P11 — `Dockerfile.csfx-updater` ist orphaned +Das separate Dockerfile baut `csfx-updater` als statisches Binary, exportiert es via `FROM scratch`. Der CI-Flow (`docker-build.yml`) nutzt es nicht — er baut `csfx-updater` direkt via `cargo build --target musl`. Das Dockerfile ist toter Code und führt zu Verwirrung bei der Frage welcher Build-Pfad der kanonische ist. 
+ +### P12 — `update-versions.sh` referenziert `backend/Cargo.toml` das nicht existiert +Das Script in `.github/scripts/update-versions.sh` patcht `backend/Cargo.toml`. Das Projekt heißt aber `CSFX-Core` mit `Cargo.toml` im Root als Workspace. `backend/` existiert nicht. Das Script ist toter Code aus einem früheren Projekt-Layout. + +### P13 — `csfx-updater` im selben `Dockerfile.prod.shared` wie Services +Der `build`-Job in `docker-build.yml` baut alle 6 Services mit `Dockerfile.prod.shared`. `csfx-updater` hat ein eigenes `Dockerfile.csfx-updater`. Der `build-binaries`-Job baut `csfx-updater` als musl-Binary. Drei verschiedene Build-Pfade für dasselbe Binary — unklar welcher kanonisch ist. + +--- + +## Zielarchitektur + +### Schicht 1 — Control Plane Updates (Docker-basiert) + +``` +GitHub Release v1.2.3 + → CI baut Images + musl-Binaries + → Images auf ghcr.io//csfx-ce-:1.2.3 + → Binaries als Release-Assets (csfx-agent-amd64, csfx-updater-amd64 etc.) + → Admin setzt etcd: /csfx/config/desired_cp_version = "1.2.3" + +etcd-Watch (kein Poll) triggert csfx-updater: + 1. acquire_lock (etcd Lease, 60s TTL) — verhindert parallele Updates + 2. pull images (alle 6 Services parallel via goroutines/tasks) + 3. verify digests (remote GHCR API vs lokaler docker inspect, KEIN zweiter pull) + 4. docker compose up -d --remove-orphans + 5. wait_healthy (Retry-Loop, 5s Interval, konfigurierbarer Timeout) + → bei timeout: docker compose up -d mit PREV_VERSION (Rollback) + 6. release_lock + 7. put applied_cp_version = version, put last_update_result = success + +bei Fehler in Schritt 4/5: + 8. docker compose up -d mit applied_cp_version (Rollback) + 9. 
put last_update_result = rolled_back +``` + +**etcd-Keys:** +``` +/csfx/config/desired_cp_version → Zielversion (Admin schreibt diesen Key) +/csfx/config/applied_cp_version → zuletzt erfolgreich gerollte Version (persistentes last_applied) +/csfx/config/last_update_result → in_progress | success | failed | rolled_back +/csfx/config/update_paused → true/false (bereits implementiert) +/csfx/config/update_lock → Distributed Lock (etcd Lease) +/csfx/config/ghcr_token → AES-256-GCM verschlüsseltes Token (bereits implementiert) +/csfx/config/desired_agent_version → Zielversion für csfx-agent (Registry liest, Heartbeat trägt aus) +``` + +### Schicht 2 — Agent-Updates + +**NixOS-Nodes (Primärpfad):** +``` +Registry liest desired_agent_version aus etcd + → Heartbeat-Response: { desired_version: "1.2.3" } + → Agent vergleicht mit env!("CARGO_PKG_VERSION") aus build.rs + → wenn neuer: schreibe /var/lib/csfx-daemon/desired_version + → triggere systemctl start csfx-agent-update.service (PolicyKit-Regel) + → Oneshot-Unit führt nixos-rebuild switch aus + → systemd startet csfx-daemon nach rebuild neu (neues Binary aus Nix-Store) +``` + +**Nicht-NixOS-Fallback:** +``` +Agent: + 1. Download Binary in tmpfile (/var/lib/csfx-daemon/csfx-agent.new) + 2. verifiziere SHA256 gegen Release-Asset + 3. chmod 0o750 + 4. rename(2) → atomarer swap nach /var/lib/csfx-daemon/csfx-agent + 5. exec() sich selbst (in-place restart, kein PID-Wechsel) + bei exec()-Fehler: systemctl restart csfx-daemon via D-Bus (kein sudo) +``` + +Der `csfx-updater` ist nicht zuständig für Agent-Updates. Er schreibt nur `/csfx/config/desired_agent_version`. Die Verteilung läuft ausschließlich über den Heartbeat-Mechanismus. + +### Schicht 3 — Self-Update des Updaters + +Empfehlung: `csfx-updater` Self-Update entfernen. + +Begründung: `csfx-updater` ist kein Service der laufend upgedatet werden muss. Er wird beim Aufsetzen eines neuen Nodes deployed (via NixOS-Modul oder Ansible). 
Neue Versionen des Updaters kommen mit dem nächsten Node-Provisioning. Der Self-Update-Race (P4) entfällt komplett. + +Falls Self-Update doch gewünscht: `success` + `applied_cp_version` in etcd schreiben, **dann** Binary tauschen + Unit neustarten. Die neue Instanz liest `applied_cp_version` beim Start und überspringt die Version. + +--- + +## Konkrete Änderungen (priorisiert) + +### 1 — systemd-Unit auf Rust-Binary umstellen [blocking] +`ExecStart` von `csfx-updater.sh` auf `/usr/local/bin/csfx-updater` ändern. +`ETCD_ENDPOINT` → `ETCD_ENDPOINTS` (kommaseparierte Liste). +`SECRET_ENCRYPTION_KEY` als Env-Var ergänzen (aus `/opt/csfx/.env`). + +### 2 — Persistentes `applied_version` in etcd [blocking] +Beim Start: `etcd.get(APPLIED_VERSION_KEY)` als initialen `last_applied`. +`APPLIED_VERSION_KEY` nach erfolgreichem Update schreiben. +Eliminiert idempotenten Retry-Loop nach Restart. + +### 3 — Rollback-Logik in `updater.rs` +Vor Update: `prev_version = etcd.get(APPLIED_VERSION_KEY)`. +Nach fehlgeschlagenem health_check: `compose(cfg, &prev, docker_config_dir, &["up", "-d"])`. +`RESULT_KEY = "rolled_back"`. + +### 4 — Health-Check: Retry-Loop statt pauschaler Sleep +```rust +let timeout = Duration::from_secs(cfg.health_check_timeout_secs); +let deadline = Instant::now() + timeout; +loop { + if all_healthy(cfg, version).await? { return Ok(()); } + if Instant::now() > deadline { bail!("health check timeout"); } + sleep(Duration::from_secs(5)).await; +} +``` +Neues Config-Feld: `health_check_timeout_secs` (Default: 120). + +### 5 — `local_digest()`: internen Pull entfernen +`local_digest()` soll nur `docker image inspect` aufrufen. Der Pull ist bereits in `pull()` passiert. +Wenn `inspect` fehlschlägt → bail, nicht erneut pullen. 
+ +### 6 — Distributed Lock in `etcd.rs` +```rust +pub async fn acquire_lock(&mut self, ttl_secs: i64) -> Result<i64> // returns lease_id +pub async fn release_lock(&mut self, lease_id: i64) -> Result<()> +``` +`acquire_lock` nutzt `etcd_client::Client::lease_grant` + `put` mit `LeaseId` auf `LOCK_KEY`. +Vor jedem Update-Cycle: lock acquiren. Bei Fehler (Lock bereits gehalten): `info!` + skip (kein Fehler). + +### 7 — etcd-Watch in `main.rs` +etcd-Client hält eine persistente Verbindung, `watch()` auf `DESIRED_VERSION_KEY`. +Fallback-Poll alle 5 Minuten (Watch kann bei Netzwerkproblemen abreißen). +Eliminiert das unnötige Reconnect bei jedem Poll-Cycle. + +### 8 — sudoers-Datei oder D-Bus-Restart +Einfachste Lösung: `/etc/sudoers.d/90-csfx-updater`: +``` +csfx-updater ALL=(root) NOPASSWD: /usr/bin/systemctl restart csfx-daemon +``` +Dieses File muss Teil des NixOS-Moduls / Deployment-Skripts sein. +Mittelfristig: `zbus`-Crate für D-Bus-nativen systemd-Unit-Restart ohne sudo. + +### 9 — Self-Update aus `updater.rs` entfernen +`update_agent_binary()` und `update_self_binary()` aus `updater::run()` entfernen. +Agent-Updates laufen via Heartbeat-Response (Schicht 2). +Updater-Updates laufen via Node-Provisioning. + +### 10 — `Dockerfile.csfx-updater` entfernen +Totes Artefakt — CI nutzt es nicht. Verursacht Verwirrung über den kanonischen Build-Pfad. +Kanonisch ist `build-binaries`-Job in `docker-build.yml` (musl, statisches Binary). + +### 11 — `update-versions.sh` fixen oder entfernen +Script referenziert `backend/Cargo.toml` (existiert nicht). Versioning läuft über `release-please` + `Cargo.toml` workspace. Script ist funktionslos, sollte entfernt werden. 
+ +### 12 — NixOS-Modul: `csfx-agent-update.service` Oneshot-Unit +```nix +systemd.services.csfx-agent-update = { + description = "CSFX Agent NixOS Update"; + serviceConfig = { + Type = "oneshot"; + ExecStart = "${pkgs.nixos-rebuild}/bin/nixos-rebuild switch"; + User = "root"; + }; +}; +security.polkit.extraConfig = '' + polkit.addRule(function(action, subject) { + if (action.id === "org.freedesktop.systemd1.manage-units" && + action.lookup("unit") === "csfx-agent-update.service" && + subject.user === "csfx-daemon") { + return polkit.Result.YES; + } + }); +''; +``` + +--- + +## Was nicht geändert werden soll + +- AES-256-GCM Secret-Handling (`secret.rs`) ist korrekt. +- Semver-Validierung in `main.rs` ist ausreichend. +- GHCR-Token-Exchange-Logik in `verify.rs` ist korrekt. +- `docker compose up -d --remove-orphans` ist der richtige Rolling-Restart-Mechanismus. +- Multi-Arch-Matrix-Build-Strategie (digest-first + manifest) in CI ist korrekt. +- `cargo-chef`-Layer-Caching in `Dockerfile.prod.shared` ist korrekt. +- `release-please` + Conventional Commits als Release-Trigger ist korrekt. +- SHA256-Verify + atomares `rename(2)` beim Binary-Swap ist korrekt. 
+ +--- + +## Deployment-Checkliste + +``` +[ ] systemd-Unit auf Rust-Binary umgestellt (ExecStart, ETCD_ENDPOINTS, SECRET_ENCRYPTION_KEY) +[ ] applied_cp_version Key beim Start geladen (persistentes last_applied) +[ ] applied_cp_version nach erfolgreichem Update in etcd geschrieben +[ ] Rollback-Logik in updater.rs (compose up mit prev_version bei health-check-Fehler) +[ ] Health-Check: Retry-Loop mit konfigurierbarem Timeout statt pauschalen 15s +[ ] local_digest() ohne internen docker pull Aufruf +[ ] Distributed Lock (acquire/release) in etcd.rs +[ ] etcd-Watch in main.rs (mit Fallback-Poll) +[ ] sudoers-Datei im Deployment oder D-Bus-basierter Restart +[ ] Self-Update (update_agent_binary, update_self_binary) aus updater::run() entfernt +[ ] Dockerfile.csfx-updater entfernt +[ ] update-versions.sh entfernt oder auf Workspace-Cargo.toml korrigiert +[ ] desired_agent_version in etcd schreiben (Admin-API oder Registry-Seite) +[ ] HeartbeatResponse: desired_version Feld ergänzen (Registry + Agent) +[ ] Agent: Version-Check + Update-Trigger (NixOS-Pfad + Fallback) +[ ] NixOS-Modul: csfx-agent-update.service Oneshot-Unit + PolicyKit-Regel +[ ] systemd-Unit Hardening (NoNewPrivileges, ProtectSystem, CapabilityBoundingSet) +``` + +--- + +## Nicht in Scope (bewusst ausgeschlossen) + +- Watchtower: Dev-only, kein Digest-Verify, kein Rollback — nicht Prod-fähig +- Kubernetes-style Rolling Updates pro Replica: nicht relevant, Docker-Compose-Instanz pro Node +- Automatische Datenbankmigrationen im Updater: `csfx-migrate` Init-Container ist korrekt und bleibt getrennt +- Separate Version-Tracks pro Service: alle Services laufen auf derselben Workspace-Version diff --git a/frontend/src/lib/components/auth/login-form-client.svelte b/frontend/src/lib/components/auth/login-form-client.svelte index 4024a09..959bd0e 100644 --- a/frontend/src/lib/components/auth/login-form-client.svelte +++ b/frontend/src/lib/components/auth/login-form-client.svelte @@ -131,7 +131,7 @@

Willkommen zurück

-

Melden Sie sich in Ihrem CSF-Core Konto an

+

Melden Sie sich in Ihrem CSFX-Core Konto an

@@ -204,7 +204,7 @@ class="mx-auto mb-4 w-200 h-200 md:w-200 h-200 lg:w-[240px] h-[240px]" /> -

CSF-Core

+

CSFX-Core

The AI-Ready Business Platform
diff --git a/frontend/src/lib/components/auth/otp-form-client.svelte b/frontend/src/lib/components/auth/otp-form-client.svelte index ccff56a..812003c 100644 --- a/frontend/src/lib/components/auth/otp-form-client.svelte +++ b/frontend/src/lib/components/auth/otp-form-client.svelte @@ -168,11 +168,11 @@
CSF-Core Logo -

CSF-Core

+

CSFX-Core

The AI-Ready Business Platform
diff --git a/frontend/src/lib/components/navbar/app-sidebar.svelte b/frontend/src/lib/components/navbar/app-sidebar.svelte index fa33e3c..749fef8 100644 --- a/frontend/src/lib/components/navbar/app-sidebar.svelte +++ b/frontend/src/lib/components/navbar/app-sidebar.svelte @@ -24,7 +24,7 @@ }, teams: [ { - name: 'CSF Core', + name: 'CSFX Core', plan: 'Premium', }, ], diff --git a/frontend/src/lib/components/navbar/nav-user.svelte b/frontend/src/lib/components/navbar/nav-user.svelte index f39dcdd..62aa4c7 100644 --- a/frontend/src/lib/components/navbar/nav-user.svelte +++ b/frontend/src/lib/components/navbar/nav-user.svelte @@ -71,7 +71,7 @@
{authState.user?.username || 'User'} - CSF-Core + CSFX-Core
@@ -94,7 +94,7 @@
{authState.user?.username || 'User'} - CSF-Core + CSFX-Core
diff --git a/frontend/src/lib/components/navbar/team-switcher.svelte b/frontend/src/lib/components/navbar/team-switcher.svelte index 9bdaced..468fd40 100644 --- a/frontend/src/lib/components/navbar/team-switcher.svelte +++ b/frontend/src/lib/components/navbar/team-switcher.svelte @@ -18,12 +18,12 @@ > CSF-Core Logo
- CSF-Core + CSFX-Core Business Platform
diff --git a/frontend/src/lib/components/settings/UpdateSettings.svelte b/frontend/src/lib/components/settings/UpdateSettings.svelte index ddc6905..d0cc25e 100644 --- a/frontend/src/lib/components/settings/UpdateSettings.svelte +++ b/frontend/src/lib/components/settings/UpdateSettings.svelte @@ -103,7 +103,7 @@ Software-Updates - Überprüfen und installieren Sie CSF-Core Updates + Überprüfen und installieren Sie CSFX-Core Updates @@ -295,7 +295,7 @@

Automatische Update-Prüfung

- CSF-Core prüft automatisch stündlich auf neue Updates. Updates werden nur angezeigt, wenn + CSFX-Core prüft automatisch stündlich auf neue Updates. Updates werden nur angezeigt, wenn sie verfügbar sind.

diff --git a/frontend/src/routes/local-system/+page.svelte b/frontend/src/routes/local-system/+page.svelte index 68cd4d8..26249f2 100644 --- a/frontend/src/routes/local-system/+page.svelte +++ b/frontend/src/routes/local-system/+page.svelte @@ -80,14 +80,14 @@ - Local System - CSF Core + Local System - CSFX Core

Local System

-

Monitor the system running the CSF Core backend daemon

+

Monitor the system running the CSFX Core backend daemon