diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 1909865..de390fb 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -297,6 +297,97 @@ jobs: env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + update-infra: + name: Update CSFX-Infra versions.nix + runs-on: ubuntu-latest + needs: [prepare, manifest, build-binaries, attach-binaries-release] + if: needs.prepare.outputs.is_release == 'true' + steps: + - uses: actions/checkout@v4 + with: + repository: ${{ github.repository_owner }}/CSFX-Infra + token: ${{ secrets.INFRA_REPO_TOKEN }} + path: infra + + - uses: actions/download-artifact@v4 + with: + pattern: digest-* + path: /tmp/digests + merge-multiple: true + + - uses: actions/download-artifact@v4 + with: + pattern: csf-agent-* + path: /tmp/binaries + merge-multiple: true + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to GHCR + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Write versions.nix + run: | + VERSION="${{ needs.prepare.outputs.version }}" + ORG=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + REPO="${{ github.repository }}" + RELEASE_BASE="https://github.com/${REPO}/releases/download/v${VERSION}" + + get_manifest_digest() { + local svc=$1 + local image="ghcr.io/${ORG}/csf-ce-${svc}:${VERSION}" + docker buildx imagetools inspect "${image}" \ + --format '{{json .Manifest}}' | jq -r '.digest' + } + + get_sha256() { + local binary=$1 + local arch=$2 + awk '{print $1}' /tmp/binaries/${binary}-${arch}.sha256 2>/dev/null + } + + cat > infra/versions.nix </dev/null + } + + cat > infra/versions.nix <, + pub post_update_heartbeats: Option, +} + #[derive(Debug, Deserialize)] pub struct AssignedWorkload { pub id: String, @@ -143,7 +149,7 @@ impl ApiClient { api_key: &str, container_statuses: Option>, metrics: Option, - ) -> Result<()> { + 
) -> Result { let url = format!( "{}/api/registry/agents/{}/heartbeat", self.gateway_url, agent_id @@ -189,7 +195,9 @@ impl ApiClient { anyhow::bail!("Heartbeat failed status={}", status); } - Ok(()) + resp.json::() + .await + .context("Failed to parse heartbeat response") } pub async fn fetch_assigned_workloads( diff --git a/agent/src/main.rs b/agent/src/main.rs index 320afa6..595a39e 100644 --- a/agent/src/main.rs +++ b/agent/src/main.rs @@ -4,6 +4,7 @@ mod docker; mod pki; mod rbd; mod system; +mod update_watch; use anyhow::{Context, Result}; use std::collections::HashMap; @@ -169,6 +170,7 @@ async fn run_heartbeat_loop( ) { let mut interval = tokio::time::interval(Duration::from_secs(interval_secs)); let mut failure_count: u32 = 0; + let mut current_flake_rev = String::new(); loop { tokio::select! { @@ -183,11 +185,24 @@ async fn run_heartbeat_loop( let metrics = system::collect_metrics(); match client.heartbeat(agent_id, api_key, Some(statuses), Some(metrics)).await { - Ok(_) => { + Ok(resp) => { if failure_count > 0 { info!(agent_id = %agent_id, "Heartbeat recovered after {} failures", failure_count); failure_count = 0; } + + if let Some(count) = resp.post_update_heartbeats { + update_watch::write_heartbeat_counter(count).await; + } + + if let Some(rev) = resp.desired_flake_rev { + let rev_clone = rev.clone(); + let current = current_flake_rev.clone(); + tokio::spawn(async move { + update_watch::handle(agent_id, &rev_clone, ¤t).await; + }); + current_flake_rev = rev; + } } Err(e) => { failure_count += 1; diff --git a/agent/src/update_watch.rs b/agent/src/update_watch.rs new file mode 100644 index 0000000..1fd1004 --- /dev/null +++ b/agent/src/update_watch.rs @@ -0,0 +1,59 @@ +use std::time::Duration; +use tokio::fs; +use tracing::{info, warn}; +use uuid::Uuid; + +const TRIGGER_FILE: &str = "/var/lib/csf/update_trigger"; +const HEARTBEAT_COUNTER_FILE: &str = "/var/lib/csf/post_update_heartbeats"; +const MAX_JITTER_SECS: u64 = 300; + +pub async fn 
handle(agent_id: Uuid, desired_flake_rev: &str, current_flake_rev: &str) { + if desired_flake_rev == current_flake_rev { + return; + } + + if !is_valid_sha(desired_flake_rev) { + warn!(flake_rev = %desired_flake_rev, "received invalid flake rev in heartbeat response"); + return; + } + + let jitter = jitter_delay(agent_id); + info!( + flake_rev = %desired_flake_rev, + jitter_secs = jitter, + "update signal received, waiting before writing trigger" + ); + + tokio::time::sleep(Duration::from_secs(jitter)).await; + + if let Err(e) = write_trigger(desired_flake_rev).await { + warn!(error = %e, flake_rev = %desired_flake_rev, "failed to write update trigger file"); + } else { + info!(flake_rev = %desired_flake_rev, "update trigger written"); + } +} + +pub async fn write_heartbeat_counter(count: u32) { + if let Some(parent) = std::path::Path::new(HEARTBEAT_COUNTER_FILE).parent() { + let _ = fs::create_dir_all(parent).await; + } + let _ = fs::write(HEARTBEAT_COUNTER_FILE, count.to_string()).await; +} + +async fn write_trigger(flake_rev: &str) -> anyhow::Result<()> { + if let Some(parent) = std::path::Path::new(TRIGGER_FILE).parent() { + fs::create_dir_all(parent).await?; + } + fs::write(TRIGGER_FILE, flake_rev).await?; + Ok(()) +} + +fn jitter_delay(agent_id: Uuid) -> u64 { + let bytes = agent_id.as_bytes(); + let val = u64::from_le_bytes(bytes[..8].try_into().unwrap_or([0u8; 8])); + val % MAX_JITTER_SECS +} + +fn is_valid_sha(rev: &str) -> bool { + rev.len() == 40 && rev.chars().all(|c| c.is_ascii_hexdigit()) +} diff --git a/control-plane/api-gateway/src/routes/update.rs b/control-plane/api-gateway/src/routes/update.rs index 44db094..8a75570 100644 --- a/control-plane/api-gateway/src/routes/update.rs +++ b/control-plane/api-gateway/src/routes/update.rs @@ -3,40 +3,34 @@ use etcd_client::Client; use serde::{Deserialize, Serialize}; use std::env; -use crate::auth::crypto::{decrypt_secret, encrypt_secret}; use crate::auth::rbac::CanManageSystem; use crate::AppState; -const 
ETCD_DESIRED_VERSION_KEY: &str = "/csf/config/desired_cp_version"; -const ETCD_UPDATE_RESULT_KEY: &str = "/csf/config/last_update_result"; -const ETCD_GHCR_TOKEN_KEY: &str = "/csf/config/ghcr_token"; +const ETCD_AVAILABLE_FLAKE_REV_KEY: &str = "/csf/config/available_flake_rev"; +const ETCD_DESIRED_FLAKE_REV_KEY: &str = "/csf/config/desired_flake_rev"; +const ETCD_BUILD_STATUS_KEY: &str = "/csf/config/cp_build_status"; +const ETCD_RESULT_KEY: &str = "/csf/config/last_build_result"; const ETCD_PAUSED_KEY: &str = "/csf/config/update_paused"; #[derive(Debug, Deserialize)] pub struct UpdateRequest { - pub version: String, + pub flake_rev: String, } #[derive(Debug, Serialize)] pub struct UpdateResponse { pub status: String, - pub version: String, + pub flake_rev: String, } #[derive(Debug, Serialize)] pub struct UpdateStatusResponse { pub current_version: String, - pub desired_version: Option, + pub available_flake_rev: Option, + pub desired_flake_rev: Option, + pub build_status: Option, pub last_result: Option, pub paused: bool, - pub agent_version: Option, - pub updater_version: Option, -} - -#[derive(Debug, Deserialize)] -pub struct GhcrTokenRequest { - pub token: String, - pub username: String, } pub fn routes() -> Router { @@ -45,7 +39,6 @@ pub fn routes() -> Router { .route("/system/update/status", get(update_status)) .route("/system/update/pause", post(pause_updates)) .route("/system/update/resume", post(resume_updates)) - .route("/system/ghcr-token", post(set_ghcr_token)) } async fn etcd_client() -> Result { @@ -64,35 +57,25 @@ async fn trigger_update( State(_state): State, Json(req): Json, ) -> Result, StatusCode> { - if !is_valid_semver(&req.version) { + if !is_valid_sha(&req.flake_rev) { return Err(StatusCode::UNPROCESSABLE_ENTITY); } let mut client = etcd_client().await?; client - .put(ETCD_DESIRED_VERSION_KEY, req.version.as_bytes(), None) - .await - .map_err(|e| { - tracing::error!(error = %e, "failed to write desired version to etcd"); - 
StatusCode::INTERNAL_SERVER_ERROR - })?; - - client - .put(ETCD_UPDATE_RESULT_KEY, b"in_progress", None) + .put(ETCD_DESIRED_FLAKE_REV_KEY, req.flake_rev.as_bytes(), None) .await .map_err(|e| { - tracing::error!(error = %e, "failed to write update result to etcd"); + tracing::error!(error = %e, "failed to write desired flake rev to etcd"); StatusCode::INTERNAL_SERVER_ERROR })?; - tracing::info!(version = %req.version, "update requested"); - - spawn_update(req.version.clone()); + tracing::info!(flake_rev = %req.flake_rev, "update requested"); Ok(Json(UpdateResponse { status: "update_scheduled".to_string(), - version: req.version, + flake_rev: req.flake_rev, })) } @@ -102,34 +85,22 @@ async fn update_status( ) -> Result, StatusCode> { let mut client = etcd_client().await?; - let desired = etcd_get(&mut client, ETCD_DESIRED_VERSION_KEY).await?; - let last_result = etcd_get(&mut client, ETCD_UPDATE_RESULT_KEY).await?; + let available_flake_rev = etcd_get(&mut client, ETCD_AVAILABLE_FLAKE_REV_KEY).await?; + let desired_flake_rev = etcd_get(&mut client, ETCD_DESIRED_FLAKE_REV_KEY).await?; + let build_status = etcd_get(&mut client, ETCD_BUILD_STATUS_KEY).await?; + let last_result = etcd_get(&mut client, ETCD_RESULT_KEY).await?; let paused = etcd_get(&mut client, ETCD_PAUSED_KEY).await?.as_deref() == Some("true"); - let binary_dir = env::var("BINARY_DIR").unwrap_or_else(|_| "/usr/local/bin".to_string()); - let agent_version = binary_version(&format!("{}/csf-agent", binary_dir)).await; - let updater_version = binary_version(&format!("{}/csf-updater", binary_dir)).await; - Ok(Json(UpdateStatusResponse { current_version: env!("CARGO_PKG_VERSION").to_string(), - desired_version: desired, + available_flake_rev, + desired_flake_rev, + build_status, last_result, paused, - agent_version, - updater_version, })) } -async fn binary_version(path: &str) -> Option { - let output = tokio::process::Command::new(path) - .arg("--version") - .output() - .await - .ok()?; - let raw = 
String::from_utf8(output.stdout).ok()?; - raw.split_whitespace().last().map(|s| s.trim().to_string()) -} - async fn etcd_get(client: &mut Client, key: &str) -> Result, StatusCode> { let resp = client.get(key, None).await.map_err(|e| { tracing::error!(error = %e, key = key, "failed to read from etcd"); @@ -143,71 +114,8 @@ async fn etcd_get(client: &mut Client, key: &str) -> Result, Stat .map(|s| s.to_string())) } -fn is_valid_semver(version: &str) -> bool { - let v = version.strip_prefix('v').unwrap_or(version); - let (base, _pre) = match v.split_once('-') { - Some((b, p)) => (b, Some(p)), - None => (v, None), - }; - let parts: Vec<&str> = base.split('.').collect(); - parts.len() == 3 && parts.iter().all(|p| p.parse::().is_ok()) -} - -fn spawn_update(version: String) { - tokio::spawn(async move { - if let Err(e) = run_update(&version).await { - tracing::error!(error = %e, version = %version, "update failed"); - write_result("failed").await; - } else { - tracing::info!(version = %version, "update completed"); - write_result("success").await; - } - }); -} - -async fn run_update(version: &str) -> Result<(), String> { - let compose_file = env::var("COMPOSE_FILE") - .unwrap_or_else(|_| "docker-compose.prod.yml".to_string()); - let ghcr_org = env::var("GHCR_ORG").map_err(|_| "GHCR_ORG not set".to_string())?; - - pull_images(&compose_file, &ghcr_org, version).await?; - restart_services(&compose_file, &ghcr_org, version).await -} - -async fn pull_images(compose_file: &str, ghcr_org: &str, version: &str) -> Result<(), String> { - let status = tokio::process::Command::new("docker") - .args(["compose", "-f", compose_file, "pull"]) - .env("GHCR_ORG", ghcr_org) - .env("CSF_VERSION", version) - .status() - .await - .map_err(|e| format!("docker compose pull failed: {}", e))?; - - if !status.success() { - return Err(format!("docker compose pull exited with {}", status)); - } - Ok(()) -} - -async fn restart_services(compose_file: &str, ghcr_org: &str, version: &str) -> Result<(), 
String> { - let status = tokio::process::Command::new("docker") - .args(["compose", "-f", compose_file, "up", "-d"]) - .env("GHCR_ORG", ghcr_org) - .env("CSF_VERSION", version) - .status() - .await - .map_err(|e| format!("docker compose up failed: {}", e))?; - - if !status.success() { - return Err(format!("docker compose up exited with {}", status)); - } - Ok(()) -} - -async fn write_result(result: &str) { - if let Ok(mut client) = etcd_client().await { - let _ = client.put(ETCD_UPDATE_RESULT_KEY, result.as_bytes(), None).await; - } +fn is_valid_sha(rev: &str) -> bool { + rev.len() == 40 && rev.chars().all(|c| c.is_ascii_hexdigit()) } async fn pause_updates( @@ -241,36 +149,3 @@ async fn resume_updates( tracing::info!("updates resumed"); Ok(StatusCode::NO_CONTENT) } - -async fn set_ghcr_token( - _auth: CanManageSystem, - State(_state): State, - Json(req): Json, -) -> Result { - if req.token.is_empty() || req.username.is_empty() { - return Err(StatusCode::UNPROCESSABLE_ENTITY); - } - - let encryption_key = env::var("SECRET_ENCRYPTION_KEY").map_err(|_| { - tracing::error!("SECRET_ENCRYPTION_KEY not set"); - StatusCode::INTERNAL_SERVER_ERROR - })?; - - let payload = format!("{}:{}", req.username, req.token); - let encrypted = encrypt_secret(&payload, &encryption_key).map_err(|e| { - tracing::error!(error = %e, "failed to encrypt ghcr token"); - StatusCode::INTERNAL_SERVER_ERROR - })?; - - let mut client = etcd_client().await?; - client - .put(ETCD_GHCR_TOKEN_KEY, encrypted.as_bytes(), None) - .await - .map_err(|e| { - tracing::error!(error = %e, "failed to write ghcr token to etcd"); - StatusCode::INTERNAL_SERVER_ERROR - })?; - - tracing::info!(username = %req.username, "ghcr token updated"); - Ok(StatusCode::NO_CONTENT) -} diff --git a/control-plane/csf-updater/Cargo.toml b/control-plane/csf-updater/Cargo.toml index 2f63777..39a3b9c 100644 --- a/control-plane/csf-updater/Cargo.toml +++ b/control-plane/csf-updater/Cargo.toml @@ -19,9 +19,3 @@ etcd-client = { workspace 
= true } reqwest = { version = "0.11", features = ["json", "rustls-tls-webpki-roots"], default-features = false } serde = { workspace = true } serde_json = { workspace = true } -aes-gcm = { workspace = true } -base64 = { workspace = true } -sha2 = { workspace = true } -hex = "0.4" -bytes = "1" -tempfile = "3" diff --git a/control-plane/csf-updater/src/config.rs b/control-plane/csf-updater/src/config.rs index 2b18015..577a476 100644 --- a/control-plane/csf-updater/src/config.rs +++ b/control-plane/csf-updater/src/config.rs @@ -3,12 +3,11 @@ use std::env; pub struct Config { pub etcd_endpoints: Vec, - pub ghcr_org: String, - pub compose_file: String, pub poll_interval_secs: u64, - pub secret_encryption_key: String, - pub binary_dir: String, - pub github_release_base_url: String, + pub infra_repo_mirror_dir: String, + pub infra_repo_mirror_url: String, + pub infra_repo_github: String, + pub infra_repo_branch: String, } impl Config { @@ -19,19 +18,18 @@ impl Config { .split(',') .map(|s| s.trim().to_string()) .collect(), - ghcr_org: env::var("GHCR_ORG").context("GHCR_ORG must be set")?, - compose_file: env::var("COMPOSE_FILE") - .unwrap_or_else(|_| "/etc/csf-core/docker-compose.yml".to_string()), poll_interval_secs: env::var("POLL_INTERVAL_SECS") .ok() .and_then(|v| v.parse().ok()) - .unwrap_or(30), - secret_encryption_key: env::var("SECRET_ENCRYPTION_KEY") - .context("SECRET_ENCRYPTION_KEY must be set")?, - binary_dir: env::var("BINARY_DIR") - .unwrap_or_else(|_| "/usr/local/bin".to_string()), - github_release_base_url: env::var("GITHUB_RELEASE_BASE_URL") - .unwrap_or_else(|_| "https://github.com/csfx-cloud/CSF-Core/releases/download".to_string()), + .unwrap_or(120), + infra_repo_mirror_dir: env::var("INFRA_REPO_MIRROR_DIR") + .unwrap_or_else(|_| "/var/lib/csf-updater/infra.git".to_string()), + infra_repo_mirror_url: env::var("INFRA_REPO_MIRROR_URL") + .context("INFRA_REPO_MIRROR_URL must be set")?, + infra_repo_github: env::var("INFRA_REPO_GITHUB") + 
.context("INFRA_REPO_GITHUB must be set (e.g. csfx-cloud/CSFX-Infra)")?, + infra_repo_branch: env::var("INFRA_REPO_BRANCH") + .unwrap_or_else(|_| "main".to_string()), }) } } diff --git a/control-plane/csf-updater/src/etcd.rs b/control-plane/csf-updater/src/etcd.rs index fff8468..3ea61c0 100644 --- a/control-plane/csf-updater/src/etcd.rs +++ b/control-plane/csf-updater/src/etcd.rs @@ -2,10 +2,12 @@ use anyhow::Result; use crate::config::Config; -pub const DESIRED_VERSION_KEY: &str = "/csf/config/desired_cp_version"; -pub const RESULT_KEY: &str = "/csf/config/last_update_result"; -pub const GHCR_TOKEN_KEY: &str = "/csf/config/ghcr_token"; +pub const AVAILABLE_FLAKE_REV_KEY: &str = "/csf/config/available_flake_rev"; +pub const DESIRED_FLAKE_REV_KEY: &str = "/csf/config/desired_flake_rev"; +pub const BUILD_STATUS_KEY: &str = "/csf/config/cp_build_status"; +pub const RESULT_KEY: &str = "/csf/config/last_build_result"; pub const PAUSED_KEY: &str = "/csf/config/update_paused"; +pub const NODE_HEARTBEAT_PREFIX: &str = "/csf/nodes/"; pub struct Client { inner: etcd_client::Client, @@ -31,4 +33,12 @@ impl Client { self.inner.put(key, value.as_bytes(), None).await?; Ok(()) } + + pub async fn delete_prefix(&mut self, prefix: &str) -> Result<()> { + use etcd_client::DeleteOptions; + self.inner + .delete(prefix, Some(DeleteOptions::new().with_prefix())) + .await?; + Ok(()) + } } diff --git a/control-plane/csf-updater/src/git_mirror.rs b/control-plane/csf-updater/src/git_mirror.rs new file mode 100644 index 0000000..9153205 --- /dev/null +++ b/control-plane/csf-updater/src/git_mirror.rs @@ -0,0 +1,54 @@ +use anyhow::{bail, Result}; +use std::path::Path; +use tokio::process::Command; +use tracing::info; + +pub async fn sync(mirror_dir: &str, remote_url: &str) -> Result<()> { + if Path::new(mirror_dir).join("HEAD").exists() { + fetch(mirror_dir).await + } else { + clone(mirror_dir, remote_url).await + } +} + +async fn clone(mirror_dir: &str, remote_url: &str) -> Result<()> { + 
info!(mirror_dir = %mirror_dir, remote_url = %remote_url, "cloning infra repo mirror"); + + let status = Command::new("git") + .args(["clone", "--mirror", remote_url, mirror_dir]) + .status() + .await?; + + if !status.success() { + bail!("git clone --mirror failed for {}", remote_url); + } + + info!(mirror_dir = %mirror_dir, "mirror clone complete"); + Ok(()) +} + +async fn fetch(mirror_dir: &str) -> Result<()> { + info!(mirror_dir = %mirror_dir, "fetching infra repo mirror"); + + let status = Command::new("git") + .args(["--git-dir", mirror_dir, "fetch", "--prune"]) + .status() + .await?; + + if !status.success() { + bail!("git fetch --prune failed in {}", mirror_dir); + } + + info!(mirror_dir = %mirror_dir, "mirror fetch complete"); + Ok(()) +} + +pub async fn rev_exists(mirror_dir: &str, rev: &str) -> Result { + let output = Command::new("git") + .args(["--git-dir", mirror_dir, "cat-file", "-t", rev]) + .output() + .await?; + + Ok(output.status.success() + && String::from_utf8_lossy(&output.stdout).trim() == "commit") +} diff --git a/control-plane/csf-updater/src/main.rs b/control-plane/csf-updater/src/main.rs index d474148..4d60a10 100644 --- a/control-plane/csf-updater/src/main.rs +++ b/control-plane/csf-updater/src/main.rs @@ -1,10 +1,12 @@ mod config; mod etcd; -mod secret; +mod git_mirror; +mod nix_build; +mod poller; mod updater; -mod verify; use std::time::Duration; +use tokio::sync::watch; use tracing::info; #[tokio::main] @@ -16,35 +18,89 @@ async fn main() -> anyhow::Result<()> { .init(); let cfg = config::Config::from_env()?; - let poll_interval = Duration::from_secs(cfg.poll_interval_secs); - info!(poll_interval_secs = cfg.poll_interval_secs, "csf-updater started"); + info!( + poll_interval_secs = cfg.poll_interval_secs, + infra_repo_github = %cfg.infra_repo_github, + "csf-updater started" + ); - let mut last_applied = String::new(); + let cfg = std::sync::Arc::new(cfg); + let cfg_poller = cfg.clone(); + let cfg_executor = cfg.clone(); + + let 
poller_task = tokio::spawn(async move { + run_poller_loop(&cfg_poller).await; + }); + + let executor_task = tokio::spawn(async move { + run_executor_loop(&cfg_executor).await; + }); + + tokio::select! { + _ = poller_task => tracing::error!("poller task exited unexpectedly"), + _ = executor_task => tracing::error!("executor task exited unexpectedly"), + } + + Ok(()) +} + +async fn run_poller_loop(cfg: &config::Config) { + let mut last_etag: Option = None; + let interval = Duration::from_secs(cfg.poll_interval_secs); loop { - match run_once(&cfg, &last_applied).await { - Ok(Some(version)) => { - last_applied = version; + match git_mirror::sync(&cfg.infra_repo_mirror_dir, &cfg.infra_repo_mirror_url).await { + Ok(()) => {} + Err(e) => { + tracing::error!(error = %e, "git mirror sync failed"); + tokio::time::sleep(interval).await; + continue; } - Ok(None) => {} + } + + let mut etcd = match etcd::Client::connect(cfg).await { + Ok(c) => c, Err(e) => { - tracing::error!(error = %e, "update cycle error"); + tracing::error!(error = %e, "etcd connect failed in poller"); + tokio::time::sleep(interval).await; + continue; } + }; + + match poller::poll_and_update(cfg, &mut etcd, &mut last_etag).await { + Ok(Some(sha)) => info!(sha = %sha, "available_flake_rev updated"), + Ok(None) => {} + Err(e) => tracing::error!(error = %e, "poll failed"), + } + + tokio::time::sleep(interval).await; + } +} + +async fn run_executor_loop(cfg: &config::Config) { + let mut last_applied = String::new(); + let interval = Duration::from_secs(10); + + loop { + tokio::time::sleep(interval).await; + + match execute_once(cfg, &last_applied).await { + Ok(Some(rev)) => last_applied = rev, + Ok(None) => {} + Err(e) => tracing::error!(error = %e, "executor cycle failed"), } - tokio::time::sleep(poll_interval).await; } } -async fn run_once(cfg: &config::Config, last_applied: &str) -> anyhow::Result> { +async fn execute_once(cfg: &config::Config, last_applied: &str) -> anyhow::Result> { let mut etcd = 
etcd::Client::connect(cfg).await?; if etcd.get(etcd::PAUSED_KEY).await?.as_deref() == Some("true") { - tracing::info!("updates paused, skipping"); return Ok(None); } - let desired = match etcd.get(etcd::DESIRED_VERSION_KEY).await? { + let desired = match etcd.get(etcd::DESIRED_FLAKE_REV_KEY).await? { Some(v) => v, None => return Ok(None), }; @@ -53,35 +109,55 @@ async fn run_once(cfg: &config::Config, last_applied: &str) -> anyhow::Result {} + Err(e) => { + tracing::error!(error = %e, flake_rev = %desired, "nix build failed"); + etcd.put(etcd::BUILD_STATUS_KEY, "failed").await?; + etcd.put(etcd::RESULT_KEY, "failed").await?; + return Ok(Some(desired)); + } + } + + match updater::switch(cfg, &desired).await { Ok(()) => { + etcd.put(etcd::BUILD_STATUS_KEY, "ready").await?; etcd.put(etcd::RESULT_KEY, "success").await?; - info!(version = %desired, "update complete"); + info!(flake_rev = %desired, "update complete"); Ok(Some(desired)) } Err(e) => { - tracing::error!(error = %e, version = %desired, "update failed"); + tracing::error!(error = %e, flake_rev = %desired, "nixos-rebuild switch failed"); + etcd.put(etcd::BUILD_STATUS_KEY, "failed").await?; etcd.put(etcd::RESULT_KEY, "failed").await?; Ok(Some(desired)) } } } -fn is_valid_version(v: &str) -> bool { - let v = v.trim_start_matches('v'); - let (base, _pre) = match v.split_once('-') { - Some((b, p)) => (b, Some(p)), - None => (v, None), - }; - let parts: Vec<&str> = base.split('.').collect(); - parts.len() == 3 && parts.iter().all(|p| p.parse::().is_ok()) +fn is_valid_sha(rev: &str) -> bool { + rev.len() == 40 && rev.chars().all(|c| c.is_ascii_hexdigit()) } diff --git a/control-plane/csf-updater/src/nix_build.rs b/control-plane/csf-updater/src/nix_build.rs new file mode 100644 index 0000000..bed7e34 --- /dev/null +++ b/control-plane/csf-updater/src/nix_build.rs @@ -0,0 +1,36 @@ +use anyhow::{bail, Result}; +use tokio::process::Command; +use tokio::sync::watch; +use tracing::info; + +pub async fn build(mirror_dir: 
&str, rev: &str, mut cancel: watch::Receiver) -> Result<()> { + let flake_url = format!("git+file://{}?rev={}", mirror_dir, rev); + + info!(flake_rev = %rev, "starting nix build"); + + let mut child = Command::new("nixos-rebuild") + .args(["build", "--flake", &flake_url]) + .spawn()?; + + tokio::select! { + result = child.wait() => { + let status = result?; + if !status.success() { + bail!("nix build failed for rev {}", rev); + } + info!(flake_rev = %rev, "nix build complete"); + Ok(()) + } + _ = cancel.changed() => { + if *cancel.borrow() { + let _ = child.kill().await; + bail!("nix build cancelled for rev {}", rev); + } + let status = child.wait().await?; + if !status.success() { + bail!("nix build failed for rev {}", rev); + } + Ok(()) + } + } +} diff --git a/control-plane/csf-updater/src/poller.rs b/control-plane/csf-updater/src/poller.rs new file mode 100644 index 0000000..68c80fd --- /dev/null +++ b/control-plane/csf-updater/src/poller.rs @@ -0,0 +1,55 @@ +use anyhow::Result; +use reqwest::header::{ETAG, IF_NONE_MATCH}; +use serde::Deserialize; +use tracing::info; + +use crate::config::Config; +use crate::etcd; + +#[derive(Debug, Deserialize)] +struct GitHubCommit { + sha: String, +} + +pub async fn poll_and_update(cfg: &Config, etcd: &mut etcd::Client, last_etag: &mut Option) -> Result> { + let url = format!( + "https://api.github.com/repos/{}/commits/{}", + cfg.infra_repo_github, cfg.infra_repo_branch + ); + + let mut req = reqwest::Client::new() + .get(&url) + .header("User-Agent", "csf-updater") + .header("Accept", "application/vnd.github.v3+json"); + + if let Some(etag) = last_etag.as_deref() { + req = req.header(IF_NONE_MATCH, etag); + } + + let resp = req.send().await?; + + if resp.status() == reqwest::StatusCode::NOT_MODIFIED { + return Ok(None); + } + + if !resp.status().is_success() { + anyhow::bail!("GitHub API returned {}", resp.status()); + } + + if let Some(etag) = resp.headers().get(ETAG) { + *last_etag = Some(etag.to_str()?.to_string()); + } + 
+ let commit: GitHubCommit = resp.json().await?; + let sha = commit.sha; + + let current = etcd.get(etcd::AVAILABLE_FLAKE_REV_KEY).await?; + if current.as_deref() == Some(&sha) { + return Ok(None); + } + + etcd.put(etcd::AVAILABLE_FLAKE_REV_KEY, &sha).await?; + info!(sha = %sha, "new flake rev available"); + + Ok(Some(sha)) +} diff --git a/control-plane/csf-updater/src/secret.rs b/control-plane/csf-updater/src/secret.rs deleted file mode 100644 index 5d51b60..0000000 --- a/control-plane/csf-updater/src/secret.rs +++ /dev/null @@ -1,26 +0,0 @@ -use aes_gcm::{aead::{Aead, KeyInit}, Aes256Gcm, Nonce}; -use anyhow::{bail, Result}; -use base64::Engine; - -pub fn decrypt_secret(encoded: &str, key_b64: &str) -> Result { - let key_bytes = base64::engine::general_purpose::STANDARD.decode(key_b64)?; - if key_bytes.len() != 32 { - bail!("invalid encryption key length"); - } - - let combined = base64::engine::general_purpose::STANDARD.decode(encoded)?; - if combined.len() < 12 { - bail!("invalid ciphertext"); - } - - let (nonce_bytes, ciphertext) = combined.split_at(12); - let cipher = Aes256Gcm::new_from_slice(&key_bytes) - .map_err(|e| anyhow::anyhow!("cipher init failed: {}", e))?; - let nonce = Nonce::from_slice(nonce_bytes); - - let plaintext = cipher - .decrypt(nonce, ciphertext) - .map_err(|e| anyhow::anyhow!("decryption failed: {}", e))?; - - Ok(String::from_utf8(plaintext)?) 
-} diff --git a/control-plane/csf-updater/src/updater.rs b/control-plane/csf-updater/src/updater.rs index 71e451a..fa0d092 100644 --- a/control-plane/csf-updater/src/updater.rs +++ b/control-plane/csf-updater/src/updater.rs @@ -1,194 +1,26 @@ use anyhow::{bail, Result}; -use sha2::{Digest, Sha256}; -use std::process::Stdio; use tokio::process::Command; use tracing::info; use crate::config::Config; -use crate::etcd; -use crate::secret::decrypt_secret; -use crate::verify; -pub async fn run(cfg: &Config, version: &str, etcd: &mut etcd::Client) -> Result<()> { - let (docker_config_dir, ghcr_auth) = setup_docker_auth(cfg, etcd).await?; - pull(cfg, version, docker_config_dir.as_deref()).await?; - verify::verify_images(cfg, version, ghcr_auth.as_deref()).await?; - up(cfg, version, docker_config_dir.as_deref()).await?; - health_check(cfg, version).await?; - update_agent_binary(cfg, version).await?; - update_self_binary(cfg, version).await -} - -async fn setup_docker_auth(cfg: &Config, etcd: &mut etcd::Client) -> Result<(Option, Option)> { - let encrypted = match etcd.get(etcd::GHCR_TOKEN_KEY).await? 
{ - Some(v) => v, - None => return Ok((None, None)), - }; - - let payload = decrypt_secret(&encrypted, &cfg.secret_encryption_key)?; - let (username, token) = payload - .split_once(':') - .ok_or_else(|| anyhow::anyhow!("invalid ghcr token payload"))?; - - let dir = tempfile::tempdir()?; - let config_path = dir.path().join("config.json"); - - let auth_raw = format!("{}:{}", username, token); - let auth_b64 = base64::Engine::encode( - &base64::engine::general_purpose::STANDARD, - auth_raw.as_bytes(), - ); - let config = serde_json::json!({ - "auths": { - "ghcr.io": { - "auth": auth_b64 - } - } - }); - - tokio::fs::write(&config_path, serde_json::to_string(&config)?).await?; - let dir_path = dir.into_path().to_string_lossy().to_string(); - Ok((Some(dir_path), Some(auth_b64))) -} - -async fn pull(cfg: &Config, version: &str, docker_config_dir: Option<&str>) -> Result<()> { - info!(version = %version, "pulling images"); - compose(cfg, version, docker_config_dir, &["pull"]).await -} - -async fn up(cfg: &Config, version: &str, docker_config_dir: Option<&str>) -> Result<()> { - info!(version = %version, "restarting services"); - compose(cfg, version, docker_config_dir, &["up", "-d", "--remove-orphans"]).await -} - -async fn health_check(cfg: &Config, version: &str) -> Result<()> { - info!("waiting for health checks"); - tokio::time::sleep(std::time::Duration::from_secs(15)).await; - - let output = Command::new("docker") - .args(["compose", "-f", &cfg.compose_file, "ps", "--format", "json"]) - .env("GHCR_ORG", &cfg.ghcr_org) - .env("CSF_VERSION", version) - .output() - .await?; - - let stdout = String::from_utf8_lossy(&output.stdout); - for line in stdout.lines() { - if let Ok(svc) = serde_json::from_str::(line) { - if svc["Health"].as_str() == Some("unhealthy") { - bail!("service {} is unhealthy after update", svc["Name"].as_str().unwrap_or("unknown")); - } - } - } - - info!("all services healthy"); - Ok(()) -} - -async fn update_agent_binary(cfg: &Config, version: &str) 
-> Result<()> { - info!(version = %version, "updating csf-agent binary"); - let arch = detect_arch(); - let url = format!( - "{}/v{}/csf-agent-{}", - cfg.github_release_base_url, version, arch - ); - let dest = format!("{}/csf-agent", cfg.binary_dir); - download_and_swap(&url, &dest).await?; - restart_unit("csf-daemon").await -} - -async fn update_self_binary(cfg: &Config, version: &str) -> Result<()> { - info!(version = %version, "updating csf-updater binary"); - let arch = detect_arch(); - let url = format!( - "{}/v{}/csf-updater-{}", - cfg.github_release_base_url, version, arch +pub async fn switch(cfg: &Config, flake_rev: &str) -> Result<()> { + let flake_url = format!( + "git+file://{}?rev={}", + cfg.infra_repo_mirror_dir, flake_rev ); - let dest = format!("{}/csf-updater", cfg.binary_dir); - download_and_swap(&url, &dest).await?; - restart_unit("csf-updater").await -} - -async fn download_and_swap(url: &str, dest: &str) -> Result<()> { - let tmp = format!("{}.new", dest); - let bytes = fetch(url).await?; - let expected = fetch_checksum(&format!("{}.sha256", url)).await?; - verify_checksum(&bytes, &expected)?; + info!(flake_rev = %flake_rev, "running nixos-rebuild switch"); - tokio::fs::write(&tmp, &bytes).await?; - - let mut perms = tokio::fs::metadata(&tmp).await?.permissions(); - std::os::unix::fs::PermissionsExt::set_mode(&mut perms, 0o750); - tokio::fs::set_permissions(&tmp, perms).await?; - - tokio::fs::rename(&tmp, dest).await?; - info!(dest = %dest, "binary swapped"); - Ok(()) -} - -async fn fetch(url: &str) -> Result { - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - bail!("failed to download {}: {}", url, resp.status()); - } - Ok(resp.bytes().await?) 
-} - -async fn fetch_checksum(url: &str) -> Result { - let resp = reqwest::get(url).await?; - if !resp.status().is_success() { - bail!("failed to download checksum {}: {}", url, resp.status()); - } - let text = resp.text().await?; - text.split_whitespace() - .next() - .map(|s| s.to_string()) - .ok_or_else(|| anyhow::anyhow!("empty checksum file at {}", url)) -} - -fn verify_checksum(data: &[u8], expected: &str) -> Result<()> { - let digest = hex::encode(Sha256::digest(data)); - if digest != expected { - bail!("checksum mismatch: expected={} got={}", expected, digest); - } - info!("checksum verified"); - Ok(()) -} - -async fn restart_unit(unit: &str) -> Result<()> { - let status = Command::new("sudo") - .args(["systemctl", "restart", unit]) + let status = Command::new("nixos-rebuild") + .args(["switch", "--flake", &flake_url]) .status() .await?; - if !status.success() { - bail!("systemctl restart {} failed: {}", unit, status); - } - Ok(()) -} -fn detect_arch() -> &'static str { - if cfg!(target_arch = "aarch64") { "arm64" } else { "amd64" } -} - -async fn compose(cfg: &Config, version: &str, docker_config_dir: Option<&str>, args: &[&str]) -> Result<()> { - let mut cmd_args = vec!["compose", "-f", cfg.compose_file.as_str()]; - cmd_args.extend_from_slice(args); - - let mut cmd = Command::new("docker"); - cmd.args(&cmd_args) - .env("GHCR_ORG", &cfg.ghcr_org) - .env("CSF_VERSION", version) - .stdout(Stdio::inherit()) - .stderr(Stdio::inherit()); - - if let Some(dir) = docker_config_dir { - cmd.env("DOCKER_CONFIG", dir); - } - - let status = cmd.status().await?; if !status.success() { - bail!("docker compose {} failed: {}", args.join(" "), status); + bail!("nixos-rebuild switch failed for rev {}", flake_rev); } + + info!(flake_rev = %flake_rev, "nixos-rebuild switch complete"); Ok(()) } diff --git a/control-plane/csf-updater/src/verify.rs b/control-plane/csf-updater/src/verify.rs deleted file mode 100644 index 5fae4e2..0000000 --- 
a/control-plane/csf-updater/src/verify.rs +++ /dev/null @@ -1,112 +0,0 @@ -use anyhow::{bail, Result}; -use tracing::info; - -use crate::config::Config; - -const SERVICES: &[&str] = &[ - "api-gateway", - "registry", - "scheduler", - "volume-manager", - "failover-controller", - "sdn-controller", -]; - -pub async fn verify_images(cfg: &Config, version: &str, ghcr_auth: Option<&str>) -> Result<()> { - let client = reqwest::Client::new(); - - for svc in SERVICES { - let image = format!("{}/csf-ce-{}", cfg.ghcr_org, svc); - let remote = remote_digest(&client, &image, version, ghcr_auth).await?; - let local = local_digest(&format!("ghcr.io/{}/csf-ce-{}:{}", cfg.ghcr_org, svc, version))?; - - if remote != local { - bail!( - "digest mismatch for {}: remote={} local={}", - svc, remote, local - ); - } - - info!(service = svc, digest = %remote, "image verified"); - } - - Ok(()) -} - -async fn exchange_token(client: &reqwest::Client, image: &str, basic_auth: &str) -> Result { - let url = format!( - "https://ghcr.io/token?scope=repository:{}:pull", - image - ); - let resp = client - .get(&url) - .header("Authorization", format!("Basic {}", basic_auth)) - .send() - .await?; - - if !resp.status().is_success() { - bail!("GHCR token exchange failed for {}: {}", image, resp.status()); - } - - let body: serde_json::Value = resp.json().await?; - body["token"] - .as_str() - .map(|s| s.to_string()) - .ok_or_else(|| anyhow::anyhow!("no token in GHCR token response for {}", image)) -} - -async fn remote_digest(client: &reqwest::Client, image: &str, tag: &str, ghcr_auth: Option<&str>) -> Result { - let bearer = match ghcr_auth { - Some(auth) => exchange_token(client, image, auth).await?, - None => bail!("no GHCR auth configured"), - }; - - let url = format!("https://ghcr.io/v2/{}/manifests/{}", image, tag); - let resp = client - .head(&url) - .header("Authorization", format!("Bearer {}", bearer)) - .header("Accept", "application/vnd.docker.distribution.manifest.v2+json") - .send() - 
.await?; - - if !resp.status().is_success() { - bail!("GHCR manifest request failed for {}: {}", image, resp.status()); - } - - resp.headers() - .get("docker-content-digest") - .and_then(|v| v.to_str().ok()) - .map(|s| s.to_string()) - .ok_or_else(|| anyhow::anyhow!("no docker-content-digest header for {}", image)) -} - -fn local_digest(image: &str) -> Result { - let pull = std::process::Command::new("docker") - .args(["pull", "--quiet", image]) - .output()?; - - if !pull.status.success() { - bail!( - "docker pull failed for {}: {}", - image, - String::from_utf8_lossy(&pull.stderr).trim() - ); - } - - let output = std::process::Command::new("docker") - .args(["image", "inspect", "--format", "{{json .RepoDigests}}", image]) - .output()?; - - if !output.status.success() { - bail!("docker inspect failed for {}", image); - } - - let raw = String::from_utf8(output.stdout)?; - let digests: Vec = serde_json::from_str(raw.trim()) - .map_err(|e| anyhow::anyhow!("failed to parse RepoDigests for {}: {}", image, e))?; - - digests - .into_iter() - .find_map(|d| d.split('@').nth(1).map(|s| s.to_string())) - .ok_or_else(|| anyhow::anyhow!("no repo digest found for {}", image)) -} diff --git a/control-plane/registry/Cargo.toml b/control-plane/registry/Cargo.toml index c603976..744e278 100644 --- a/control-plane/registry/Cargo.toml +++ b/control-plane/registry/Cargo.toml @@ -38,6 +38,7 @@ chrono = { workspace = true, features = ["serde"] } sea-orm = { workspace = true } reqwest = { workspace = true } +etcd-client = { workspace = true } # Crypto sha2 = { workspace = true } diff --git a/control-plane/registry/src/handlers/agent.rs b/control-plane/registry/src/handlers/agent.rs index 7727626..2431afb 100644 --- a/control-plane/registry/src/handlers/agent.rs +++ b/control-plane/registry/src/handlers/agent.rs @@ -202,9 +202,15 @@ pub async fn heartbeat( } } + let desired_flake_rev = read_desired_flake_rev(&state.etcd_endpoints).await; + let post_update_heartbeats = + 
increment_post_update_heartbeats(&state.etcd_endpoints, agent_id).await; + Ok(Json(HeartbeatResponse { success: true, message: "Heartbeat recorded".to_string(), + desired_flake_rev, + post_update_heartbeats, })) } Err(e) => Err(( @@ -216,6 +222,47 @@ pub async fn heartbeat( } } +async fn read_desired_flake_rev(etcd_endpoints: &str) -> Option { + let mut client = etcd_client::Client::connect([etcd_endpoints], None) + .await + .ok()?; + + let resp = client + .get("/csf/config/desired_flake_rev", None) + .await + .ok()?; + + resp.kvs() + .first() + .and_then(|kv| std::str::from_utf8(kv.value()).ok()) + .map(|s| s.to_string()) +} + +async fn increment_post_update_heartbeats(etcd_endpoints: &str, agent_id: Uuid) -> Option { + let key = format!("/csf/nodes/{}/post_update_heartbeats", agent_id); + + let mut client = etcd_client::Client::connect([etcd_endpoints], None) + .await + .ok()?; + + let current: u32 = client + .get(key.as_str(), None) + .await + .ok() + .and_then(|r| r.kvs().first().map(|kv| kv.value().to_vec())) + .and_then(|v| std::str::from_utf8(&v).ok().and_then(|s| s.parse().ok())) + .unwrap_or(0); + + let next = current + 1; + + client + .put(key.as_str(), next.to_string().as_bytes(), None) + .await + .ok()?; + + Some(next) +} + async fn forward_container_statuses( state: &crate::server::AppState, statuses: Vec, diff --git a/control-plane/registry/src/main.rs b/control-plane/registry/src/main.rs index 79d0fc4..2a4a9a5 100644 --- a/control-plane/registry/src/main.rs +++ b/control-plane/registry/src/main.rs @@ -51,6 +51,9 @@ async fn main() -> anyhow::Result<()> { .build() .expect("Failed to build HTTP client"); + let etcd_endpoints = std::env::var("ETCD_ENDPOINTS") + .unwrap_or_else(|_| "http://localhost:2379".to_string()); + let state = server::AppState { token_manager: token_manager.clone(), bootstrap_token_manager: bootstrap_token_manager.clone(), @@ -61,6 +64,7 @@ async fn main() -> anyhow::Result<()> { scheduler_url, gateway_url, http_client, + 
etcd_endpoints, }; let token_cleanup_handle = { diff --git a/control-plane/registry/src/models/agent.rs b/control-plane/registry/src/models/agent.rs index 4804ec6..7f5e857 100644 --- a/control-plane/registry/src/models/agent.rs +++ b/control-plane/registry/src/models/agent.rs @@ -135,6 +135,8 @@ pub struct HeartbeatRequest { pub struct HeartbeatResponse { pub success: bool, pub message: String, + pub desired_flake_rev: Option, + pub post_update_heartbeats: Option, } #[derive(Debug, Serialize, Deserialize)] diff --git a/control-plane/registry/src/server.rs b/control-plane/registry/src/server.rs index fbdccc6..a84966b 100644 --- a/control-plane/registry/src/server.rs +++ b/control-plane/registry/src/server.rs @@ -31,6 +31,7 @@ pub struct AppState { pub scheduler_url: String, pub gateway_url: String, pub http_client: Client, + pub etcd_endpoints: String, } pub async fn health_check() -> impl IntoResponse { diff --git a/docs/UPDATER_PLAN.md b/docs/UPDATER_PLAN.md new file mode 100644 index 0000000..4da90fa --- /dev/null +++ b/docs/UPDATER_PLAN.md @@ -0,0 +1,305 @@ +# CSF Updater — Architekturplan + +## Aktueller Stand (vollständig analysiert) + +### CI/CD Pipeline + +**GitHub Actions Workflows:** +- `release-please.yml`: Läuft auf `main` — erstellt automatisch GitHub Releases via Conventional Commits, bumped `Cargo.toml` workspace version, aktuell bei `0.2.2` +- `docker-build.yml`: Triggert nach erfolgreichem Release-Please-Run **oder** `workflow_dispatch` **oder** `push` auf `develop` + - Matrix-Build: 6 Services × 2 Architekturen (amd64 + arm64) via native GitHub Runners (`ubuntu-latest` + `ubuntu-24.04-arm`) + - Build-Strategie: `push-by-digest` → separater `manifest`-Job erstellt Multi-Arch-Manifest + - Images landen auf `ghcr.io//csf-ce-:` + `:latest` + - Dockerfile: `control-plane/Dockerfile.prod.shared` mit `cargo-chef` für Layer-Caching + - `build-binaries`-Job: baut `csf-updater` und `csf-agent` als statische musl-Binaries (amd64 + arm64) + - 
`attach-binaries-release`-Job: uploaded Binaries + SHA256-Dateien zum GitHub Release +- `prerelease.yml`: Identischer Flow für `develop`-Branch → Pre-release mit `-alpha.` Tag +- `lint.yml`: `cargo clippy -D warnings` + `cargo fmt --check` + `cargo audit` auf PRs und `main` +- `renovate.yml`: automatische Dependency-Updates (vermutlich) + +**Dockerfile-Struktur (`Dockerfile.prod.shared`):** +- Stage 1 (`planner`): `cargo chef prepare` — generiert `recipe.json` +- Stage 2 (`builder`): `cargo chef cook` (Dependency-Cache) + `cargo build --profile docker-release --bin --bin csf-migrate` +- Stage 3 (`runtime`): `debian:bookworm-slim`, beide Binaries (`/app/service` + `/csf-migrate`) kopiert +- Build-Arg `CSF_BUILD_VERSION` wird an den Build übergeben (für `build.rs`) + +**`Dockerfile.csf-updater`:** +- Separates Dockerfile nur für `csf-updater`, exportiert Binary via `FROM scratch AS export` +- Wird nicht vom CI verwendet — CI baut `csf-updater` als musl-Binary direkt via `cargo build` +- Dieses Dockerfile ist totes Deployment-Artefakt, das nicht mehr zum CI-Flow passt + +### Runtime-Komponenten + +**`csf-updater` Binary** (`control-plane/csf-updater/`): +- Pollt etcd alle N Sekunden auf `/csf/config/desired_cp_version` +- Validiert Semver-Format, setzt `/csf/config/last_update_result` als Statusindikator +- Lädt GHCR-Token verschlüsselt aus etcd (AES-256-GCM via `secret.rs`) +- Führt `docker compose pull` → Digest-Verify → `docker compose up -d` aus +- Digest-Verify: GHCR Registry API (remote) vs. 
`docker image inspect` (lokal) — aber `local_digest()` macht intern nochmal `docker pull` +- Wartet 15s pauschal, prüft dann `docker compose ps` auf unhealthy Services +- Downloadet `csf-agent` und `csf-updater` Binaries von GitHub Releases, verifiziert SHA256, swappt atomar via `rename(2)` +- Startet Units via `sudo systemctl restart ` + +**Shell-Fallback** (`deployments/systemd/csf-updater.sh`): +- Identische Logik in Bash: etcd-Poll via curl + jq, docker-compose-Flow, Digest-Verify +- Kein Binary-Download, kein Self-Update +- Kein Health-Check nach up (nur `sleep 15` + `jq`-Filter) + +**Systemd-Unit** (`deployments/systemd/csf-updater.service`): +- `ExecStart` zeigt auf `csf-updater.sh` (Shell-Script), nicht auf das Rust-Binary +- Fehlende Env-Var: `SECRET_ENCRYPTION_KEY` (vom Rust-Binary required, im Shell-Script nicht gebraucht) +- `ETCD_ENDPOINT` (Singular) statt `ETCD_ENDPOINTS` (Liste, wie Config erwartet) +- Kein Hardening: kein `ProtectSystem`, kein `NoNewPrivileges`, kein `CapabilityBoundingSet` +- User `csf-updater` ist in Gruppe `docker` — kann alle Container auf dem Host steuern + +--- + +## Probleme und Schwachstellen + +### P1 — systemd-Unit startet Shell-Script statt Rust-Binary +`ExecStart=/opt/csf/csf-updater.sh` — das Rust-Binary wird gebaut, deployed, aber nie gestartet. +Das Secret-Handling (AES-256-GCM), das persistente etcd-RESULT_KEY-Schreiben und die SHA256-Verify laufen damit in Prod nie. Die Shell-Version hat keine Verschlüsselung und kein Binary-Download. + +### P2 — sudo ohne sudoers-Regel bricht in Prod +`restart_unit()` ruft `sudo systemctl restart ` auf. Der User `csf-updater` hat keine sudoers-Regel — jeder Update-Cycle schlägt beim systemctl-Call fehl, ohne Rollback. + +### P3 — Kein Rollback +Wenn `health_check()` einen unhealthy Service meldet, wird `RESULT_KEY` auf `failed` gesetzt und der Cycle endet. Die Services laufen weiterhin mit dem neuen (kaputten) Image. Kein `docker compose up -d` mit dem vorherigen Tag. 
+ +### P4 — Self-Update-Race +`update_self_binary()` downloaded das neue Binary und macht `systemctl restart csf-updater`. Der eigene Prozess wird gekillt bevor er `RESULT_KEY = success` schreiben kann — jeder Self-Update-Cycle hinterlässt `in_progress` in etcd. + +### P5 — `last_applied` nur im RAM +Nach Crash oder Restart versucht der Updater sofort wieder dieselbe Version zu applyen. Bei einem kaputten Setup → endloser Retry-Loop. + +### P6 — 15s Sleep ist nicht deterministisch +`health_check()` wartet pauschal 15 Sekunden. Bei großen Images oder langsamen Nodes reicht das nicht. Bei schnellen Nodes ist es Verschwendung. + +### P7 — Kein Distributed Lock +Wenn zwei Master-Nodes gleichzeitig denselben `desired_cp_version`-Key sehen, laufen beide gleichzeitig `docker compose up -d`. Kein Lock in etcd. + +### P8 — Reines Polling, keine etcd-Watches +Der Updater reconnected zu etcd jede Poll-Iteration und macht ein synchrones GET. Ein etcd-Watch wäre reaktiver und ressourcenschonender. + +### P9 — `local_digest()` macht internen zweiten `docker pull` +In `verify_images()` wird `docker pull --quiet` in `local_digest()` aufgerufen — obwohl `pull()` das Image bereits wenige Sekunden vorher gezogen hat. Verdoppelt die Download-Zeit. + +### P10 — Agent-Binary-Update inkompatibel mit NixOS +`update_agent_binary()` schreibt nach `/usr/local/bin/csf-agent` und startet `csf-daemon` neu. Auf NixOS überlebt das Binary keinen `nixos-rebuild switch` — die systemd-Unit zeigt auf einen Nix-Store-Pfad, nicht auf `/usr/local/bin`. Der Ansatz funktioniert nur auf nicht-NixOS-Systemen. + +### P11 — `Dockerfile.csf-updater` ist orphaned +Das separate Dockerfile baut `csf-updater` als statisches Binary, exportiert es via `FROM scratch`. Der CI-Flow (`docker-build.yml`) nutzt es nicht — er baut `csf-updater` direkt via `cargo build --target musl`. Das Dockerfile ist toter Code und führt zu Verwirrung bei der Frage welcher Build-Pfad der kanonische ist. 
+ +### P12 — `update-versions.sh` referenziert `backend/Cargo.toml` das nicht existiert +Das Script in `.github/scripts/update-versions.sh` patcht `backend/Cargo.toml`. Das Projekt heißt aber `CSF-Core` mit `Cargo.toml` im Root als Workspace. `backend/` existiert nicht. Das Script ist toter Code aus einem früheren Projekt-Layout. + +### P13 — `csf-updater` im selben `Dockerfile.prod.shared` wie Services +Der `build`-Job in `docker-build.yml` baut alle 6 Services mit `Dockerfile.prod.shared`. `csf-updater` hat ein eigenes `Dockerfile.csf-updater`. Der `build-binaries`-Job baut `csf-updater` als musl-Binary. Drei verschiedene Build-Pfade für dasselbe Binary — unklar welcher kanonisch ist. + +--- + +## Zielarchitektur + +### Schicht 1 — Control Plane Updates (Docker-basiert) + +``` +GitHub Release v1.2.3 + → CI baut Images + musl-Binaries + → Images auf ghcr.io//csf-ce-:1.2.3 + → Binaries als Release-Assets (csf-agent-amd64, csf-updater-amd64 etc.) + → Admin setzt etcd: /csf/config/desired_cp_version = "1.2.3" + +etcd-Watch (kein Poll) triggert csf-updater: + 1. acquire_lock (etcd Lease, 60s TTL) — verhindert parallele Updates + 2. pull images (alle 6 Services parallel via goroutines/tasks) + 3. verify digests (remote GHCR API vs lokaler docker inspect, KEIN zweiter pull) + 4. docker compose up -d --remove-orphans + 5. wait_healthy (Retry-Loop, 5s Interval, konfigurierbarer Timeout) + → bei timeout: docker compose up -d mit PREV_VERSION (Rollback) + 6. release_lock + 7. put applied_cp_version = version, put last_update_result = success + +bei Fehler in Schritt 4/5: + 8. docker compose up -d mit applied_cp_version (Rollback) + 9. 
put last_update_result = rolled_back +``` + +**etcd-Keys:** +``` +/csf/config/desired_cp_version → Zielversion (Admin schreibt diesen Key) +/csf/config/applied_cp_version → zuletzt erfolgreich gerollte Version (persistentes last_applied) +/csf/config/last_update_result → in_progress | success | failed | rolled_back +/csf/config/update_paused → true/false (bereits implementiert) +/csf/config/update_lock → Distributed Lock (etcd Lease) +/csf/config/ghcr_token → AES-256-GCM verschlüsseltes Token (bereits implementiert) +/csf/config/desired_agent_version → Zielversion für csf-agent (Registry liest, Heartbeat trägt aus) +``` + +### Schicht 2 — Agent-Updates + +**NixOS-Nodes (Primärpfad):** +``` +Registry liest desired_agent_version aus etcd + → Heartbeat-Response: { desired_version: "1.2.3" } + → Agent vergleicht mit env!("CARGO_PKG_VERSION") aus build.rs + → wenn neuer: schreibe /var/lib/csf-daemon/desired_version + → triggere systemctl start csf-agent-update.service (PolicyKit-Regel) + → Oneshot-Unit führt nixos-rebuild switch aus + → systemd startet csf-daemon nach rebuild neu (neues Binary aus Nix-Store) +``` + +**Nicht-NixOS-Fallback:** +``` +Agent: + 1. Download Binary in tmpfile (/var/lib/csf-daemon/csf-agent.new) + 2. verifiziere SHA256 gegen Release-Asset + 3. chmod 0o750 + 4. rename(2) → atomarer swap nach /var/lib/csf-daemon/csf-agent + 5. exec() sich selbst (in-place restart, kein PID-Wechsel) + bei exec()-Fehler: systemctl restart csf-daemon via D-Bus (kein sudo) +``` + +Der `csf-updater` ist nicht zuständig für Agent-Updates. Er schreibt nur `/csf/config/desired_agent_version`. Die Verteilung läuft ausschließlich über den Heartbeat-Mechanismus. + +### Schicht 3 — Self-Update des Updaters + +Empfehlung: `csf-updater` Self-Update entfernen. + +Begründung: `csf-updater` ist kein Service der laufend upgedatet werden muss. Er wird beim Aufsetzen eines neuen Nodes deployed (via NixOS-Modul oder Ansible). 
Neue Versionen des Updaters kommen mit dem nächsten Node-Provisioning. Der Self-Update-Race (P4) entfällt komplett. + +Falls Self-Update doch gewünscht: `success` + `applied_cp_version` in etcd schreiben, **dann** Binary tauschen + Unit neustarten. Die neue Instanz liest `applied_cp_version` beim Start und überspringt die Version. + +--- + +## Konkrete Änderungen (priorisiert) + +### 1 — systemd-Unit auf Rust-Binary umstellen [blocking] +`ExecStart` von `csf-updater.sh` auf `/usr/local/bin/csf-updater` ändern. +`ETCD_ENDPOINT` → `ETCD_ENDPOINTS` (kommaseparierte Liste). +`SECRET_ENCRYPTION_KEY` als Env-Var ergänzen (aus `/opt/csf/.env`). + +### 2 — Persistentes `applied_version` in etcd [blocking] +Beim Start: `etcd.get(APPLIED_VERSION_KEY)` als initialen `last_applied`. +`APPLIED_VERSION_KEY` nach erfolgreichem Update schreiben. +Eliminiert idempotenten Retry-Loop nach Restart. + +### 3 — Rollback-Logik in `updater.rs` +Vor Update: `prev_version = etcd.get(APPLIED_VERSION_KEY)`. +Nach fehlgeschlagenem health_check: `compose(cfg, &prev, docker_config_dir, &["up", "-d"])`. +`RESULT_KEY = "rolled_back"`. + +### 4 — Health-Check: Retry-Loop statt pauschaler Sleep +```rust +let timeout = Duration::from_secs(cfg.health_check_timeout_secs); +let deadline = Instant::now() + timeout; +loop { + if all_healthy(cfg, version).await? { return Ok(()); } + if Instant::now() > deadline { bail!("health check timeout"); } + sleep(Duration::from_secs(5)).await; +} +``` +Neues Config-Feld: `health_check_timeout_secs` (Default: 120). + +### 5 — `local_digest()`: internen Pull entfernen +`local_digest()` soll nur `docker image inspect` aufrufen. Der Pull ist bereits in `pull()` passiert. +Wenn `inspect` fehlschlägt → bail, nicht erneut pullen. 
+ +### 6 — Distributed Lock in `etcd.rs` +```rust +pub async fn acquire_lock(&mut self, ttl_secs: i64) -> Result // returns lease_id +pub async fn release_lock(&mut self, lease_id: i64) -> Result<()> +``` +`acquire_lock` nutzt `etcd_client::Client::lease_grant` + `put` mit `LeaseId` auf `LOCK_KEY`. +Vor jedem Update-Cycle: lock acquiren. Bei Fehler (Lock bereits gehalten): `info!` + skip (kein Fehler). + +### 7 — etcd-Watch in `main.rs` +etcd-Client hält eine persistente Verbindung, `watch()` auf `DESIRED_VERSION_KEY`. +Fallback-Poll alle 5 Minuten (Watch kann bei Netzwerkproblemen abreißen). +Eliminiert das unnötige Reconnect bei jedem Poll-Cycle. + +### 8 — sudoers-Datei oder D-Bus-Restart +Einfachste Lösung: `/etc/sudoers.d/90-csf-updater`: +``` +csf-updater ALL=(root) NOPASSWD: /usr/bin/systemctl restart csf-daemon +``` +Dieses File muss Teil des NixOS-Moduls / Deployment-Skripts sein. +Mittelfristig: `zbus`-Crate für D-Bus-nativen systemd-Unit-Restart ohne sudo. + +### 9 — Self-Update aus `updater.rs` entfernen +`update_agent_binary()` und `update_self_binary()` aus `updater::run()` entfernen. +Agent-Updates laufen via Heartbeat-Response (Schicht 2). +Updater-Updates laufen via Node-Provisioning. + +### 10 — `Dockerfile.csf-updater` entfernen +Totes Artefakt — CI nutzt es nicht. Verursacht Verwirrung über den kanonischen Build-Pfad. +Kanonisch ist `build-binaries`-Job in `docker-build.yml` (musl, statisches Binary). + +### 11 — `update-versions.sh` fixen oder entfernen +Script referenziert `backend/Cargo.toml` (existiert nicht). Versioning läuft über `release-please` + `Cargo.toml` workspace. Script ist funktionslos, sollte entfernt werden. 
+ +### 12 — NixOS-Modul: `csf-agent-update.service` Oneshot-Unit +```nix +systemd.services.csf-agent-update = { + description = "CSF Agent NixOS Update"; + serviceConfig = { + Type = "oneshot"; + ExecStart = "${pkgs.nixos-rebuild}/bin/nixos-rebuild switch"; + User = "root"; + }; +}; +security.polkit.extraConfig = '' + polkit.addRule(function(action, subject) { + if (action.id === "org.freedesktop.systemd1.manage-units" && + action.lookup("unit") === "csf-agent-update.service" && + subject.user === "csf-daemon") { + return polkit.Result.YES; + } + }); +''; +``` + +--- + +## Was nicht geändert werden soll + +- AES-256-GCM Secret-Handling (`secret.rs`) ist korrekt. +- Semver-Validierung in `main.rs` ist ausreichend. +- GHCR-Token-Exchange-Logik in `verify.rs` ist korrekt. +- `docker compose up -d --remove-orphans` ist der richtige Rolling-Restart-Mechanismus. +- Multi-Arch-Matrix-Build-Strategie (digest-first + manifest) in CI ist korrekt. +- `cargo-chef`-Layer-Caching in `Dockerfile.prod.shared` ist korrekt. +- `release-please` + Conventional Commits als Release-Trigger ist korrekt. +- SHA256-Verify + atomares `rename(2)` beim Binary-Swap ist korrekt. 
+ +--- + +## Deployment-Checkliste + +``` +[ ] systemd-Unit auf Rust-Binary umgestellt (ExecStart, ETCD_ENDPOINTS, SECRET_ENCRYPTION_KEY) +[ ] applied_cp_version Key beim Start geladen (persistentes last_applied) +[ ] applied_cp_version nach erfolgreichem Update in etcd geschrieben +[ ] Rollback-Logik in updater.rs (compose up mit prev_version bei health-check-Fehler) +[ ] Health-Check: Retry-Loop mit konfigurierbarem Timeout statt pauschalen 15s +[ ] local_digest() ohne internen docker pull Aufruf +[ ] Distributed Lock (acquire/release) in etcd.rs +[ ] etcd-Watch in main.rs (mit Fallback-Poll) +[ ] sudoers-Datei im Deployment oder D-Bus-basierter Restart +[ ] Self-Update (update_agent_binary, update_self_binary) aus updater::run() entfernt +[ ] Dockerfile.csf-updater entfernt +[ ] update-versions.sh entfernt oder auf Workspace-Cargo.toml korrigiert +[ ] desired_agent_version in etcd schreiben (Admin-API oder Registry-Seite) +[ ] HeartbeatResponse: desired_version Feld ergänzen (Registry + Agent) +[ ] Agent: Version-Check + Update-Trigger (NixOS-Pfad + Fallback) +[ ] NixOS-Modul: csf-agent-update.service Oneshot-Unit + PolicyKit-Regel +[ ] systemd-Unit Hardening (NoNewPrivileges, ProtectSystem, CapabilityBoundingSet) +``` + +--- + +## Nicht in Scope (bewusst ausgeschlossen) + +- Watchtower: Dev-only, kein Digest-Verify, kein Rollback — nicht Prod-fähig +- Kubernetes-style Rolling Updates pro Replica: nicht relevant, Docker-Compose-Instanz pro Node +- Automatische Datenbankmigrationen im Updater: `csf-migrate` Init-Container ist korrekt und bleibt getrennt +- Separate Version-Tracks pro Service: alle Services laufen auf derselben Workspace-Version diff --git a/nixos-node/flake.nix b/nixos-node/flake.nix index 71fdca8..0351078 100644 --- a/nixos-node/flake.nix +++ b/nixos-node/flake.nix @@ -1,5 +1,5 @@ { - description = "CSF NixOS Node Configuration"; + description = "CSF Node — binary builds and server configuration"; inputs = { nixpkgs.url = 
"github:NixOS/nixpkgs/nixos-25.05"; @@ -22,12 +22,12 @@ targets = [ "x86_64-unknown-linux-gnu" "x86_64-unknown-linux-musl" ]; }; - gnuPlatform = pkgs.makeRustPlatform { + platform = pkgs.makeRustPlatform { cargo = rustToolchain; rustc = rustToolchain; }; - csfAgentPkg = gnuPlatform.buildRustPackage { + csfAgentPkg = platform.buildRustPackage { pname = "csf-agent"; version = "0.2.2"; src = ../.; @@ -37,7 +37,7 @@ buildInputs = [ pkgs.openssl ]; }; - csfUpdaterPkg = gnuPlatform.buildRustPackage { + csfUpdaterPkg = platform.buildRustPackage { pname = "csf-updater"; version = "0.2.2"; src = ../.; @@ -48,46 +48,36 @@ doCheck = false; }; - csfDaemonModule = import ./modules/csf-daemon.nix; + versions = import ../CSFX-Infra/versions.nix; - agentSpecialArgs = { + serverSpecialArgs = { csf.agentPackage = csfAgentPkg; csf.updaterPackage = csfUpdaterPkg; + inherit versions; }; in { - nixosConfigurations = { - iso = nixpkgs.lib.nixosSystem { - inherit system; - modules = [ ./modules/iso-configuration.nix ]; - }; - - csf-node = nixpkgs.lib.nixosSystem { - inherit system; - specialArgs = agentSpecialArgs; - modules = [ - csfDaemonModule - ./modules/node-configuration.nix - ]; - }; - - csf-server = nixpkgs.lib.nixosSystem { - inherit system; - specialArgs = agentSpecialArgs; - modules = [ - csfDaemonModule - ./modules/server-configuration.nix - ]; - }; + nixosConfigurations.csf-server = nixpkgs.lib.nixosSystem { + inherit system; + specialArgs = serverSpecialArgs; + modules = [ ./modules/server-configuration.nix ]; }; - nixosModules.csf-daemon = csfDaemonModule; + nixosConfigurations.csf-iso = nixpkgs.lib.nixosSystem { + inherit system; + specialArgs = serverSpecialArgs; + modules = [ ./modules/iso-configuration.nix ]; + }; packages.${system} = { csf-agent = csfAgentPkg; csf-updater = csfUpdaterPkg; default = csfAgentPkg; - iso = self.nixosConfigurations.iso.config.system.build.isoImage; + iso = nixpkgs.lib.nixosSystem { + inherit system; + specialArgs = serverSpecialArgs; + 
modules = [ ./modules/iso-configuration.nix ]; + }.config.system.build.isoImage; }; }; } diff --git a/nixos-node/modules/csf-daemon.nix b/nixos-node/modules/csf-daemon.nix deleted file mode 100644 index 2e7605b..0000000 --- a/nixos-node/modules/csf-daemon.nix +++ /dev/null @@ -1,105 +0,0 @@ -{ config, lib, pkgs, ... }: - -let - cfg = config.services.csf-daemon; - credentialsFile = "/var/lib/csf-daemon/credentials"; -in -{ - options.services.csf-daemon = { - enable = lib.mkEnableOption "CSF local daemon agent"; - - package = lib.mkOption { - type = lib.types.package; - description = "The csf-agent package to use."; - }; - - binaryPath = lib.mkOption { - type = lib.types.str; - default = "/usr/local/bin/csf-agent"; - description = "Path to the csf-agent binary. Can be overwritten by the updater."; - }; - - apiGateway = lib.mkOption { - type = lib.types.str; - example = "https://gateway.csf.example:8000"; - description = "URL of the CSF API Gateway."; - }; - - registrationToken = lib.mkOption { - type = lib.types.str; - default = ""; - description = "Cluster-wide bootstrap token (csf-bootstrap.*) or node-specific pre-register token (reg_*). 
Ignored once the agent is registered."; - }; - - heartbeatInterval = lib.mkOption { - type = lib.types.ints.positive; - default = 60; - description = "Heartbeat interval in seconds."; - }; - - logLevel = lib.mkOption { - type = lib.types.enum [ "trace" "debug" "info" "warn" "error" ]; - default = "info"; - description = "Log level for the daemon."; - }; - }; - - config = lib.mkIf cfg.enable { - users.users.csf-daemon = { - isSystemUser = true; - group = "csf-daemon"; - extraGroups = [ "csf-updater" ]; - home = "/var/lib/csf-daemon"; - shell = pkgs.shadow; - description = "CSF daemon service user"; - }; - - users.groups.csf-daemon = {}; - - systemd.tmpfiles.rules = [ - "d /var/lib/csf-daemon 0700 csf-daemon csf-daemon -" - ]; - - systemd.services.csf-daemon = { - description = "CSF Local Daemon Agent"; - after = [ "network-online.target" "csf-control-plane.service" ]; - wants = [ "network-online.target" ]; - requires = [ "csf-control-plane.service" ]; - wantedBy = [ "multi-user.target" ]; - - environment = { - CSF_GATEWAY_URL = cfg.apiGateway; - CSF_HEARTBEAT_INTERVAL = toString cfg.heartbeatInterval; - RUST_LOG = cfg.logLevel; - } // lib.optionalAttrs (cfg.registrationToken != "") { - CSF_REGISTRATION_TOKEN = cfg.registrationToken; - }; - - serviceConfig = { - ExecStart = cfg.binaryPath; - Restart = "always"; - RestartSec = "5s"; - User = "csf-daemon"; - Group = "csf-daemon"; - StateDirectory = "csf-daemon"; - StateDirectoryMode = "0700"; - NoNewPrivileges = true; - ProtectSystem = "strict"; - ProtectHome = true; - PrivateTmp = true; - PrivateDevices = true; - ProtectKernelTunables = true; - ProtectKernelModules = true; - ProtectControlGroups = true; - RestrictAddressFamilies = [ "AF_INET" "AF_INET6" ]; - RestrictNamespaces = true; - LockPersonality = true; - MemoryDenyWriteExecute = true; - RestrictRealtime = true; - SystemCallFilter = "@system-service"; - ReadWritePaths = [ "/var/lib/csf-daemon" ]; - ReadOnlyPaths = [ "/var/lib/csf-updater/bin" ]; - }; - }; - }; 
-} diff --git a/nixos-node/modules/iso-configuration.nix b/nixos-node/modules/iso-configuration.nix index b4c488a..f7ca175 100644 --- a/nixos-node/modules/iso-configuration.nix +++ b/nixos-node/modules/iso-configuration.nix @@ -1,228 +1,161 @@ -{ config, pkgs, lib, ... }: +{ config, pkgs, lib, csf, versions, ... }: +let + updateUnitsModule = import ../../../CSFX-Infra/modules/update-units.nix; + + installScript = pkgs.writeShellScript "csf-install" '' + set -euo pipefail + + DISK="" + + for dev in sda vda nvme0n1; do + if [ -b "/dev/$dev" ]; then + DISK="/dev/$dev" + break + fi + done + + if [ -z "$DISK" ]; then + echo "[csf-install] ERROR: no suitable disk found" >&2 + exit 1 + fi + + echo "[csf-install] target disk: $DISK" + + if [[ "$DISK" == *nvme* ]]; then + PART_BOOT="${DISK}p1" + PART_ROOT="${DISK}p2" + else + PART_BOOT="${DISK}1" + PART_ROOT="${DISK}2" + fi + + parted "$DISK" -- mklabel gpt + parted "$DISK" -- mkpart ESP fat32 1MB 512MB + parted "$DISK" -- mkpart primary ext4 512MB 100% + parted "$DISK" -- set 1 esp on + + mkfs.fat -F 32 -n boot "$PART_BOOT" + mkfs.ext4 -L nixos "$PART_ROOT" + + mount "$PART_ROOT" /mnt + mkdir -p /mnt/boot + mount "$PART_BOOT" /mnt/boot + + echo "[csf-install] partitioning complete, running nixos-install" + + nixos-install \ + --no-root-passwd \ + --flake /iso/csf-flake#csf-server + + echo "[csf-install] installation complete — rebooting in 5s" + sleep 5 + reboot + ''; + + logoText = builtins.readFile ../logo.txt; + + motd = pkgs.writeText "csf-motd" '' + ${logoText} + + ╔══════════════════════════════════════════════════════════════════╗ + ║ CSF Node Installer ║ + ║ ║ + ║ Automatische Installation startet in 10 Sekunden. ║ + ║ CTRL+C zum Abbrechen und manuellem Eingriff. 
║ + ║ ║ + ║ Nach der Installation: ║ + ║ - csf-agent verbindet sich mit dem API Gateway ║ + ║ - Updates laufen automatisch via GitOps ║ + ║ ║ + ╚══════════════════════════════════════════════════════════════════╝ + ''; +in { imports = [ + updateUnitsModule ]; - # System configuration - system.stateVersion = "24.11"; + system.stateVersion = "25.05"; + + isoImage.volumeID = "CSF-NODE"; + isoImage.edition = lib.mkForce "csf"; + isoImage.prependToMenuLabel = "CSF Node Installer — "; + isoImage.makeEfiBootable = true; + isoImage.makeUsbBootable = true; + + isoImage.storeContents = [ + csf.agentPackage + csf.updaterPackage + ]; + + isoImage.contents = [ + { + source = ../../../CSFX-Infra; + target = "/csf-flake"; + } + ]; + + boot.kernelParams = [ + "console=ttyS0,115200n8" + "console=tty0" + "quiet" + ]; + + boot.loader.timeout = lib.mkForce 10; - # Networking networking = { - hostName = "csf-docker-test"; - firewall = { - enable = true; - allowedTCPPorts = [ - 8080 # Test nginx container - ]; - }; + hostName = "csf-installer"; + useDHCP = true; + firewall.enable = false; }; - # Enable Docker - virtualisation.docker.enable = true; + time.timeZone = "UTC"; - # System packages - environment.systemPackages = with pkgs; [ - # Docker tools - docker-compose - docker + services.getty.autologinUser = lib.mkForce "root"; - # Utilities - curl - wget - vim - htop - ]; + users.users.root = { + initialPassword = ""; + shell = pkgs.bash; + }; - # Auto-login as root on boot (for ISO convenience) - services.getty.autologinUser = "root"; + services.openssh = { + enable = true; + settings = { + PermitRootLogin = "yes"; + PasswordAuthentication = true; + }; + }; - # Docker Compose service for nginx test - systemd.services.docker-compose-test = { - description = "Docker Compose Test Service (nginx)"; - after = [ "docker.service" ]; - requires = [ "docker.service" ]; - wantedBy = [ "multi-user.target" ]; + environment.etc."motd".source = motd; + systemd.services.csf-autoinstall = { + 
description = "CSF automatic node installer"; + after = [ "network-online.target" "getty.target" ]; + wants = [ "network-online.target" ]; + wantedBy = [ "multi-user.target" ]; serviceConfig = { Type = "oneshot"; - RemainAfterExit = true; - WorkingDirectory = "/etc/docker-test"; - ExecStart = "${pkgs.docker-compose}/bin/docker-compose up -d"; - ExecStop = "${pkgs.docker-compose}/bin/docker-compose down"; + ExecStartPre = "${pkgs.coreutils}/bin/sleep 10"; + ExecStart = installScript; + StandardOutput = "journal+console"; + StandardError = "journal+console"; }; }; - # Activation script to setup Docker Compose - system.activationScripts.docker-setup = { - text = '' - # Create docker-compose directory - mkdir -p /etc/docker-test - - # Create docker-compose.yml - cat > /etc/docker-test/docker-compose.yml < /etc/docker-test/nginx.conf < /etc/docker-test/html/index.html < - - - CSF-Core Docker Test - - - -
-

CSF-Core Docker Test

-

Docker & Docker Compose funktionieren!

-

Diese Seite wird von nginx in einem Docker Container serviert.

-

Health Check

-
- - -EOF - - # Create test script - cat > /root/test-docker.sh <.*' || echo "Port 8080 not responding" -echo "" -echo "Health check:" -curl -s http://localhost:8080/health || echo "Health check failed" -echo "" -echo "=== Test Complete ===" -EOF - chmod +x /root/test-docker.sh - ''; - deps = []; + nix.settings = { + experimental-features = [ "nix-command" "flakes" ]; + trusted-users = [ "root" ]; }; - # Boot message with logo - environment.etc."issue".text = '' - - - - - ..,,,,,,,,,,,,,,,,,,,,,,,;,,,,,,,,,,,,,,'.. . - ..ckXXNNNNNNNNNNNNNNNNNNNNNNNNNNXXXXXXXXKx;. - ..cONWMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMWOc.. - ..ckNWMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMW0l.. . .. - ..ckNWMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMW0l.. . - ..ckXWMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMW0l.. - . .;kXWMMMMMMMMWKOkkkkkkkkkkkkkkkkkkkkkkkkxc.. . - .oNWMMMMMMMMNx,.......................... - .oNMMMMMMMMMK:. - .oNMMMMMMMMMK; ......................... - .oNMMMMMMMMMK; .'lddddddddddddddddddddddc'. . - .oNMMMMMMMMMK; .'o0NWWWWWWWWWWWWWWWWWWWWKd,. - .oNMMMMMMMMMK; .'l0NWMMMMMMMMMMMMMMMMMMWXx,. - .oNMMMMMMMMMK; .'l0NWMMMMMMMMMMMMMMMMMMWXx;. - .oNWMMMMMMMMK:'l0NWMMMMMMMMMMMMMMMMMMWXx;. - .cKWMMMMMMMMXOOXWMMMMWWWWWWWWWWWWWWWXx;. - .,dKWMMMMMMXo;lKMMMNOl:;;;;;;;;;;;;,. - .,dKWMMMM0; 'kWMMMNOl'. - .,dKWMM0; 'kWMMMMMN0o,... - .,dKW0; 'kWMMMMMMMWKkc. - .,ox, 'kWMMMMMMMMNXo. - ... 'kWMMMMMMMMNXd. - 'kWMMMMMMMMNXd. - 'kWMMMMMMMMNXd. - 'kWMMMMMMMMNXo. - 'kWMMMMMMMMNXo. - 'kWMMMMMMMMNXo. - 'kWMMMMMMMNkl;. - 'kWMMMMMNk:... - 'kWMMMNk:. - 'kWWXx;. - . 'kKx;. - .;,. - .. 
- - - - - - - - - ╔═══════════════════════════════════════════════════════════╗ - ║ ║ - ║ CSF-Core Docker Test ISO ║ - ║ ║ - ║ Einfache Docker & Docker Compose Testumgebung ║ - ║ ║ - ║ Services: ║ - ║ - Docker: systemctl status docker ║ - ║ - Nginx Test: http://localhost:8080 ║ - ║ ║ - ║ Test commands: ║ - ║ ./test-docker.sh - Run comprehensive test ║ - ║ docker ps -a - List containers ║ - ║ docker-compose ps - Compose status ║ - ║ ║ - ╚═══════════════════════════════════════════════════════════╝ - - ''; -} \ No newline at end of file + environment.systemPackages = with pkgs; [ + git + curl + parted + dosfstools + e2fsprogs + jq + vim + ]; +} diff --git a/nixos-node/modules/node-configuration.nix b/nixos-node/modules/node-configuration.nix deleted file mode 100644 index 41b2f18..0000000 --- a/nixos-node/modules/node-configuration.nix +++ /dev/null @@ -1,54 +0,0 @@ -{ config, pkgs, lib, csf, ... }: - -{ - system.stateVersion = "25.05"; - - boot.loader.grub = { - enable = true; - device = "/dev/sda"; - }; - - networking = { - hostName = "csf-node"; - firewall = { - enable = true; - allowedTCPPorts = []; - }; - }; - - time.timeZone = "UTC"; - - users.users.root.hashedPassword = "!"; - - services.openssh = { - enable = true; - settings = { - PermitRootLogin = "no"; - PasswordAuthentication = false; - }; - }; - - services.csf-daemon = { - enable = true; - package = csf.agentPackage; - apiGateway = "http://gateway.csf.local:8000"; - heartbeatInterval = 60; - logLevel = "info"; - }; - - nix = { - settings = { - experimental-features = [ "nix-command" "flakes" ]; - auto-optimise-store = true; - }; - gc = { - automatic = true; - dates = "weekly"; - options = "--delete-older-than 30d"; - }; - }; - - environment.systemPackages = with pkgs; [ - curl - ]; -} diff --git a/nixos-node/modules/server-configuration.nix b/nixos-node/modules/server-configuration.nix index 904afc6..9e77852 100644 --- a/nixos-node/modules/server-configuration.nix +++ 
b/nixos-node/modules/server-configuration.nix @@ -1,38 +1,39 @@ -{ config, pkgs, lib, csf, ... }: +{ config, pkgs, lib, csf, versions, ... }: let + updateUnitsModule = import ../../../CSFX-Infra/modules/update-units.nix; composeDir = "/etc/csf-core"; - binDir = "/var/lib/csf-updater/bin"; - csfUpdaterBin = csf.updaterPackage; - csfAgentBin = csf.agentPackage; in { - system.stateVersion = "25.11"; + imports = [ updateUnitsModule ]; + + system.stateVersion = "25.05"; boot = { loader.grub = { enable = true; device = "/dev/sda"; - useOSProber = true; }; initrd.availableKernelModules = [ "ata_piix" "uhci_hcd" "virtio_pci" "virtio_scsi" "sd_mod" "sr_mod" ]; - initrd.kernelModules = []; - kernelModules = []; - extraModulePackages = []; }; fileSystems."/" = { - device = "/dev/disk/by-uuid/e4b27226-e75f-4cef-9dec-fc0c6f2185ac"; + device = "/dev/disk/by-label/nixos"; fsType = "ext4"; }; + fileSystems."/boot" = { + device = "/dev/disk/by-label/boot"; + fsType = "vfat"; + }; + swapDevices = []; nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux"; networking = { hostName = "csf-node"; - networkmanager.enable = true; + useDHCP = true; firewall = { enable = true; allowedTCPPorts = [ 22 8000 ]; @@ -49,136 +50,93 @@ in }; }; - users.users.rootcsf = { + users.users.admin = { isNormalUser = true; - description = "rootcsf"; - extraGroups = [ "networkmanager" "wheel" "docker" ]; + extraGroups = [ "wheel" "docker" ]; + openssh.authorizedKeys.keys = []; }; security.sudo.wheelNeedsPassword = false; - security.sudo.extraRules = [ - { - users = [ "csf-updater" ]; - commands = [ - { command = "/run/current-system/sw/bin/systemctl restart csf-daemon"; options = [ "NOPASSWD" ]; } - { command = "/run/current-system/sw/bin/systemctl restart csf-updater"; options = [ "NOPASSWD" ]; } - ]; - } - ]; - virtualisation.docker = { enable = true; enableOnBoot = true; }; - services.csf-daemon = { - enable = true; - package = csf.agentPackage; - binaryPath = "${binDir}/csf-agent"; - apiGateway = 
"http://localhost:8000"; - heartbeatInterval = 60; - logLevel = "info"; - }; - - environment.systemPackages = with pkgs; [ - docker-compose - curl - wget - vim - htop - git - tmux - lsof - ]; - - users.users.csf-updater = { + users.users.csf-agent = { isSystemUser = true; - group = "csf-updater"; - extraGroups = [ "docker" ]; - home = "/var/lib/csf-updater"; + group = "csf-agent"; + home = "/var/lib/csf-daemon"; createHome = true; - shell = pkgs.shadow; }; + users.groups.csf-agent = {}; users.groups.csf-updater = {}; systemd.tmpfiles.rules = [ - "d /var/lib/csf-updater 0710 csf-updater csf-daemon -" + "d /var/lib/csf-daemon 0750 csf-agent csf-agent -" + "d /var/lib/csf 0750 csf-agent csf-updater -" + "f /var/lib/csf/update_trigger 0660 csf-agent csf-updater -" + "d /var/lib/csf-updater 0750 root root -" + "d /var/lib/csf-updater/infra.git 0750 root root -" ]; - systemd.services.csf-updater = { - description = "CSF Control Plane Updater"; - after = [ "docker.service" "network-online.target" "csf-control-plane.service" ]; - requires = [ "docker.service" ]; - wants = [ "network-online.target" ]; + systemd.services.csf-agent = { + description = "CSF Agent Daemon"; wantedBy = [ "multi-user.target" ]; - + after = [ "network-online.target" ]; + wants = [ "network-online.target" ]; serviceConfig = { - Type = "simple"; - User = "csf-updater"; - Group = "csf-updater"; - EnvironmentFile = "/etc/csf-core/updater.env"; - ExecStart = "${binDir}/csf-updater"; - Restart = "always"; - RestartSec = "10"; + ExecStart = "${csf.agentPackage}/bin/csf-agent"; + User = "csf-agent"; + Group = "csf-agent"; + Restart = "on-failure"; + RestartSec = "10s"; + PrivateTmp = true; ProtectSystem = "strict"; - ProtectHome = true; - ReadWritePaths = [ composeDir "/tmp" binDir ]; + ReadWritePaths = [ "/var/lib/csf-daemon" "/var/lib/csf" ]; + NoNewPrivileges = true; }; - environment = { - ETCD_ENDPOINTS = "http://localhost:2379"; - ETCD_USERNAME = "csf"; - COMPOSE_FILE = 
"${composeDir}/docker-compose.yml"; - GHCR_ORG = "csfx-cloud"; - POLL_INTERVAL_SECS = "30"; + CSF_GATEWAY_URL = "http://localhost:8000"; + CSF_HEARTBEAT_INTERVAL = "60"; RUST_LOG = "info"; - BINARY_DIR = binDir; - GITHUB_RELEASE_BASE_URL = "https://github.com/csfx-cloud/CSF-Core/releases/download"; - PATH = lib.mkForce "/run/wrappers/bin:/nix/var/nix/profiles/default/bin:/run/current-system/sw/bin"; }; }; - systemd.services.csf-control-plane = { - description = "CSF Control Plane (Docker Compose)"; - after = [ "docker.service" "network-online.target" ]; - requires = [ "docker.service" ]; - wants = [ "network-online.target" ]; - partOf = [ "docker.service" ]; + systemd.services.csf-updater = { + description = "CSF GitOps Updater"; wantedBy = [ "multi-user.target" ]; - + after = [ "network-online.target" ]; + wants = [ "network-online.target" ]; serviceConfig = { - Type = "oneshot"; - RemainAfterExit = true; - WorkingDirectory = composeDir; - ExecStartPre = "${pkgs.docker}/bin/docker compose pull --quiet"; - ExecStart = "${pkgs.docker}/bin/docker compose up -d --remove-orphans"; - ExecStop = "${pkgs.docker}/bin/docker compose down"; - TimeoutStartSec = "600"; - TimeoutStopSec = "120"; + ExecStart = "${csf.updaterPackage}/bin/csf-updater"; + Restart = "on-failure"; + RestartSec = "10s"; + StateDirectory = "csf-updater"; + }; + environment = { + ETCD_ENDPOINTS = "http://localhost:2379"; + INFRA_REPO_GITHUB = "csfx-cloud/CSFX-Infra"; + INFRA_REPO_BRANCH = "main"; + INFRA_REPO_MIRROR_URL = "https://github.com/csfx-cloud/CSFX-Infra.git"; + INFRA_REPO_MIRROR_DIR = "/var/lib/csf-updater/infra.git"; + POLL_INTERVAL_SECS = "120"; + RUST_LOG = "info"; }; }; - system.activationScripts.csf-binaries = { - text = '' - mkdir -p ${binDir} - chown csf-updater:csf-daemon ${binDir} - chmod 750 ${binDir} - if [ ! 
-f ${binDir}/csf-updater ]; then - cp ${csfUpdaterBin}/bin/csf-updater ${binDir}/csf-updater - chown csf-updater:csf-updater ${binDir}/csf-updater - chmod 750 ${binDir}/csf-updater - fi - if [ ! -f ${binDir}/csf-agent ]; then - cp ${csfAgentBin}/bin/csf-agent ${binDir}/csf-agent - chown csf-updater:csf-updater ${binDir}/csf-agent - chmod 750 ${binDir}/csf-agent - fi - ''; - deps = []; + services.csf-update-units = { + enable = true; + nixCacheUrl = "http://localhost:5000"; + nixCachePublicKey = ""; }; - system.activationScripts.csf-core-setup = { + nix.settings = { + experimental-features = [ "nix-command" "flakes" ]; + trusted-users = [ "root" ]; + }; + + system.activationScripts.csf-core-compose = { text = '' mkdir -p ${composeDir} @@ -189,15 +147,13 @@ services: container_name: csf-etcd command: - etcd - - --advertise-client-urls=http://etcd:2379 + - --advertise-client-urls=http://0.0.0.0:2379 - --listen-client-urls=http://0.0.0.0:2379 - --data-dir=/etcd-data volumes: - etcd_data:/etcd-data ports: - "2379:2379" - networks: - - csf-internal restart: unless-stopped patroni: @@ -227,8 +183,6 @@ services: volumes: - patroni_data:/home/postgres/pgdata - /etc/csf-core/patroni-bootstrap.sh:/etc/csf-bootstrap.sh:ro - networks: - - csf-internal depends_on: - etcd healthcheck: @@ -240,53 +194,40 @@ services: restart: unless-stopped api-gateway: - image: ghcr.io/csfx-cloud/csf-ce-api-gateway:0.2.2-alpha.47 + image: ghcr.io/csfx-cloud/csf-ce-api-gateway@${versions.csf.images.api-gateway.digest} container_name: csf-api-gateway - env_file: - - /etc/csf-core/gateway.env environment: DATABASE_URL: postgres://csf:csfpassword@patroni:5432/csf_core - RUST_LOG: info JWT_SECRET: change_me_in_production - RSA_KEY_SIZE: "4096" + ETCD_ENDPOINTS: http://etcd:2379 REGISTRY_SERVICE_URL: http://registry:8001 SCHEDULER_SERVICE_URL: http://scheduler:8002 VOLUME_MANAGER_URL: http://volume-manager:8003 FAILOVER_CONTROLLER_URL: http://failover-controller:8004 SDN_CONTROLLER_URL: 
http://sdn-controller:8005 + RUST_LOG: info ports: - "8000:8000" depends_on: patroni: condition: service_healthy - networks: - - csf-internal restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/api/system/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 30s registry: - image: ghcr.io/csfx-cloud/csf-ce-registry:0.2.2-alpha.47 + image: ghcr.io/csfx-cloud/csf-ce-registry@${versions.csf.images.registry.digest} container_name: csf-registry environment: DATABASE_URL: postgres://csf:csfpassword@patroni:5432/csf_core ETCD_ENDPOINTS: http://etcd:2379 REGISTRY_PORT: "8001" RUST_LOG: info - SCHEDULER_SERVICE_URL: http://scheduler:8002 depends_on: patroni: condition: service_healthy - networks: - - csf-internal restart: unless-stopped scheduler: - image: ghcr.io/csfx-cloud/csf-ce-scheduler:0.2.2-alpha.47 + image: ghcr.io/csfx-cloud/csf-ce-scheduler@${versions.csf.images.scheduler.digest} container_name: csf-scheduler environment: DATABASE_URL: postgres://csf:csfpassword@patroni:5432/csf_core @@ -296,32 +237,27 @@ services: depends_on: patroni: condition: service_healthy - networks: - - csf-internal restart: unless-stopped volume-manager: - image: ghcr.io/csfx-cloud/csf-ce-volume-manager:0.2.2-alpha.47 + image: ghcr.io/csfx-cloud/csf-ce-volume-manager@${versions.csf.images.volume-manager.digest} container_name: csf-volume-manager environment: DATABASE_URL: postgres://csf:csfpassword@patroni:5432/csf_core ETCD_ENDPOINTS: http://etcd:2379 VOLUME_MANAGER_PORT: "8003" RUST_LOG: info - volumes: - - /mnt/csf-volumes:/mnt/csf-volumes depends_on: patroni: condition: service_healthy - networks: - - csf-internal restart: unless-stopped failover-controller: - image: ghcr.io/csfx-cloud/csf-ce-failover-controller:0.2.2-alpha.47 + image: ghcr.io/csfx-cloud/csf-ce-failover-controller@${versions.csf.images.failover-controller.digest} container_name: csf-failover-controller environment: DATABASE_URL: 
postgres://csf:csfpassword@patroni:5432/csf_core + ETCD_ENDPOINTS: http://etcd:2379 FAILOVER_CONTROLLER_PORT: "8004" SCHEDULER_SERVICE_URL: http://scheduler:8002 VOLUME_MANAGER_URL: http://volume-manager:8003 @@ -329,32 +265,24 @@ services: depends_on: patroni: condition: service_healthy - networks: - - csf-internal restart: unless-stopped sdn-controller: - image: ghcr.io/csfx-cloud/csf-ce-sdn-controller:0.2.2-alpha.47 + image: ghcr.io/csfx-cloud/csf-ce-sdn-controller@${versions.csf.images.sdn-controller.digest} container_name: csf-sdn-controller environment: DATABASE_URL: postgres://csf:csfpassword@patroni:5432/csf_core - ETCD_URL: http://etcd:2379 + ETCD_ENDPOINTS: http://etcd:2379 SDN_CONTROLLER_PORT: "8005" RUST_LOG: info depends_on: patroni: condition: service_healthy - networks: - - csf-internal restart: unless-stopped volumes: etcd_data: patroni_data: - -networks: - csf-internal: - driver: bridge COMPOSE cat > ${composeDir}/patroni-bootstrap.sh <<'BOOTSTRAP' @@ -368,15 +296,27 @@ BOOTSTRAP deps = []; }; - nix = { - settings = { - experimental-features = [ "nix-command" "flakes" ]; - auto-optimise-store = true; - }; - gc = { - automatic = true; - dates = "weekly"; - options = "--delete-older-than 30d"; + systemd.services.csf-control-plane = { + description = "CSF Control Plane (Docker Compose)"; + after = [ "docker.service" "network-online.target" ]; + requires = [ "docker.service" ]; + wants = [ "network-online.target" ]; + wantedBy = [ "multi-user.target" ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + WorkingDirectory = composeDir; + ExecStart = "${pkgs.docker}/bin/docker compose up -d --remove-orphans"; + ExecStop = "${pkgs.docker}/bin/docker compose down"; + TimeoutStartSec = "600"; }; }; + + environment.systemPackages = with pkgs; [ + docker-compose + curl + git + jq + etcd + ]; }