From d9e1ad6ecd29d367a9fa3f6f26dd40304e641cd7 Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 26 Mar 2026 12:58:20 +0100 Subject: [PATCH 1/7] Committed reservations allocation lifecycle --- .../reservations/commitments/config.go | 17 +- .../reservations/commitments/controller.go | 251 +++++++++++++----- .../commitments/controller_test.go | 206 ++++++++++++++ 3 files changed, 404 insertions(+), 70 deletions(-) diff --git a/internal/scheduling/reservations/commitments/config.go b/internal/scheduling/reservations/commitments/config.go index 7a6c9005f..17d89c8c0 100644 --- a/internal/scheduling/reservations/commitments/config.go +++ b/internal/scheduling/reservations/commitments/config.go @@ -11,10 +11,17 @@ import ( type Config struct { - // RequeueIntervalActive is the interval for requeueing active reservations for verification. + // RequeueIntervalActive is the interval for requeueing active reservations for periodic verification. RequeueIntervalActive time.Duration `json:"committedResourceRequeueIntervalActive"` // RequeueIntervalRetry is the interval for requeueing when retrying after knowledge is not ready. RequeueIntervalRetry time.Duration `json:"committedResourceRequeueIntervalRetry"` + // AllocationGracePeriod is the time window after a VM is allocated to a reservation + // during which it's expected to appear on the target host. VMs not confirmed within + // this period are considered stale and removed from the reservation. + AllocationGracePeriod time.Duration `json:"committedResourceAllocationGracePeriod"` + // RequeueIntervalGracePeriod is the interval for requeueing when VMs are in grace period. + // Shorter than RequeueIntervalActive for faster verification of new allocations. + RequeueIntervalGracePeriod time.Duration `json:"committedResourceRequeueIntervalGracePeriod"` // PipelineDefault is the default pipeline used for scheduling committed resource reservations. 
PipelineDefault string `json:"committedResourcePipelineDefault"` @@ -68,6 +75,12 @@ func (c *Config) ApplyDefaults() { if c.RequeueIntervalRetry == 0 { c.RequeueIntervalRetry = defaults.RequeueIntervalRetry } + if c.RequeueIntervalGracePeriod == 0 { + c.RequeueIntervalGracePeriod = defaults.RequeueIntervalGracePeriod + } + if c.AllocationGracePeriod == 0 { + c.AllocationGracePeriod = defaults.AllocationGracePeriod + } if c.PipelineDefault == "" { c.PipelineDefault = defaults.PipelineDefault } @@ -88,6 +101,8 @@ func DefaultConfig() Config { return Config{ RequeueIntervalActive: 5 * time.Minute, RequeueIntervalRetry: 1 * time.Minute, + RequeueIntervalGracePeriod: 1 * time.Minute, + AllocationGracePeriod: 15 * time.Minute, PipelineDefault: "kvm-general-purpose-load-balancing", SchedulerURL: "http://localhost:8080/scheduler/nova/external", ChangeAPIWatchReservationsTimeout: 10 * time.Second, diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index 9c238aeee..c2e364721 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -23,9 +23,9 @@ import ( schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova" "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/knowledge/datasources/plugins/openstack/nova" "github.com/cobaltcore-dev/cortex/internal/knowledge/db" "github.com/cobaltcore-dev/cortex/internal/knowledge/extractor/plugins/compute" + schedulingnova "github.com/cobaltcore-dev/cortex/internal/scheduling/nova" "github.com/cobaltcore-dev/cortex/internal/scheduling/reservations" "github.com/cobaltcore-dev/cortex/pkg/multicluster" hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" @@ -44,6 +44,8 @@ type CommitmentReservationController struct { DB *db.DB // SchedulerClient for making scheduler API calls. 
SchedulerClient *reservations.SchedulerClient + // NovaClient for direct Nova API calls (real-time VM status). + NovaClient schedulingnova.NovaClient } // Reconcile is part of the main kubernetes reconciliation loop which aims to @@ -96,13 +98,18 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr if meta.IsStatusConditionTrue(res.Status.Conditions, v1alpha1.ReservationConditionReady) { logger.V(1).Info("reservation is active, verifying allocations") - // Verify all allocations in Spec against actual VM state from database - if err := r.reconcileAllocations(ctx, &res); err != nil { + // Verify all allocations in Spec against actual VM state + result, err := r.reconcileAllocations(ctx, &res) + if err != nil { logger.Error(err, "failed to reconcile allocations") return ctrl.Result{}, err } - // Requeue periodically to keep verifying allocations + // Requeue with appropriate interval based on allocation state + // Use shorter interval if there are allocations in grace period for faster verification + if result.HasAllocationsInGracePeriod { + return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalGracePeriod}, nil + } return ctrl.Result{RequeueAfter: r.Conf.RequeueIntervalActive}, nil } @@ -322,82 +329,204 @@ func (r *CommitmentReservationController) Reconcile(ctx context.Context, req ctr return ctrl.Result{}, nil } -// reconcileAllocations verifies all allocations in Spec against actual Nova VM state. +// reconcileAllocationsResult holds the outcome of allocation reconciliation. +type reconcileAllocationsResult struct { + // HasAllocationsInGracePeriod is true if any allocations are still in grace period. + HasAllocationsInGracePeriod bool +} + +// reconcileAllocations verifies all allocations in Spec against actual VM state. // It updates Status.Allocations based on the actual host location of each VM. 
-func (r *CommitmentReservationController) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) error { +// For new allocations (within grace period), it uses the Nova API for real-time status. +// For older allocations, it uses the Hypervisor CRD to check if VM is on the expected host. +func (r *CommitmentReservationController) reconcileAllocations(ctx context.Context, res *v1alpha1.Reservation) (*reconcileAllocationsResult, error) { logger := LoggerFromContext(ctx).WithValues("component", "controller") + result := &reconcileAllocationsResult{} + now := time.Now() // Skip if no CommittedResourceReservation if res.Spec.CommittedResourceReservation == nil { - return nil + return result, nil } - // TODO trigger migrations of unused reservations (to PAYG VMs) - // Skip if no allocations to verify if len(res.Spec.CommittedResourceReservation.Allocations) == 0 { logger.V(1).Info("no allocations to verify", "reservation", res.Name) - return nil + return result, nil } - // Query all VMs for this project from the database - projectID := res.Spec.CommittedResourceReservation.ProjectID - serverMap, err := r.listServersByProjectID(ctx, projectID) - if err != nil { - return fmt.Errorf("failed to list servers for project %s: %w", projectID, err) + expectedHost := res.Status.Host + + // Fetch the Hypervisor CRD for the expected host (for older allocations) + var hypervisor hv1.Hypervisor + hvInstanceSet := make(map[string]bool) + if expectedHost != "" { + if err := r.Get(ctx, client.ObjectKey{Name: expectedHost}, &hypervisor); err != nil { + if client.IgnoreNotFound(err) != nil { + return nil, fmt.Errorf("failed to get hypervisor %s: %w", expectedHost, err) + } + // Hypervisor not found - all older allocations will be checked via Nova API fallback + logger.Info("hypervisor CRD not found", "host", expectedHost) + } else { + // Build set of active VM UUIDs on this hypervisor for O(1) lookup + for _, inst := range hypervisor.Status.Instances { + if inst.Active { + 
hvInstanceSet[inst.ID] = true + } + } + logger.V(1).Info("fetched hypervisor instances", "host", expectedHost, "instanceCount", len(hvInstanceSet)) + } } - // initialize + // Initialize status if res.Status.CommittedResourceReservation == nil { res.Status.CommittedResourceReservation = &v1alpha1.CommittedResourceReservationStatus{} } // Build new Status.Allocations map based on actual VM locations newStatusAllocations := make(map[string]string) + // Track allocations to remove from Spec (stale/leaving VMs) + var allocationsToRemove []string + + for vmUUID, allocation := range res.Spec.CommittedResourceReservation.Allocations { + allocationAge := now.Sub(allocation.CreationTimestamp.Time) + isInGracePeriod := allocationAge < r.Conf.AllocationGracePeriod + + if isInGracePeriod { + // New allocation: use Nova API for real-time status + result.HasAllocationsInGracePeriod = true + + if r.NovaClient == nil { + // No Nova client - skip verification for now, retry later + logger.V(1).Info("Nova client not available, skipping new allocation verification", + "vm", vmUUID, + "allocationAge", allocationAge) + continue + } - for vmUUID := range res.Spec.CommittedResourceReservation.Allocations { - server, exists := serverMap[vmUUID] - if exists { - // VM found - record its actual host location - actualHost := server.OSEXTSRVATTRHost - newStatusAllocations[vmUUID] = actualHost - - logger.V(1).Info("verified VM allocation", - "vm", vmUUID, - "actualHost", actualHost, - "expectedHost", res.Status.Host) - } else { - // VM not found in database - logger.Info("VM not found in database", - "vm", vmUUID, - "reservation", res.Name, - "projectID", projectID) + server, err := r.NovaClient.Get(ctx, vmUUID) + if err != nil { + // VM not yet available in Nova (still spawning) - retry on next reconcile + logger.V(1).Info("VM not yet available in Nova API", + "vm", vmUUID, + "error", err.Error(), + "allocationAge", allocationAge) + // Keep in Spec, don't add to Status - will retry on next 
reconcile + continue + } - // TODO handle entering and leave event + actualHost := server.ComputeHost + switch { + case actualHost == expectedHost: + // VM is on expected host - confirmed running + newStatusAllocations[vmUUID] = actualHost + logger.V(1).Info("verified new VM allocation via Nova API", + "vm", vmUUID, + "actualHost", actualHost, + "allocationAge", allocationAge) + case actualHost != "": + // VM is on different host - migration scenario (log for now) + newStatusAllocations[vmUUID] = actualHost + logger.Info("VM on different host than expected (migration?)", + "vm", vmUUID, + "actualHost", actualHost, + "expectedHost", expectedHost, + "allocationAge", allocationAge) + default: + // VM not yet on any host - still spawning + logger.V(1).Info("VM not yet on host (spawning)", + "vm", vmUUID, + "status", server.Status, + "allocationAge", allocationAge) + // Keep in Spec, don't add to Status - will retry on next reconcile + } + } else { + // Older allocation: use Hypervisor CRD for verification + if hvInstanceSet[vmUUID] { + // VM found on expected hypervisor - confirmed running + newStatusAllocations[vmUUID] = expectedHost + logger.V(1).Info("verified VM allocation via Hypervisor CRD", + "vm", vmUUID, + "host", expectedHost) + } else { + // VM not found on expected hypervisor - check Nova API as fallback + if r.NovaClient != nil { + novaServer, err := r.NovaClient.Get(ctx, vmUUID) + if err == nil && novaServer.ComputeHost != "" { + // VM exists but on different host - migration or placement change + newStatusAllocations[vmUUID] = novaServer.ComputeHost + logger.Info("VM found via Nova API fallback (not on expected host)", + "vm", vmUUID, + "actualHost", novaServer.ComputeHost, + "expectedHost", expectedHost) + continue + } + // Nova API confirms VM doesn't exist or has no host + logger.V(1).Info("Nova API confirmed VM not found", + "vm", vmUUID, + "error", err) + } + // VM not found on hypervisor and not in Nova - mark for removal (leaving VM) + 
allocationsToRemove = append(allocationsToRemove, vmUUID) + logger.Info("removing stale allocation (VM not found on hypervisor or Nova)", + "vm", vmUUID, + "reservation", res.Name, + "expectedHost", expectedHost, + "allocationAge", allocationAge, + "gracePeriod", r.Conf.AllocationGracePeriod) + } } } - // Patch the reservation status + // Patch the reservation old := res.DeepCopy() + specChanged := false + + // Remove stale allocations from Spec + if len(allocationsToRemove) > 0 { + for _, vmUUID := range allocationsToRemove { + delete(res.Spec.CommittedResourceReservation.Allocations, vmUUID) + } + specChanged = true + } // Update Status.Allocations res.Status.CommittedResourceReservation.Allocations = newStatusAllocations + // Patch Spec if changed (stale allocations removed) + if specChanged { + if err := r.Patch(ctx, res, client.MergeFrom(old)); err != nil { + if client.IgnoreNotFound(err) == nil { + return result, nil + } + return nil, fmt.Errorf("failed to patch reservation spec: %w", err) + } + // Re-fetch to get the updated resource version for status patch + if err := r.Get(ctx, client.ObjectKeyFromObject(res), res); err != nil { + if client.IgnoreNotFound(err) == nil { + return result, nil + } + return nil, fmt.Errorf("failed to re-fetch reservation: %w", err) + } + old = res.DeepCopy() + } + + // Patch Status patch := client.MergeFrom(old) if err := r.Status().Patch(ctx, res, patch); err != nil { - // Ignore not-found errors during background deletion if client.IgnoreNotFound(err) == nil { - // Object was deleted, no need to continue - return nil + return result, nil } - return fmt.Errorf("failed to patch reservation status: %w", err) + return nil, fmt.Errorf("failed to patch reservation status: %w", err) } logger.V(1).Info("reconciled allocations", "specAllocations", len(res.Spec.CommittedResourceReservation.Allocations), - "statusAllocations", len(newStatusAllocations)) + "statusAllocations", len(newStatusAllocations), + "removedAllocations", 
len(allocationsToRemove), + "hasAllocationsInGracePeriod", result.HasAllocationsInGracePeriod) - return nil + return result, nil } // getPipelineForFlavorGroup returns the pipeline name for a given flavor group. @@ -432,36 +561,20 @@ func (r *CommitmentReservationController) Init(ctx context.Context, client clien r.SchedulerClient = reservations.NewSchedulerClient(conf.SchedulerURL) logf.FromContext(ctx).Info("scheduler client initialized for commitment reservation controller", "url", conf.SchedulerURL) - return nil -} - -func (r *CommitmentReservationController) listServersByProjectID(ctx context.Context, projectID string) (map[string]*nova.Server, error) { - if r.DB == nil { - return nil, errors.New("database connection not initialized") - } - - logger := LoggerFromContext(ctx).WithValues("component", "controller") - - // Query servers from the database cache. - var servers []nova.Server - _, err := r.DB.Select(&servers, - "SELECT * FROM openstack_servers WHERE tenant_id = $1", - projectID) - if err != nil { - return nil, fmt.Errorf("failed to query servers from database: %w", err) - } - - logger.V(1).Info("queried servers from database", - "projectID", projectID, - "serverCount", len(servers)) - - // Build lookup map for O(1) access by VM UUID. - serverMap := make(map[string]*nova.Server, len(servers)) - for i := range servers { - serverMap[servers[i].ID] = &servers[i] + // Initialize Nova client for real-time VM status checks (optional). + // Skip if NovaClient is already set (e.g., injected for testing) or if keystone not configured. 
+ if r.NovaClient == nil && conf.KeystoneSecretRef.Name != "" { + r.NovaClient = schedulingnova.NewNovaClient() + if err := r.NovaClient.Init(ctx, client, schedulingnova.NovaClientConfig{ + KeystoneSecretRef: conf.KeystoneSecretRef, + SSOSecretRef: conf.SSOSecretRef, + }); err != nil { + return fmt.Errorf("failed to initialize Nova client: %w", err) + } + logf.FromContext(ctx).Info("Nova client initialized for commitment reservation controller") } - return serverMap, nil + return nil } // commitmentReservationPredicate filters to only watch commitment reservations. diff --git a/internal/scheduling/reservations/commitments/controller_test.go b/internal/scheduling/reservations/commitments/controller_test.go index 5af5dfca9..20c8f55d2 100644 --- a/internal/scheduling/reservations/commitments/controller_test.go +++ b/internal/scheduling/reservations/commitments/controller_test.go @@ -18,6 +18,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" schedulerdelegationapi "github.com/cobaltcore-dev/cortex/api/external/nova" @@ -140,6 +141,211 @@ func TestCommitmentReservationController_Reconcile(t *testing.T) { } } +// ============================================================================ +// Test: reconcileAllocations +// ============================================================================ + +// Note: Full reconcileAllocations tests require mocking NovaClient, which uses +// unexported types (nova.server, nova.migration). Tests for the Nova API path +// would need to be placed in the nova package or the types would need to be exported. +// For now, we test only the Hypervisor CRD path (when NovaClient is nil). 
+ +func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) { + scheme := runtime.NewScheme() + if err := v1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add scheme: %v", err) + } + if err := hv1.AddToScheme(scheme); err != nil { + t.Fatalf("Failed to add hypervisor scheme: %v", err) + } + + now := time.Now() + recentTime := metav1.NewTime(now.Add(-5 * time.Minute)) // 5 minutes ago (within grace period) + oldTime := metav1.NewTime(now.Add(-30 * time.Minute)) // 30 minutes ago (past grace period) + + tests := []struct { + name string + reservation *v1alpha1.Reservation + hypervisor *hv1.Hypervisor + config Config + expectedStatusAllocations map[string]string + expectedHasGracePeriodAllocs bool + }{ + { + name: "old allocation - VM found on hypervisor CRD", + reservation: newTestCRReservation(map[string]metav1.Time{ + "vm-1": oldTime, + }), + hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{ + {ID: "vm-1", Name: "vm-1", Active: true}, + }), + config: Config{AllocationGracePeriod: 15 * time.Minute}, + expectedStatusAllocations: map[string]string{"vm-1": "host-1"}, + expectedHasGracePeriodAllocs: false, + }, + { + name: "old allocation - VM not on hypervisor CRD (no NovaClient fallback)", + reservation: newTestCRReservation(map[string]metav1.Time{ + "vm-1": oldTime, + }), + hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{}), // Empty + config: Config{AllocationGracePeriod: 15 * time.Minute}, + expectedStatusAllocations: map[string]string{}, // Not confirmed + expectedHasGracePeriodAllocs: false, + }, + { + name: "new allocation within grace period - no Nova client", + reservation: newTestCRReservation(map[string]metav1.Time{ + "vm-1": recentTime, + }), + hypervisor: nil, + config: Config{AllocationGracePeriod: 15 * time.Minute}, + expectedStatusAllocations: map[string]string{}, // Can't verify without Nova + expectedHasGracePeriodAllocs: true, + }, + { + name: "mixed allocations - old verified via CRD, new in grace period", + 
reservation: newTestCRReservation(map[string]metav1.Time{ + "vm-new": recentTime, // In grace period + "vm-old": oldTime, // Past grace period + }), + hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{ + {ID: "vm-old", Name: "vm-old", Active: true}, + }), + config: Config{AllocationGracePeriod: 15 * time.Minute}, + expectedStatusAllocations: map[string]string{"vm-old": "host-1"}, // Only old one confirmed via CRD + expectedHasGracePeriodAllocs: true, + }, + { + name: "empty allocations - no work to do", + reservation: newTestCRReservation(map[string]metav1.Time{}), + hypervisor: nil, + config: Config{AllocationGracePeriod: 15 * time.Minute}, + expectedStatusAllocations: map[string]string{}, + expectedHasGracePeriodAllocs: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Build fake client with objects + objects := []client.Object{tt.reservation} + if tt.hypervisor != nil { + objects = append(objects, tt.hypervisor) + } + + k8sClient := fake.NewClientBuilder(). + WithScheme(scheme). + WithObjects(objects...). + WithStatusSubresource(&v1alpha1.Reservation{}). 
+ Build() + + controller := &CommitmentReservationController{ + Client: k8sClient, + Scheme: scheme, + Conf: tt.config, + NovaClient: nil, // No NovaClient - testing Hypervisor CRD path only + } + + ctx := WithNewGlobalRequestID(context.Background()) + result, err := controller.reconcileAllocations(ctx, tt.reservation) + if err != nil { + t.Fatalf("reconcileAllocations() error = %v", err) + } + + // Check grace period result + if result.HasAllocationsInGracePeriod != tt.expectedHasGracePeriodAllocs { + t.Errorf("expected HasAllocationsInGracePeriod=%v, got %v", + tt.expectedHasGracePeriodAllocs, result.HasAllocationsInGracePeriod) + } + + // Re-fetch reservation to check updates + var updated v1alpha1.Reservation + if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(tt.reservation), &updated); err != nil { + t.Fatalf("failed to get updated reservation: %v", err) + } + + // Check status allocations + actualStatusAllocs := map[string]string{} + if updated.Status.CommittedResourceReservation != nil { + actualStatusAllocs = updated.Status.CommittedResourceReservation.Allocations + } + + if len(actualStatusAllocs) != len(tt.expectedStatusAllocations) { + t.Errorf("expected %d status allocations, got %d: %v", + len(tt.expectedStatusAllocations), len(actualStatusAllocs), actualStatusAllocs) + } + + for vmUUID, expectedHost := range tt.expectedStatusAllocations { + if actualHost, ok := actualStatusAllocs[vmUUID]; !ok { + t.Errorf("expected VM %s in status allocations", vmUUID) + } else if actualHost != expectedHost { + t.Errorf("VM %s: expected host %s, got %s", vmUUID, expectedHost, actualHost) + } + } + }) + } +} + +// newTestCRReservation creates a test CR reservation with allocations on "host-1". 
+func newTestCRReservation(allocations map[string]metav1.Time) *v1alpha1.Reservation { + const host = "host-1" + specAllocs := make(map[string]v1alpha1.CommittedResourceAllocation) + for vmUUID, timestamp := range allocations { + specAllocs[vmUUID] = v1alpha1.CommittedResourceAllocation{ + CreationTimestamp: timestamp, + Resources: map[hv1.ResourceName]resource.Quantity{ + "memory": resource.MustParse("4Gi"), + "cpu": resource.MustParse("2"), + }, + } + } + + return &v1alpha1.Reservation{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-reservation", + }, + Spec: v1alpha1.ReservationSpec{ + Type: v1alpha1.ReservationTypeCommittedResource, + TargetHost: host, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationSpec{ + ProjectID: "test-project", + ResourceName: "test-flavor", + Allocations: specAllocs, + }, + }, + Status: v1alpha1.ReservationStatus{ + Host: host, + Conditions: []metav1.Condition{ + { + Type: v1alpha1.ReservationConditionReady, + Status: metav1.ConditionTrue, + Reason: "ReservationActive", + }, + }, + CommittedResourceReservation: &v1alpha1.CommittedResourceReservationStatus{ + Allocations: make(map[string]string), + }, + }, + } +} + +// newTestHypervisorCRD creates a test Hypervisor CRD with instances. 
+func newTestHypervisorCRD(name string, instances []hv1.Instance) *hv1.Hypervisor { + return &hv1.Hypervisor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Status: hv1.HypervisorStatus{ + Instances: instances, + }, + } +} + +// ============================================================================ +// Test: reconcileInstanceReservation_Success (existing test) +// ============================================================================ + func TestCommitmentReservationController_reconcileInstanceReservation_Success(t *testing.T) { scheme := runtime.NewScheme() if err := v1alpha1.AddToScheme(scheme); err != nil { From 0e2d00e887ad8bf4f0335ab3f3e493f2f611fa6c Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 26 Mar 2026 13:14:02 +0100 Subject: [PATCH 2/7] consider inactive VMs correctly --- .../reservations/commitments/controller.go | 7 +++---- .../reservations/commitments/controller_test.go | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/internal/scheduling/reservations/commitments/controller.go b/internal/scheduling/reservations/commitments/controller.go index c2e364721..0560313b3 100644 --- a/internal/scheduling/reservations/commitments/controller.go +++ b/internal/scheduling/reservations/commitments/controller.go @@ -368,11 +368,10 @@ func (r *CommitmentReservationController) reconcileAllocations(ctx context.Conte // Hypervisor not found - all older allocations will be checked via Nova API fallback logger.Info("hypervisor CRD not found", "host", expectedHost) } else { - // Build set of active VM UUIDs on this hypervisor for O(1) lookup + // Build set of all VM UUIDs on this hypervisor for O(1) lookup + // Include both active and inactive VMs - stopped/shelved VMs still consume the reservation slot for _, inst := range hypervisor.Status.Instances { - if inst.Active { - hvInstanceSet[inst.ID] = true - } + hvInstanceSet[inst.ID] = true } logger.V(1).Info("fetched hypervisor instances", "host", expectedHost, 
"instanceCount", len(hvInstanceSet)) } diff --git a/internal/scheduling/reservations/commitments/controller_test.go b/internal/scheduling/reservations/commitments/controller_test.go index 20c8f55d2..68d13faf3 100644 --- a/internal/scheduling/reservations/commitments/controller_test.go +++ b/internal/scheduling/reservations/commitments/controller_test.go @@ -183,6 +183,18 @@ func TestReconcileAllocations_HypervisorCRDPath(t *testing.T) { expectedStatusAllocations: map[string]string{"vm-1": "host-1"}, expectedHasGracePeriodAllocs: false, }, + { + name: "old allocation - inactive VM still counted (stopped/shelved)", + reservation: newTestCRReservation(map[string]metav1.Time{ + "vm-stopped": oldTime, + }), + hypervisor: newTestHypervisorCRD("host-1", []hv1.Instance{ + {ID: "vm-stopped", Name: "vm-stopped", Active: false}, // Inactive VM should still be found + }), + config: Config{AllocationGracePeriod: 15 * time.Minute}, + expectedStatusAllocations: map[string]string{"vm-stopped": "host-1"}, + expectedHasGracePeriodAllocs: false, + }, { name: "old allocation - VM not on hypervisor CRD (no NovaClient fallback)", reservation: newTestCRReservation(map[string]metav1.Time{ @@ -331,6 +343,8 @@ func newTestCRReservation(allocations map[string]metav1.Time) *v1alpha1.Reservat } // newTestHypervisorCRD creates a test Hypervisor CRD with instances. 
+// +//nolint:unparam // name parameter allows future test flexibility func newTestHypervisorCRD(name string, instances []hv1.Instance) *hv1.Hypervisor { return &hv1.Hypervisor{ ObjectMeta: metav1.ObjectMeta{ From 4729f803b38106197519b9ac9304401442f92502 Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 26 Mar 2026 15:42:42 +0100 Subject: [PATCH 3/7] fix alerts --- .../cortex-nova/alerts/nova.alerts.yaml | 43 +++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 784830aac..4b5bba931 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -486,7 +486,10 @@ groups: # Committed Resource Syncer Alerts - alert: CortexNovaCommittedResourceSyncerNotRunning - expr: increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0 + expr: | + increase(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}[2h]) == 0 + or + absent(cortex_committed_resource_syncer_runs_total{service="cortex-nova-metrics"}) for: 5m labels: context: committed-resource-syncer @@ -497,8 +500,10 @@ groups: annotations: summary: "Committed Resource syncer not running" description: > - The committed resource syncer has not run in the last 2 hours. This indicates - that the syncer may have stopped or is encountering errors. Check the syncer logs for errors. + The committed resource syncer has not run in the last 2 hours or the metric is missing. + This indicates that the syncer may have stopped, is encountering errors, or the feature + is not enabled. Check the syncer logs for errors or verify the commitments-sync-task is + in the enabledTasks configuration. 
- alert: CortexNovaCommittedResourceSyncerErrorsHigh expr: increase(cortex_committed_resource_syncer_errors_total{service="cortex-nova-metrics"}[1h]) > 3 @@ -517,8 +522,11 @@ groups: - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh expr: | - rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0.05 + ( + rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) + ) > 0.05 + and on() rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0 for: 15m labels: context: committed-resource-syncer @@ -537,8 +545,11 @@ groups: - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh expr: | - rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0 + ( + rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) + ) > 0 + and on() rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0 for: 15m labels: context: committed-resource-syncer @@ -557,10 +568,13 @@ groups: - alert: CortexNovaCommittedResourceSyncerLocalChangeRateHigh expr: | ( - rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + - rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + - 
rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0.01 + ( + rate(cortex_committed_resource_syncer_reservations_created_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_deleted_total{service="cortex-nova-metrics"}[1h]) + + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + ) / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) + ) > 0.01 + and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 for: 15m labels: context: committed-resource-syncer @@ -578,8 +592,11 @@ groups: - alert: CortexNovaCommittedResourceSyncerRepairRateHigh expr: | - rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 + ( + rate(cortex_committed_resource_syncer_reservations_repaired_total{service="cortex-nova-metrics"}[1h]) + / rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) + ) > 0 + and on() rate(cortex_committed_resource_syncer_commitments_processed_total{service="cortex-nova-metrics"}[1h]) > 0 for: 15m labels: context: committed-resource-syncer From b8f55a1e6a630a56762b3cb2990d766ee3b1e970 Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 26 Mar 2026 16:06:11 +0100 Subject: [PATCH 4/7] allocation bookeeping --- .../cortex-nova/templates/pipelines_kvm.yaml | 28 ++ .../filter_committed_resource_bookkeeping.go | 273 ++++++++++++++++++ .../filters/filter_has_enough_capacity.go | 1 + 3 files changed, 302 insertions(+) create mode 100644 internal/scheduling/nova/plugins/filters/filter_committed_resource_bookkeeping.go 
diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index 8b2519207..d9bd2004d 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -102,6 +102,20 @@ spec: from the nova scheduler request spec. It supports filtering by host and by aggregates. Aggregates use AND logic between list elements, with comma-separated UUIDs within an element using OR logic. + - name: filter_committed_resource_bookkeeping + description: | + Bookkeeping for committed resource (CR) reservations. Note that unlocking + of CR capacity happens in filter_has_enough_capacity when project ID and + resource group (hw_version) match. This filter handles additional tasks: + tracking which VMs are expected to land on which CR reservations by + updating reservation spec allocations. In the future, this filter will + also enforce that VMs use available CR reservation slots when sufficient + slots exist among candidates. + params: + # Enable updating CR reservation allocations with VM assignments + - {key: updateReservationAllocations, boolValue: true} + # Future: enforce reservation slots (not yet implemented) + - {key: enforceReservationSlots, boolValue: false} weighers: - name: kvm_prefer_smaller_hosts params: @@ -241,6 +255,20 @@ spec: from the nova scheduler request spec. It supports filtering by host and by aggregates. Aggregates use AND logic between list elements, with comma-separated UUIDs within an element using OR logic. + - name: filter_committed_resource_bookkeeping + description: | + Bookkeeping for committed resource (CR) reservations. Note that unlocking + of CR capacity happens in filter_has_enough_capacity when project ID and + resource group (hw_version) match. This filter handles additional tasks: + tracking which VMs are expected to land on which CR reservations by + updating reservation spec allocations. 
In the future, this filter will + also enforce that VMs use available CR reservation slots when sufficient + slots exist among candidates. + params: + # Enable updating CR reservation allocations with VM assignments + - {key: updateReservationAllocations, boolValue: true} + # Future: enforce reservation slots (not yet implemented) + - {key: enforceReservationSlots, boolValue: false} weighers: - name: kvm_prefer_smaller_hosts params: diff --git a/internal/scheduling/nova/plugins/filters/filter_committed_resource_bookkeeping.go b/internal/scheduling/nova/plugins/filters/filter_committed_resource_bookkeeping.go new file mode 100644 index 000000000..c628fd434 --- /dev/null +++ b/internal/scheduling/nova/plugins/filters/filter_committed_resource_bookkeeping.go @@ -0,0 +1,273 @@ +// Copyright SAP SE +// SPDX-License-Identifier: Apache-2.0 + +package filters + +import ( + "context" + "log/slog" + "time" + + api "github.com/cobaltcore-dev/cortex/api/external/nova" + "github.com/cobaltcore-dev/cortex/api/v1alpha1" + "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" + hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +type FilterCommittedResourceBookkeepingOpts struct { + // UpdateReservationAllocations enables adding VMs to CR reservation spec allocations + // when a matching reservation's host is among the candidates. + // This tracks which VMs are expected to land on which reservations. + // Default: false + UpdateReservationAllocations bool `json:"updateReservationAllocations,omitempty"` + + // EnforceReservationSlots controls whether candidates should be filtered to only + // hosts with available CR reservation slots when enough slots exist. + // When true and sufficient reservation slots are available among candidates, + // non-reservation hosts are filtered out. 
+ // Default: false (not yet implemented) + EnforceReservationSlots bool `json:"enforceReservationSlots,omitempty"` +} + +func (FilterCommittedResourceBookkeepingOpts) Validate() error { return nil } + +type FilterCommittedResourceBookkeeping struct { + lib.BaseFilter[api.ExternalSchedulerRequest, FilterCommittedResourceBookkeepingOpts] +} + +// Filter for committed resource reservation bookkeeping. +// +// Note: Unlocking of CR reservation capacity happens in filter_has_enough_capacity +// when project ID and resource group (hw_version) match. This filter handles +// additional bookkeeping tasks: +// +// 1. UpdateReservationAllocations: Adds VMs to matching reservation spec allocations +// to track which VMs are expected to use which reservations +// 2. EnforceReservationSlots: (Future) Filters candidates to reservation hosts +// when sufficient slots exist +// +// This filter should run AFTER filter_has_enough_capacity to ensure candidates +// have already been filtered based on physical capacity. 
+func (s *FilterCommittedResourceBookkeeping) Run(traceLog *slog.Logger, request api.ExternalSchedulerRequest) (*lib.FilterWeigherPipelineStepResult, error) { + result := s.IncludeAllHostsFromRequest(request) + + // Skip if no features enabled + if !s.Options.UpdateReservationAllocations && !s.Options.EnforceReservationSlots { + return result, nil + } + + // Get request details + projectID := request.Spec.Data.ProjectID + resourceGroup := request.Spec.Data.Flavor.Data.ExtraSpecs["hw_version"] + instanceUUID := request.Spec.Data.InstanceUUID + + if projectID == "" || resourceGroup == "" { + traceLog.Debug("skipping CR reservation handling: missing projectID or resourceGroup") + return result, nil + } + + // List all reservations + var reservations v1alpha1.ReservationList + if err := s.Client.List(context.Background(), &reservations); err != nil { + return nil, err + } + + // Find matching CR reservations with spare capacity + var matchingReservations []v1alpha1.Reservation + for _, reservation := range reservations.Items { + if !s.isMatchingCRReservation(traceLog, reservation, projectID, resourceGroup, request) { + continue + } + matchingReservations = append(matchingReservations, reservation) + } + + traceLog.Debug("found matching CR reservations", + "count", len(matchingReservations), + "projectID", projectID, + "resourceGroup", resourceGroup) + + // Update reservation allocations if enabled + if s.Options.UpdateReservationAllocations && len(matchingReservations) > 0 && instanceUUID != "" { + s.updateReservationAllocations(traceLog, request, result.Activations, matchingReservations) + } + + // TODO: Implement EnforceReservationSlots logic + // When enabled, filter candidates to only hosts with reservation slots if sufficient slots exist + + return result, nil +} + +// isMatchingCRReservation checks if a reservation is a matching CR reservation with spare capacity. 
+func (s *FilterCommittedResourceBookkeeping) isMatchingCRReservation( + traceLog *slog.Logger, + reservation v1alpha1.Reservation, + projectID, resourceGroup string, + request api.ExternalSchedulerRequest, +) bool { + // Must be Ready + if !meta.IsStatusConditionTrue(reservation.Status.Conditions, v1alpha1.ReservationConditionReady) { + return false + } + + // Must be a CR reservation + if reservation.Spec.Type != v1alpha1.ReservationTypeCommittedResource { + return false + } + + // Must have CR spec + if reservation.Spec.CommittedResourceReservation == nil { + return false + } + + // Must match project and resource group + if reservation.Spec.CommittedResourceReservation.ProjectID != projectID { + return false + } + if reservation.Spec.CommittedResourceReservation.ResourceGroup != resourceGroup { + return false + } + + // Must have a host + if reservation.Spec.TargetHost == "" && reservation.Status.Host == "" { + return false + } + + // Must have spare capacity + if !s.hasSpareCapacity(traceLog, reservation, request) { + return false + } + + return true +} + +// hasSpareCapacity checks if the reservation has enough spare capacity for the VM. 
+func (s *FilterCommittedResourceBookkeeping) hasSpareCapacity( + traceLog *slog.Logger, + reservation v1alpha1.Reservation, + request api.ExternalSchedulerRequest, +) bool { + // Calculate current usage from existing allocations + var usedCPU, usedMemory int64 + if reservation.Spec.CommittedResourceReservation != nil { + for _, allocation := range reservation.Spec.CommittedResourceReservation.Allocations { + if cpu, ok := allocation.Resources["cpu"]; ok { + usedCPU += cpu.Value() + } + if memory, ok := allocation.Resources["memory"]; ok { + usedMemory += memory.Value() + } + } + } + + // Get reservation's total capacity + var totalCPU, totalMemory int64 + if cpu, ok := reservation.Spec.Resources["cpu"]; ok { + totalCPU = cpu.Value() + } + if memory, ok := reservation.Spec.Resources["memory"]; ok { + totalMemory = memory.Value() + } + + // Calculate requested resources + //nolint:gosec // VCPUs and MemoryMB are bounded by OpenStack limits, no overflow risk + requestedCPU := int64(request.Spec.Data.Flavor.Data.VCPUs) + //nolint:gosec // VCPUs and MemoryMB are bounded by OpenStack limits, no overflow risk + requestedMemory := int64(request.Spec.Data.Flavor.Data.MemoryMB) * 1_000_000 // Convert MB to bytes + + // Check if there's enough spare capacity + spareCPU := totalCPU - usedCPU + spareMemory := totalMemory - usedMemory + + if spareCPU < requestedCPU || spareMemory < requestedMemory { + traceLog.Debug("reservation has insufficient spare capacity", + "reservation", reservation.Name, + "spareCPU", spareCPU, + "spareMemory", spareMemory, + "requestedCPU", requestedCPU, + "requestedMemory", requestedMemory) + return false + } + + return true +} + +// updateReservationAllocations adds the VM to the spec allocations of matching CR reservations +// whose host is among the candidates. 
+func (s *FilterCommittedResourceBookkeeping) updateReservationAllocations( + traceLog *slog.Logger, + request api.ExternalSchedulerRequest, + candidates map[string]float64, + matchingReservations []v1alpha1.Reservation, +) { + + instanceUUID := request.Spec.Data.InstanceUUID + if instanceUUID == "" { + traceLog.Warn("skipping reservation allocation update: no instance UUID in request") + return + } + + // Build resources from flavor + //nolint:gosec // VCPUs and MemoryMB are bounded by OpenStack limits, no overflow risk + vmResources := map[hv1.ResourceName]resource.Quantity{ + "cpu": *resource.NewQuantity(int64(request.Spec.Data.Flavor.Data.VCPUs), resource.DecimalSI), + "memory": *resource.NewQuantity(int64(request.Spec.Data.Flavor.Data.MemoryMB)*1_000_000, resource.DecimalSI), + } + + now := metav1.NewTime(time.Now()) + + for _, reservation := range matchingReservations { + // Get reservation host + reservationHost := reservation.Spec.TargetHost + if reservationHost == "" { + reservationHost = reservation.Status.Host + } + + // Check if reservation's host is among candidates + if _, isCandidate := candidates[reservationHost]; !isCandidate { + traceLog.Debug("skipping reservation allocation: host not among candidates", + "reservation", reservation.Name, + "host", reservationHost) + continue + } + + // Check if VM is already in allocations + if reservation.Spec.CommittedResourceReservation.Allocations != nil { + if _, exists := reservation.Spec.CommittedResourceReservation.Allocations[instanceUUID]; exists { + traceLog.Debug("VM already in reservation allocations", + "reservation", reservation.Name, + "instanceUUID", instanceUUID) + continue + } + } + + // Add VM to reservation allocations + reservationCopy := reservation.DeepCopy() + if reservationCopy.Spec.CommittedResourceReservation.Allocations == nil { + reservationCopy.Spec.CommittedResourceReservation.Allocations = make(map[string]v1alpha1.CommittedResourceAllocation) + } + 
reservationCopy.Spec.CommittedResourceReservation.Allocations[instanceUUID] = v1alpha1.CommittedResourceAllocation{ + CreationTimestamp: now, + Resources: vmResources, + } + + if err := s.Client.Update(context.Background(), reservationCopy); err != nil { + traceLog.Warn("failed to update reservation with VM allocation", + "reservation", reservation.Name, + "instanceUUID", instanceUUID, + "error", err) + // Continue with other reservations - this is best-effort + } else { + traceLog.Info("added VM to CR reservation spec allocations", + "reservation", reservation.Name, + "instanceUUID", instanceUUID, + "host", reservationHost) + } + } +} + +func init() { + Index["filter_committed_resource_bookkeeping"] = func() NovaFilter { return &FilterCommittedResourceBookkeeping{} } +} diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index d26c7c940..194270b19 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -329,6 +329,7 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa ) } } + return result, nil } From bd25b2e4c21acf7fddd7b382911546e126fb66f7 Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 26 Mar 2026 16:09:00 +0100 Subject: [PATCH 5/7] fix alerts --- helm/bundles/cortex-nova/alerts/nova.alerts.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml index 4b5bba931..654adbe1c 100644 --- a/helm/bundles/cortex-nova/alerts/nova.alerts.yaml +++ b/helm/bundles/cortex-nova/alerts/nova.alerts.yaml @@ -350,7 +350,7 @@ groups: - alert: CortexNovaCommittedResourceRejectionRateTooHigh expr: | - rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", 
result="rejected"}[5m]) + sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics", result="rejected"}[5m])) / sum(rate(cortex_committed_resource_change_api_commitment_changes_total{service="cortex-nova-metrics"}[5m])) > 0.5 for: 5m labels: @@ -523,10 +523,10 @@ groups: - alert: CortexNovaCommittedResourceSyncerUnitMismatchRateHigh expr: | ( - rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) + sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unit_mismatch"}[1h])) + / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) ) > 0.05 - and on() rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0 + and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 for: 15m labels: context: committed-resource-syncer @@ -546,10 +546,10 @@ groups: - alert: CortexNovaCommittedResourceSyncerUnknownFlavorGroupRateHigh expr: | ( - rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h]) - / rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) + sum(rate(cortex_committed_resource_syncer_commitments_skipped_total{service="cortex-nova-metrics", reason="unknown_flavor_group"}[1h])) + / sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) ) > 0 - and on() rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h]) > 0 + and on() sum(rate(cortex_committed_resource_syncer_commitments_total{service="cortex-nova-metrics"}[1h])) > 0 for: 15m labels: context: committed-resource-syncer From 
79f269b2719cb2c0948cca9f21fb62dc7f0f76a7 Mon Sep 17 00:00:00 2001 From: mblos Date: Thu, 26 Mar 2026 16:13:02 +0100 Subject: [PATCH 6/7] Revert "allocation bookeeping" This reverts commit b8f55a1e6a630a56762b3cb2990d766ee3b1e970. --- .../cortex-nova/templates/pipelines_kvm.yaml | 28 -- .../filter_committed_resource_bookkeeping.go | 273 ------------------ .../filters/filter_has_enough_capacity.go | 1 - 3 files changed, 302 deletions(-) delete mode 100644 internal/scheduling/nova/plugins/filters/filter_committed_resource_bookkeeping.go diff --git a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml index d9bd2004d..8b2519207 100644 --- a/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml +++ b/helm/bundles/cortex-nova/templates/pipelines_kvm.yaml @@ -102,20 +102,6 @@ spec: from the nova scheduler request spec. It supports filtering by host and by aggregates. Aggregates use AND logic between list elements, with comma-separated UUIDs within an element using OR logic. - - name: filter_committed_resource_bookkeeping - description: | - Bookkeeping for committed resource (CR) reservations. Note that unlocking - of CR capacity happens in filter_has_enough_capacity when project ID and - resource group (hw_version) match. This filter handles additional tasks: - tracking which VMs are expected to land on which CR reservations by - updating reservation spec allocations. In the future, this filter will - also enforce that VMs use available CR reservation slots when sufficient - slots exist among candidates. - params: - # Enable updating CR reservation allocations with VM assignments - - {key: updateReservationAllocations, boolValue: true} - # Future: enforce reservation slots (not yet implemented) - - {key: enforceReservationSlots, boolValue: false} weighers: - name: kvm_prefer_smaller_hosts params: @@ -255,20 +241,6 @@ spec: from the nova scheduler request spec. It supports filtering by host and by aggregates. 
Aggregates use AND logic between list elements, with comma-separated UUIDs within an element using OR logic. - - name: filter_committed_resource_bookkeeping - description: | - Bookkeeping for committed resource (CR) reservations. Note that unlocking - of CR capacity happens in filter_has_enough_capacity when project ID and - resource group (hw_version) match. This filter handles additional tasks: - tracking which VMs are expected to land on which CR reservations by - updating reservation spec allocations. In the future, this filter will - also enforce that VMs use available CR reservation slots when sufficient - slots exist among candidates. - params: - # Enable updating CR reservation allocations with VM assignments - - {key: updateReservationAllocations, boolValue: true} - # Future: enforce reservation slots (not yet implemented) - - {key: enforceReservationSlots, boolValue: false} weighers: - name: kvm_prefer_smaller_hosts params: diff --git a/internal/scheduling/nova/plugins/filters/filter_committed_resource_bookkeeping.go b/internal/scheduling/nova/plugins/filters/filter_committed_resource_bookkeeping.go deleted file mode 100644 index c628fd434..000000000 --- a/internal/scheduling/nova/plugins/filters/filter_committed_resource_bookkeeping.go +++ /dev/null @@ -1,273 +0,0 @@ -// Copyright SAP SE -// SPDX-License-Identifier: Apache-2.0 - -package filters - -import ( - "context" - "log/slog" - "time" - - api "github.com/cobaltcore-dev/cortex/api/external/nova" - "github.com/cobaltcore-dev/cortex/api/v1alpha1" - "github.com/cobaltcore-dev/cortex/internal/scheduling/lib" - hv1 "github.com/cobaltcore-dev/openstack-hypervisor-operator/api/v1" - "k8s.io/apimachinery/pkg/api/meta" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -type FilterCommittedResourceBookkeepingOpts struct { - // UpdateReservationAllocations enables adding VMs to CR reservation spec allocations - // when a matching reservation's host is among the 
candidates. - // This tracks which VMs are expected to land on which reservations. - // Default: false - UpdateReservationAllocations bool `json:"updateReservationAllocations,omitempty"` - - // EnforceReservationSlots controls whether candidates should be filtered to only - // hosts with available CR reservation slots when enough slots exist. - // When true and sufficient reservation slots are available among candidates, - // non-reservation hosts are filtered out. - // Default: false (not yet implemented) - EnforceReservationSlots bool `json:"enforceReservationSlots,omitempty"` -} - -func (FilterCommittedResourceBookkeepingOpts) Validate() error { return nil } - -type FilterCommittedResourceBookkeeping struct { - lib.BaseFilter[api.ExternalSchedulerRequest, FilterCommittedResourceBookkeepingOpts] -} - -// Filter for committed resource reservation bookkeeping. -// -// Note: Unlocking of CR reservation capacity happens in filter_has_enough_capacity -// when project ID and resource group (hw_version) match. This filter handles -// additional bookkeeping tasks: -// -// 1. UpdateReservationAllocations: Adds VMs to matching reservation spec allocations -// to track which VMs are expected to use which reservations -// 2. EnforceReservationSlots: (Future) Filters candidates to reservation hosts -// when sufficient slots exist -// -// This filter should run AFTER filter_has_enough_capacity to ensure candidates -// have already been filtered based on physical capacity. 
-func (s *FilterCommittedResourceBookkeeping) Run(traceLog *slog.Logger, request api.ExternalSchedulerRequest) (*lib.FilterWeigherPipelineStepResult, error) { - result := s.IncludeAllHostsFromRequest(request) - - // Skip if no features enabled - if !s.Options.UpdateReservationAllocations && !s.Options.EnforceReservationSlots { - return result, nil - } - - // Get request details - projectID := request.Spec.Data.ProjectID - resourceGroup := request.Spec.Data.Flavor.Data.ExtraSpecs["hw_version"] - instanceUUID := request.Spec.Data.InstanceUUID - - if projectID == "" || resourceGroup == "" { - traceLog.Debug("skipping CR reservation handling: missing projectID or resourceGroup") - return result, nil - } - - // List all reservations - var reservations v1alpha1.ReservationList - if err := s.Client.List(context.Background(), &reservations); err != nil { - return nil, err - } - - // Find matching CR reservations with spare capacity - var matchingReservations []v1alpha1.Reservation - for _, reservation := range reservations.Items { - if !s.isMatchingCRReservation(traceLog, reservation, projectID, resourceGroup, request) { - continue - } - matchingReservations = append(matchingReservations, reservation) - } - - traceLog.Debug("found matching CR reservations", - "count", len(matchingReservations), - "projectID", projectID, - "resourceGroup", resourceGroup) - - // Update reservation allocations if enabled - if s.Options.UpdateReservationAllocations && len(matchingReservations) > 0 && instanceUUID != "" { - s.updateReservationAllocations(traceLog, request, result.Activations, matchingReservations) - } - - // TODO: Implement EnforceReservationSlots logic - // When enabled, filter candidates to only hosts with reservation slots if sufficient slots exist - - return result, nil -} - -// isMatchingCRReservation checks if a reservation is a matching CR reservation with spare capacity. 
-func (s *FilterCommittedResourceBookkeeping) isMatchingCRReservation( - traceLog *slog.Logger, - reservation v1alpha1.Reservation, - projectID, resourceGroup string, - request api.ExternalSchedulerRequest, -) bool { - // Must be Ready - if !meta.IsStatusConditionTrue(reservation.Status.Conditions, v1alpha1.ReservationConditionReady) { - return false - } - - // Must be a CR reservation - if reservation.Spec.Type != v1alpha1.ReservationTypeCommittedResource { - return false - } - - // Must have CR spec - if reservation.Spec.CommittedResourceReservation == nil { - return false - } - - // Must match project and resource group - if reservation.Spec.CommittedResourceReservation.ProjectID != projectID { - return false - } - if reservation.Spec.CommittedResourceReservation.ResourceGroup != resourceGroup { - return false - } - - // Must have a host - if reservation.Spec.TargetHost == "" && reservation.Status.Host == "" { - return false - } - - // Must have spare capacity - if !s.hasSpareCapacity(traceLog, reservation, request) { - return false - } - - return true -} - -// hasSpareCapacity checks if the reservation has enough spare capacity for the VM. 
-func (s *FilterCommittedResourceBookkeeping) hasSpareCapacity( - traceLog *slog.Logger, - reservation v1alpha1.Reservation, - request api.ExternalSchedulerRequest, -) bool { - // Calculate current usage from existing allocations - var usedCPU, usedMemory int64 - if reservation.Spec.CommittedResourceReservation != nil { - for _, allocation := range reservation.Spec.CommittedResourceReservation.Allocations { - if cpu, ok := allocation.Resources["cpu"]; ok { - usedCPU += cpu.Value() - } - if memory, ok := allocation.Resources["memory"]; ok { - usedMemory += memory.Value() - } - } - } - - // Get reservation's total capacity - var totalCPU, totalMemory int64 - if cpu, ok := reservation.Spec.Resources["cpu"]; ok { - totalCPU = cpu.Value() - } - if memory, ok := reservation.Spec.Resources["memory"]; ok { - totalMemory = memory.Value() - } - - // Calculate requested resources - //nolint:gosec // VCPUs and MemoryMB are bounded by OpenStack limits, no overflow risk - requestedCPU := int64(request.Spec.Data.Flavor.Data.VCPUs) - //nolint:gosec // VCPUs and MemoryMB are bounded by OpenStack limits, no overflow risk - requestedMemory := int64(request.Spec.Data.Flavor.Data.MemoryMB) * 1_000_000 // Convert MB to bytes - - // Check if there's enough spare capacity - spareCPU := totalCPU - usedCPU - spareMemory := totalMemory - usedMemory - - if spareCPU < requestedCPU || spareMemory < requestedMemory { - traceLog.Debug("reservation has insufficient spare capacity", - "reservation", reservation.Name, - "spareCPU", spareCPU, - "spareMemory", spareMemory, - "requestedCPU", requestedCPU, - "requestedMemory", requestedMemory) - return false - } - - return true -} - -// updateReservationAllocations adds the VM to the spec allocations of matching CR reservations -// whose host is among the candidates. 
-func (s *FilterCommittedResourceBookkeeping) updateReservationAllocations( - traceLog *slog.Logger, - request api.ExternalSchedulerRequest, - candidates map[string]float64, - matchingReservations []v1alpha1.Reservation, -) { - - instanceUUID := request.Spec.Data.InstanceUUID - if instanceUUID == "" { - traceLog.Warn("skipping reservation allocation update: no instance UUID in request") - return - } - - // Build resources from flavor - //nolint:gosec // VCPUs and MemoryMB are bounded by OpenStack limits, no overflow risk - vmResources := map[hv1.ResourceName]resource.Quantity{ - "cpu": *resource.NewQuantity(int64(request.Spec.Data.Flavor.Data.VCPUs), resource.DecimalSI), - "memory": *resource.NewQuantity(int64(request.Spec.Data.Flavor.Data.MemoryMB)*1_000_000, resource.DecimalSI), - } - - now := metav1.NewTime(time.Now()) - - for _, reservation := range matchingReservations { - // Get reservation host - reservationHost := reservation.Spec.TargetHost - if reservationHost == "" { - reservationHost = reservation.Status.Host - } - - // Check if reservation's host is among candidates - if _, isCandidate := candidates[reservationHost]; !isCandidate { - traceLog.Debug("skipping reservation allocation: host not among candidates", - "reservation", reservation.Name, - "host", reservationHost) - continue - } - - // Check if VM is already in allocations - if reservation.Spec.CommittedResourceReservation.Allocations != nil { - if _, exists := reservation.Spec.CommittedResourceReservation.Allocations[instanceUUID]; exists { - traceLog.Debug("VM already in reservation allocations", - "reservation", reservation.Name, - "instanceUUID", instanceUUID) - continue - } - } - - // Add VM to reservation allocations - reservationCopy := reservation.DeepCopy() - if reservationCopy.Spec.CommittedResourceReservation.Allocations == nil { - reservationCopy.Spec.CommittedResourceReservation.Allocations = make(map[string]v1alpha1.CommittedResourceAllocation) - } - 
reservationCopy.Spec.CommittedResourceReservation.Allocations[instanceUUID] = v1alpha1.CommittedResourceAllocation{ - CreationTimestamp: now, - Resources: vmResources, - } - - if err := s.Client.Update(context.Background(), reservationCopy); err != nil { - traceLog.Warn("failed to update reservation with VM allocation", - "reservation", reservation.Name, - "instanceUUID", instanceUUID, - "error", err) - // Continue with other reservations - this is best-effort - } else { - traceLog.Info("added VM to CR reservation spec allocations", - "reservation", reservation.Name, - "instanceUUID", instanceUUID, - "host", reservationHost) - } - } -} - -func init() { - Index["filter_committed_resource_bookkeeping"] = func() NovaFilter { return &FilterCommittedResourceBookkeeping{} } -} diff --git a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go index 194270b19..d26c7c940 100644 --- a/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go +++ b/internal/scheduling/nova/plugins/filters/filter_has_enough_capacity.go @@ -329,7 +329,6 @@ func (s *FilterHasEnoughCapacity) Run(traceLog *slog.Logger, request api.Externa ) } } - return result, nil } From f2d75b27ad6079b6f893a6a61f5e74173893bbbe Mon Sep 17 00:00:00 2001 From: mblos Date: Wed, 1 Apr 2026 15:47:54 +0200 Subject: [PATCH 7/7] update docs --- .../committed-resource-reservations.md | 138 ++++++++++-------- 1 file changed, 80 insertions(+), 58 deletions(-) diff --git a/docs/reservations/committed-resource-reservations.md b/docs/reservations/committed-resource-reservations.md index fe4b17cc0..5256d7904 100644 --- a/docs/reservations/committed-resource-reservations.md +++ b/docs/reservations/committed-resource-reservations.md @@ -1,54 +1,39 @@ # Committed Resource Reservation System -The committed resource reservation system manages capacity commitments, i.e. strict reservation guarantees usable by projects. 
+The committed resource (CR) reservation system manages capacity commitments, i.e. strict reservation guarantees. When customers pre-commit to resource usage, Cortex reserves capacity on hypervisors to guarantee availability. -The system integrates with Limes (via the LIQUID protocol) to receive commitments, expose usage and capacity data, and provides acceptance/rejection feedback. - -## File Structure - -```text -internal/scheduling/reservations/commitments/ -├── config.go # Configuration (intervals, API flags, secrets) -├── controller.go # Reconciliation of reservations -├── syncer.go # Periodic sync task with Limes, ensures local state matches Limes' commitments -├── reservation_manager.go # Reservation CRUD operations -├── api.go # HTTP API initialization -├── api_change_commitments.go # Handle commitment changes from Limes and updates local reservations accordingly -├── api_report_usage.go # Report VM usage per project, accounting to commitments or PAYG -├── api_report_capacity.go # Report capacity per AZ -├── api_info.go # Readiness endpoint with versioning (of underlying flavor group configuration) -├── capacity.go # Capacity calculation from Hypervisor CRDs -├── usage.go # VM-to-commitment assignment logic -├── flavor_group_eligibility.go # Validates VMs belong to correct flavor groups -└── state.go # Commitment state helper functions -``` +Cortex receives commitments, exposes usage and capacity data, and provides acceptance/rejection via APIs. + +## Implementation + +The CR reservation implementation is located in `internal/scheduling/reservations/commitments/`. 
Key components include: +- Controller logic (`controller.go`) +- API endpoints (`api_*.go`) +- Capacity and usage calculation logic (`capacity.go`, `usage.go`) +- Syncer for periodic state sync (`syncer.go`) -## Operations +## Configuration and Observability -### Configuration +**Configuration**: Helm values for intervals, API flags, and pipeline configuration are defined in `helm/bundles/cortex-nova/values.yaml`. Key configuration includes: +- API endpoint toggles (change-commitments, report-usage, report-capacity) +- Reconciliation intervals (grace period, active monitoring) +- Scheduling pipeline selection per flavor group -| Helm Value | Description | -|------------|-------------| -| `committedResourceEnableChangeCommitmentsAPI` | Enable/disable the change-commitments endpoint | -| `committedResourceEnableReportUsageAPI` | Enable/disable the usage reporting endpoint | -| `committedResourceEnableReportCapacityAPI` | Enable/disable the capacity reporting endpoint | -| `committedResourceRequeueIntervalActive` | How often to revalidate active reservations | -| `committedResourceRequeueIntervalRetry` | Retry interval when knowledge not ready | -| `committedResourceChangeAPIWatchReservationsTimeout` | Timeout waiting for reservations to become ready while processing commitment changes via API | -| `committedResourcePipelineDefault` | Default scheduling pipeline | -| `committedResourceFlavorGroupPipelines` | Map of flavor group to pipeline name | -| `committedResourceSyncInterval` | How often the syncer reconciles Limes commitments to Reservation CRDs | +**Metrics and Alerts**: Defined in `helm/bundles/cortex-nova/alerts/nova.alerts.yaml` with prefixes: +- `cortex_committed_resource_change_api_*` +- `cortex_committed_resource_usage_api_*` +- `cortex_committed_resource_capacity_api_*` -Each API endpoint can be disabled independently. 
The periodic sync task can be disabled by removing it (`commitments-sync-task`) from the list of enabled tasks in the `cortex-nova` Helm chart. +## Lifecycle Management -### Observability +### State (CRDs) +Defined in `api/v1/reservations_types.go`, which contains definitions for CR reservations and failover reservations (see [./failover-reservations.md](./failover-reservations.md)). -Alerts and metrics are defined in `helm/bundles/cortex-nova/alerts/nova.alerts.yaml`. Key metric prefixes: -- `cortex_committed_resource_change_api_*` - Change API metrics -- `cortex_committed_resource_usage_api_*` - Usage API metrics -- `cortex_committed_resource_capacity_api_*` - Capacity API metrics +A reservation CRD represents a single reservation slot on a hypervisor, which holds multiple VMs. +A single CR entry typically refers to multiple reservation CRDs (slots). -## Architecture Overview + +### CR Reservation Lifecycle ```mermaid flowchart LR @@ -56,47 +41,84 @@ flowchart LR Res[(Reservation CRDs)] end - ChangeAPI[Change API] - UsageAPI[Usage API] Syncer[Syncer Task] + ChangeAPI[Change API] + CapacityAPI[Capacity API] Controller[Controller] + UsageAPI[Usage API] Scheduler[Scheduler API] ChangeAPI -->|CRUD| Res Syncer -->|CRUD| Res UsageAPI -->|read| Res + CapacityAPI -->|read| Res + CapacityAPI -->|capacity request| Scheduler Res -->|watch| Controller Controller -->|update spec/status| Res - Controller -->|placement request| Scheduler + Controller -->|reservation placement request| Scheduler +``` + +Reservations are managed through the Change API, Syncer Task, and Controller reconciliation. 
+

| Component | Event | Timing | Action |
|-----------|-------|--------|--------|
| **Change API / Syncer** | CR Create, Resize, Delete | Immediate/Hourly | Create/update/delete Reservation CRDs |
| **Controller** | Placement | On creation | Find host via scheduler API, set `TargetHost` |
| **Controller** | Optimize unused slots | >> minutes | Assign PAYG VMs or re-place reservations |

### VM Lifecycle

VM allocations are tracked within reservations:

```mermaid
flowchart LR
    subgraph State
        Res[(Reservation CRDs)]
    end
    A[Nova Scheduler] -->|VM Create/Migrate/Resize| B[Scheduling Pipeline]
    B -->|update Spec.Allocations| Res
    Res -->|watch| Controller
    Res -->|>>min reconcile| Controller
    Controller -->|update spec/status| Res
    Controller --> E{Verify allocations}
```

| Component | Event | Timing | Action |
|-----------|-------|--------|--------|
| **Scheduling Pipeline** | Placement call for: VM Create, Migrate, Resize | Immediate | Update VM in `Spec.Allocations` |
| **Controller** | Reservation CRD update: `Status`/`Spec` `.Allocations` | 1 min | Verify via Nova API and Hypervisor CRD; update `Status`/`Spec` `.Allocations` |
| **Controller** | Regular VM lifecycle check (VM off, deletion); maybe watch Hypervisor CRD VMs | >> min | Verify allocations |

-Reservations are managed through the Change API, Syncer Task, and Controller reconciliation. The Usage API provides read-only access to report usage data back to Limes.
+**Allocation States**:
+- `Spec.Allocations` — Expected VMs (from scheduling events)
+- `Status.Allocations` — Confirmed VMs (verified on host)
+
+**Note**: VM allocations may not consume all resources of a reservation slot. A reservation with 128 GB may have VMs totaling only 96 GB if that's what fits the project's needs. Allocations may exceed reservation capacity (e.g., after VM resize).

### Change-Commitments API

-The change-commitments API receives batched commitment changes from Limes. 
A request can contain multiple commitment changes across different projects and flavor groups. The semantic is **all-or-nothing**: if any commitment in the batch cannot be fulfilled (e.g., insufficient capacity), the entire request is rejected and rolled back.
+The change-commitments API receives batched commitment changes from Limes and manages reservations accordingly.
+
+**Request Semantics**: A request can contain multiple commitment changes across different projects and flavor groups. The semantic is **all-or-nothing** — if any commitment in the batch cannot be fulfilled (e.g., insufficient capacity), the entire request is rejected and rolled back.

-Cortex performs CRUD operations on local Reservation CRDs to match the new desired state:
+**Operations**: Cortex performs CRUD operations on local Reservation CRDs to match the new desired state:
- Creates new reservations for increased commitment amounts
-- Deletes existing reservations
+- Deletes existing reservations for decreased commitments
-- Cortex preserves existing reservations that already have VMs allocated when possible
+- Preserves existing reservations that already have VMs allocated when possible

### Syncer Task

-The syncer task runs periodically and fetches all commitments from Limes. It syncs the local Reservation CRD state to match Limes' view of commitments.
+The syncer task runs periodically and fetches all commitments from Limes. It syncs the local Reservation CRD state to match Limes' view of commitments. Theoretically, this should find no differences between local state and Limes state.

### Controller (Reconciliation)

-The controller watches Reservation CRDs and performs reconciliation:
+The controller watches Reservation CRDs and performs two types of reconciliation:

-1. 
**For new reservations** (no target host assigned):
-   - Calls Cortex for scheduling to find a suitable host
-   - Assigns the target host and marks the reservation as Ready
+**Placement** - Finds hosts for new reservations (calls scheduler API)

-2. **For existing reservations** (already have a target host):
-   - Validates that allocated VMs are still on the expected host
-   - Updates allocations if VMs have migrated or been deleted
-   - Requeues for periodic revalidation
+**Allocation Verification** - Tracks VM lifecycle on reservations:
+- New VMs (< 15min): Checked via Nova API every 1 minute
+- Established VMs: Checked via Hypervisor CRD every 5 min to 1 hour
+- Missing VMs: Removed after verification

### Usage API