diff --git a/Tiltfile b/Tiltfile index 6871d18b3..cca91740c 100644 --- a/Tiltfile +++ b/Tiltfile @@ -50,6 +50,11 @@ helm_repo( 'https://prometheus-community.github.io/helm-charts', labels=['Repositories'], ) +helm_repo( + 'perses', + 'https://perses.github.io/helm-charts', + labels=['Repositories'], +) ########### Certmanager # Certmanager is required for the validating webhooks in the cortex bundles, so @@ -282,3 +287,20 @@ k8s_resource('cortex-plutono', port_forwards=[ ], links=[ link('http://localhost:5000/d/cortex/cortex?orgId=1', 'cortex dashboard'), ], labels=['Monitoring']) + +helm_resource( + 'cortex-perses', + 'perses/perses', + flags=['--values=./tools/perses/values.yaml'], + port_forwards=[port_forward(5080, 8080, name='perses')], + links=[link('http://localhost:5080', 'perses dashboard')], + labels=['Monitoring'], + resource_deps=['perses'], +) +watch_file('./tools/perses/dashboards') +k8s_yaml(local(' '.join([ + 'kubectl create configmap cortex-perses-dashboards', + '--from-file=./tools/perses/dashboards/', + '--dry-run=client -o yaml |', + 'kubectl label --local -f - perses.dev/resource=true --dry-run=client -o yaml', +]))) diff --git a/tools/perses/dashboards/cortex-compute-kvm-overview.json b/tools/perses/dashboards/cortex-compute-kvm-overview.json new file mode 100644 index 000000000..073c98cce --- /dev/null +++ b/tools/perses/dashboards/cortex-compute-kvm-overview.json @@ -0,0 +1,769 @@ +{ + "kind": "Dashboard", + "metadata": { + "name": "cortex_computer_kvm_-_overview", + "createdAt": "2026-03-31T07:59:22.596660897Z", + "updatedAt": "2026-04-01T09:45:18.10325137Z", + "version": 19, + "tags": [ + "compute", + "infrastructure", + "kvm" + ], + "project": "workload-management" + }, + "spec": { + "display": { + "name": "Cortex Compute KVM - Overview" + }, + "panels": { + "1c635e5426614893a17cdb63fcf68abd": { + "kind": "Panel", + "spec": { + "display": { + "name": "CPU" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { +
"mode": "table", + "position": "right", + "size": "small", + "values": [ + "last" + ] + }, + "querySettings": [ + { + "areaOpacity": 1, + "colorMode": "fixed-single", + "colorValue": "#ff0000", + "queryIndex": 0 + }, + { + "areaOpacity": 1, + "colorMode": "fixed-single", + "colorValue": "#fa00ff", + "queryIndex": 1 + }, + { + "areaOpacity": 1, + "colorMode": "fixed-single", + "colorValue": "#006eff", + "queryIndex": 2 + }, + { + "areaOpacity": 1, + "colorMode": "fixed-single", + "colorValue": "#00ff73", + "queryIndex": 3 + } + ], + "visual": { + "areaOpacity": 0.3, + "connectNulls": false, + "display": "line", + "lineStyle": "solid", + "lineWidth": 1.25, + "pointRadius": 2.75, + "stack": "all" + }, + "yAxis": { + "format": { + "shortValues": true, + "unit": "decimal" + }, + "label": "", + "min": 0, + "show": true + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"cpu\", type=\"utilized\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Workload" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"cpu\", type=\"failover\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + 
"seriesNameFormat": "Failover" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"cpu\", type=\"reserved\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Reserved" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"cpu\", type=\"payg\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Pay As You Go" + } + } + } + } + ] + } + }, + "21facb9aac07491caf86f0b2d5cc8e88": { + "kind": "Panel", + "spec": { + "display": {}, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "bytes" + }, + "legendMode": "on", + "thresholds": { + "defaultColor": "#ffffff", + "steps": [] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_total{resource=\"ram\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", 
decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Total" + } + } + } + } + ] + } + }, + "28db47ce8b2e4d24b9952d389a2e2b7d": { + "kind": "Panel", + "spec": { + "display": {}, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "colorMode": "background_solid", + "format": { + "unit": "percent-decimal" + }, + "legendMode": "on", + "thresholds": { + "defaultColor": "#EE6C6C", + "steps": [ + { + "value": 0.2 + }, + { + "color": "#59CC8D", + "value": 0.3 + } + ] + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"ram\", type=\"payg\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host)) /\nsum(max(cortex_kvm_host_capacity_total{resource=\"ram\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Available (%)" + } + } + } + } + ] + } + }, + "637a56a9a10f4a01adc8fb093372e2d9": { + "kind": "Panel", + "spec": { + "display": { + "name": "RAM" + }, + "plugin": { + "kind": "TimeSeriesChart", + "spec": { + "legend": { + "mode": "table", + "position": "right", + "size": "small", + "values": [ + "last" + ] + }, + "querySettings": [ + { + "areaOpacity": 1, + "colorMode": "fixed-single", + "colorValue": "#ff0000", + "queryIndex": 0 + }, 
+ { + "areaOpacity": 1, + "colorMode": "fixed-single", + "colorValue": "#fa00ff", + "queryIndex": 1 + }, + { + "areaOpacity": 1, + "colorMode": "fixed-single", + "colorValue": "#006eff", + "queryIndex": 2 + }, + { + "areaOpacity": 1, + "colorMode": "fixed-single", + "colorValue": "#00ff73", + "queryIndex": 3 + } + ], + "visual": { + "areaOpacity": 0.3, + "connectNulls": false, + "display": "line", + "lineStyle": "solid", + "lineWidth": 1.25, + "pointRadius": 2.75, + "stack": "all" + }, + "yAxis": { + "format": { + "unit": "bytes" + }, + "label": "", + "min": 0, + "show": true + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"ram\", type=\"utilized\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Workload" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"ram\", type=\"failover\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Failover" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"ram\", type=\"reserved\", 
availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Reserved" + } + } + } + }, + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_usage{resource=\"ram\", type=\"payg\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Pay As You Go" + } + } + } + } + ] + } + }, + "7cd4fe639659454e935a59412c679758": { + "kind": "Panel", + "spec": { + "display": {}, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "decimal" + }, + "legendMode": "on", + "thresholds": { + "defaultColor": "#ffffff" + } + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "sum(max(cortex_kvm_host_capacity_total{resource=\"cpu\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Total" + } + } + } + } + ] + } + }, + "86ea3898e16b4dd6af2476179c18ebae": { + "kind": "Panel", + "spec": { + "display": { + "name": "Number of 
Hosts" + }, + "plugin": { + "kind": "StatChart", + "spec": { + "calculation": "last-number", + "format": { + "unit": "decimal" + }, + "legendMode": "auto", + "sparkline": {} + } + }, + "queries": [ + { + "kind": "TimeSeriesQuery", + "spec": { + "plugin": { + "kind": "PrometheusTimeSeriesQuery", + "spec": { + "query": "count(max(cortex_kvm_host_capacity_usage{resource=\"cpu\", type=\"utilized\", availability_zone=~\"$availability_zone\", workload_type=~\"$workload_type\", cpu_architecture=~\"$cpu_architecture\", building_block=~\"$building_block\", compute_host=~\"$compute_host\", enabled=~\"$enabled\", maintenance=~\"$maintenance\", decommissioned=~\"$decommissioned\", external_customer=~\"$external_customer\"}) by (compute_host))", + "seriesNameFormat": "Number of Hosts" + } + } + } + } + ] + } + }, + "9a25fb3bcf30475b8b118e4d54abeeb2": { + "kind": "Panel", + "spec": { + "display": {}, + "plugin": { + "kind": "Markdown", + "spec": { + "text": "