Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion charts/controlplane-operations/Chart.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
apiVersion: v2
name: controlplane-operations
version: 1.1.3
version: 1.1.5
description: A set of Plutono dashboards and Prometheus alerting rules combined with playbooks to ensure effective operations of Controlplane clusters.
maintainers:
- name: Vladimir Videlov (d051408)
Expand Down
148 changes: 148 additions & 0 deletions charts/controlplane-operations/alerts/controlplane-gardener.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,151 @@ groups:
description: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready. Network datapath threatened!
summary: Calico Node Pod {{`{{ $labels.pod }}`}} on Shoot {{`{{ $labels.shoot_name }}`}} is not Ready.
{{- end }}

### MCM ###

{{- if not (.Values.prometheusRules.disabled.MCMMachineNotReady | default false) }}
# MCMMachineNotReady: fires when the Ready condition metric of a machine, summed per
# name / project / shoot_name / support_group, equals 0 for the configured duration.
# Rendered unless prometheusRules.disabled.MCMMachineNotReady is truthy in the chart values.
# support_group routing: the innermost label_replace matches any metric name and so
# defaults support_group to "containers"; the two outer calls overwrite it with
# "storage" or "compute" when the project label is exactly that value.
# The "for" duration (default 30m) and the severity (default warning) are read from
# prometheusRules.MCMMachineNotReady in the chart values.
# NOTE(review): assumes a sample value of 0 means the Ready condition is not met - confirm against the MCM metrics documentation.
# NOTE(review): the playbook URL has no blob/branch path segment - verify that it resolves.
- alert: MCMMachineNotReady
expr: >
sum by (name, project, shoot_name, support_group) (
label_replace(
label_replace(
label_replace(
mcm_machine_status_condition{
job="machine-controller-manager",
condition="Ready"
},
"support_group", "containers", "__name__", ".*"
),
"support_group", "storage", "project", "^storage$"
),
"support_group", "compute", "project", "^compute$"
)
) == 0
for: {{ dig "MCMMachineNotReady" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineNotReady" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineNotReady.md
service: gardener
support_group: "{{`{{ $labels.support_group }}`}}"
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is not Ready. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is not Ready.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.MCMMachineStuckInTerminating | default false) }}
# MCMMachineStuckInTerminating: fires when the current status phase metric of a machine,
# summed per name / project / shoot_name / support_group, equals -4 for the configured duration.
# Rendered unless prometheusRules.disabled.MCMMachineStuckInTerminating is truthy in the chart values.
# support_group routing: the innermost label_replace matches any metric name and so
# defaults support_group to "containers"; the two outer calls overwrite it with
# "storage" or "compute" when the project label is exactly that value.
# The "for" duration (default 30m) and the severity (default warning) are read from
# prometheusRules.MCMMachineStuckInTerminating in the chart values.
# NOTE(review): presumably -4 is the phase code for Terminating - confirm against the MCM metrics documentation.
# NOTE(review): the playbook URL has no blob/branch path segment - verify that it resolves.
- alert: MCMMachineStuckInTerminating
expr: >
sum by (name, project, shoot_name, support_group) (
label_replace(
label_replace(
label_replace(
mcm_machine_current_status_phase{
job="machine-controller-manager",
},
"support_group", "containers", "__name__", ".*"
),
"support_group", "storage", "project", "^storage$"
),
"support_group", "compute", "project", "^compute$"
)
) == -4
for: {{ dig "MCMMachineStuckInTerminating" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineStuckInTerminating" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineStuckInTerminating.md
service: gardener
support_group: "{{`{{ $labels.support_group }}`}}"
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Terminating state. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Terminating state.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.MCMMachineFailed | default false) }}
# MCMMachineFailed: fires when the current status phase metric of a machine,
# summed per name / project / shoot_name / support_group, equals -3 for the configured duration.
# Rendered unless prometheusRules.disabled.MCMMachineFailed is truthy in the chart values.
# support_group routing: the innermost label_replace matches any metric name and so
# defaults support_group to "containers"; the two outer calls overwrite it with
# "storage" or "compute" when the project label is exactly that value.
# The "for" duration (default 30m) and the severity (default warning) are read from
# prometheusRules.MCMMachineFailed in the chart values.
# NOTE(review): presumably -3 is the phase code for Failed - confirm against the MCM metrics documentation.
# NOTE(review): the playbook URL has no blob/branch path segment - verify that it resolves.
- alert: MCMMachineFailed
expr: >
sum by (name, project, shoot_name, support_group) (
label_replace(
label_replace(
label_replace(
mcm_machine_current_status_phase{
job="machine-controller-manager",
},
"support_group", "containers", "__name__", ".*"
),
"support_group", "storage", "project", "^storage$"
),
"support_group", "compute", "project", "^compute$"
)
) == -3
for: {{ dig "MCMMachineFailed" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineFailed" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineFailed.md
service: gardener
support_group: "{{`{{ $labels.support_group }}`}}"
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in Failed state. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in Failed state.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.MCMMachineCrashLoopBackOff | default false) }}
# MCMMachineCrashLoopBackOff: fires when the current status phase metric of a machine,
# summed per name / project / shoot_name / support_group, equals -2 for the configured duration.
# Rendered unless prometheusRules.disabled.MCMMachineCrashLoopBackOff is truthy in the chart values.
# support_group routing: the innermost label_replace matches any metric name and so
# defaults support_group to "containers"; the two outer calls overwrite it with
# "storage" or "compute" when the project label is exactly that value.
# The "for" duration (default 30m) and the severity (default warning) are read from
# prometheusRules.MCMMachineCrashLoopBackOff in the chart values.
# NOTE(review): presumably -2 is the phase code for CrashLoopBackOff - confirm against the MCM metrics documentation.
# NOTE(review): the playbook URL has no blob/branch path segment - verify that it resolves.
- alert: MCMMachineCrashLoopBackOff
expr: >
sum by (name, project, shoot_name, support_group) (
label_replace(
label_replace(
label_replace(
mcm_machine_current_status_phase{
job="machine-controller-manager",
},
"support_group", "containers", "__name__", ".*"
),
"support_group", "storage", "project", "^storage$"
),
"support_group", "compute", "project", "^compute$"
)
) == -2
for: {{ dig "MCMMachineCrashLoopBackOff" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineCrashLoopBackOff" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineCrashLoopBackOff.md
service: gardener
support_group: "{{`{{ $labels.support_group }}`}}"
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in CrashLoopBackOff state. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is in CrashLoopBackOff state.
{{- end }}

{{- if not (.Values.prometheusRules.disabled.MCMMachineStuckInPending | default false) }}
# MCMMachineStuckInPending: fires when the current status phase metric of a machine,
# summed per name / project / shoot_name / support_group, equals 0 for the configured duration.
# Rendered unless prometheusRules.disabled.MCMMachineStuckInPending is truthy in the chart values.
# support_group routing: the innermost label_replace matches any metric name and so
# defaults support_group to "containers"; the two outer calls overwrite it with
# "storage" or "compute" when the project label is exactly that value.
# The "for" duration (default 30m) and the severity (default warning) are read from
# prometheusRules.MCMMachineStuckInPending in the chart values.
# NOTE(review): presumably 0 is the phase code for Pending - confirm against the MCM metrics documentation.
# NOTE(review): the playbook URL has no blob/branch path segment - verify that it resolves.
- alert: MCMMachineStuckInPending
expr: >
sum by (name, project, shoot_name, support_group) (
label_replace(
label_replace(
label_replace(
mcm_machine_current_status_phase{
job="machine-controller-manager",
},
"support_group", "containers", "__name__", ".*"
),
"support_group", "storage", "project", "^storage$"
),
"support_group", "compute", "project", "^compute$"
)
) == 0
for: {{ dig "MCMMachineStuckInPending" "for" "30m" .Values.prometheusRules }}
labels:
{{ include "controlplane-operations.additionalRuleLabels" . }}
severity: {{ dig "MCMMachineStuckInPending" "severity" "warning" .Values.prometheusRules }}
playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/MCMMachineStuckInPending.md
service: gardener
support_group: "{{`{{ $labels.support_group }}`}}"
annotations:
description: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Pending state. Check the Machine's conditions and events for more details.
summary: Machine {{`{{ $labels.name }}`}} from shoot {{`{{ $labels.shoot_name }}`}} of project {{`{{ $labels.project }}`}} is stuck in Pending state.
{{- end }}
4 changes: 2 additions & 2 deletions charts/controlplane-operations/plugindefinition.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ kind: PluginDefinition
metadata:
name: controlplane-operations
spec:
version: 1.1.3
version: 1.1.5
displayName: Controlplane operations bundle
description: Operations bundle for Controlplane clusters
docMarkDownUrl: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/README.md
icon: https://raw.githubusercontent.com/cloudoperators/controlplane-operations/main/charts/controlplane-operations/kubernetes-logo.png
helmChart:
name: controlplane-operations
repository: oci://ghcr.io/cloudoperators/controlplane-operations/charts
version: 1.1.3
version: 1.1.5
options:
- name: prometheusRules.create
description: Create Prometheus rules
Expand Down
Loading