Skip to content
2 changes: 1 addition & 1 deletion sre/dev/remote_cluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ kubectl get pods --all-namespaces
Update the `kubeconfig` path in your global configuration:

```bash
vim ../group_vars/environment/cluster.yaml
vim ../../group_vars/environment/cluster.yaml
```

Set the absolute path where the kubeconfig should be downloaded:
Expand Down
21 changes: 21 additions & 0 deletions sre/roles/faults/meta/argument_specs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ argument_specs:
- misconfigured-service-port
- modify-environment-variables
- unsupported-image
- exhaust-node-resources
required: true
type: str
invalid_command:
Expand Down Expand Up @@ -325,6 +326,26 @@ argument_specs:
default: force
required: false
type: str
# Arguments for the `exhaust-node-resources` custom fault: identifies the
# workload whose hosting node will be loaded by the resource hog.
exhaust_node_resources:
  required: false
  type: dict
  options:
    # Target workload; every field below is mandatory once the dict is given.
    workload:
      required: true
      type: dict
      options:
        # Only these two workload kinds are accepted.
        kind:
          choices:
            - Deployment
            - StatefulSet
          required: true
          type: str
        name:
          required: true
          type: str
        namespace:
          required: true
          type: str
otel_demo:
required: false
type: dict
Expand Down
9 changes: 9 additions & 0 deletions sre/roles/faults/tasks/inject_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@
when:
- fault.custom.name == 'unsupported-image'
- fault.custom.unsupported_image is defined

# Dispatch to the node resource exhaustion injection tasks. The fault's
# argument dict is handed to the imported file as `spec`; the import is
# static, so both conditions below are evaluated for each imported task.
- name: Import exhaust node resources injection tasks
  ansible.builtin.import_tasks:
    file: inject_custom_exhaust_node_resources.yaml
  vars:
    spec: "{{ fault.custom.exhaust_node_resources }}"
  when:
    - fault.custom.name == 'exhaust-node-resources'
    - fault.custom.exhaust_node_resources is defined
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
---
# Look up the pods of the target workload in its namespace, matching on the
# `app.kubernetes.io/name` label, and keep the result for node selection.
- name: Get pods for target workload
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    kind: Pod
    namespace: "{{ spec.workload.namespace }}"
    label_selectors:
      - "app.kubernetes.io/name={{ spec.workload.name }}"
  register: workload_pods

# Fail early with a readable message when the lookup returned no pod that has
# been scheduled; otherwise indexing the result would abort with an opaque
# "undefined variable" error (no match, or first pod still Pending).
- name: Ensure target workload has a scheduled pod
  ansible.builtin.assert:
    that:
      - >-
        workload_pods.resources
        | selectattr('spec.nodeName', 'defined')
        | list
        | length > 0
    fail_msg: >-
      No scheduled pod labeled app.kubernetes.io/name={{ spec.workload.name }}
      was found in namespace {{ spec.workload.namespace }}; cannot pick a node
      for the resource hog.

# Record the node of the first scheduled pod as `target_node`. Pods without a
# `spec.nodeName` (not yet scheduled) are skipped.
- name: Select node hosting target workload
  ansible.builtin.set_fact:
    target_node: >-
      {{
        (workload_pods.resources
         | selectattr('spec.nodeName', 'defined')
         | first).spec.nodeName
      }}

# Patch the selected node with the label that the resource hog's nodeSelector
# matches on, so the hog is scheduled onto this node.
- name: Label the node as target for resource hog
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    state: present
    resource_definition:
      apiVersion: v1
      kind: Node
      metadata:
        name: "{{ target_node }}"
        labels:
          node-role.kubernetes.io/app: "true"

# Create a single-replica stress Deployment in the workload's namespace. The
# nodeSelector restricts it to nodes carrying the `node-role.kubernetes.io/app`
# label, and the container arguments start CPU, I/O and memory stress workers.
- name: Deploy resource hog on target node
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    resource_definition:
      apiVersion: apps/v1
      kind: Deployment
      metadata:
        name: fake-resource-hog
        namespace: "{{ spec.workload.namespace }}"
      spec:
        replicas: 1
        selector:
          matchLabels:
            app: fake-resource-hog
        template:
          metadata:
            labels:
              app: fake-resource-hog
          spec:
            nodeSelector:
              node-role.kubernetes.io/app: "true"
            containers:
              - name: stress-container
                image: progrium/stress
                # Flag/value pairs passed to the stress binary.
                args:
                  - "--cpu"
                  - "8"
                  - "--io"
                  - "4"
                  - "--vm"
                  - "2"
                  - "--vm-bytes"
                  - "512M"
9 changes: 9 additions & 0 deletions sre/roles/faults/tasks/remove_custom.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,12 @@
when:
- fault.custom.name == 'unsupported-image'
- fault.custom.unsupported_image is defined

# Dispatch to the node resource exhaustion removal tasks. The fault's argument
# dict is handed to the imported file as `spec`; the import is static, so both
# conditions below are evaluated for each imported task.
- name: Import exhaust node resources removal tasks
  ansible.builtin.import_tasks:
    file: remove_custom_exhaust_node_resources.yaml
  vars:
    spec: "{{ fault.custom.exhaust_node_resources }}"
  when:
    - fault.custom.name == 'exhaust-node-resources'
    - fault.custom.exhaust_node_resources is defined
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
---
# Delete the stress Deployment created during injection. This has to happen
# explicitly: a nodeSelector is only evaluated when a pod is scheduled, so
# removing the node label alone would leave the running hog pod in place and
# the fault would persist. Deleting an absent Deployment is a no-op.
- name: Delete resource hog deployment
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    api_version: apps/v1
    kind: Deployment
    name: fake-resource-hog
    namespace: "{{ spec.workload.namespace }}"
    state: absent
    wait: true

# Find every node still carrying the label applied during injection.
- name: Get nodes labeled as target for resource hog
  kubernetes.core.k8s_info:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    kind: Node
    label_selectors:
      - node-role.kubernetes.io/app=true
  register: node_list

# Kept so the `target_node` fact is still set as before; skipped when no node
# carries the label.
- name: Select the node labeled for this fault
  ansible.builtin.set_fact:
    target_node: "{{ (node_list.resources | list)[0].metadata.name }}"
  when: node_list.resources | list | length > 0

# Clear the label on all labeled nodes, not just the first, so repeated
# injections cannot leave stale labels behind. A null label value in the patch
# removes the key. An empty node list makes the loop a no-op.
- name: Remove node label from target node
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    resource_definition:
      apiVersion: v1
      kind: Node
      metadata:
        name: "{{ item.metadata.name }}"
        labels:
          node-role.kubernetes.io/app: null
    state: present
  loop: "{{ node_list.resources | list }}"
  loop_control:
    label: "{{ item.metadata.name }}"
51 changes: 51 additions & 0 deletions sre/roles/incidents/files/ground_truths/incident_297.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
---
# Ground truth for incident 297: the `exhaust-node-resources` custom fault
# targeting the `cart` Deployment.
fault:
- entity:
name: cart
group_id: cart
kind: Deployment
type: custom
name: exhaust-node-resources
condition: Node resource exhaustion
category: Change
fault_mechanism: exhaust-node-resources

# Alerts tied to this incident; each `group_id` refers to an entry under
# `groups` below.
alerts:
- id: CPUSpend
group_id: cluster-node-1
metadata:
description: Node CPU usage is high due to resource exhaustion
- id: RequestLatency
group_id: cart
metadata:
description: Latency of cart deployment increases due to CPU resource contention

# Entity groups referenced by `group_id`, `source` and `target` in this file.
# The `filter` entries look like regular expressions over entity names — TODO confirm.
# NOTE(review): `cart` is flagged `root_cause: true` although the propagation
# below flows from `cluster-node-1` to `cart` — confirm this is intended.
# NOTE(review): Kubernetes Nodes are cluster-scoped; confirm the consumer
# expects `namespace: kube-system` on the Node group.
groups:
- id: cart
kind: Deployment
namespace: otel-demo
filter:
- cart.*
root_cause: true
- id: cluster-node-1
kind: Node
namespace: kube-system
filter:
- .*

# Group ids listed together as aliases of one another.
aliases:
- - cart
- cluster-node-1

# How the fault spreads between groups.
# NOTE(review): the RequestLatency alert above attributes the impact to CPU
# contention, while this entry describes pods that cannot be scheduled —
# confirm which effect the scenario is expected to produce.
propagations:
- source: cluster-node-1
target: cart
condition: Node resource pressure prevents new pods from being scheduled
effect: Cart deployment pods remain pending and requests may fail

# Remediation steps for this incident.
recommended_actions:
- solution:
id: rebalance_or_scale
actions:
- Rebalance workloads to another node
- Scale up cluster node capacity
27 changes: 27 additions & 0 deletions sre/roles/incidents/files/specs/incident_297.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Scenario spec for incident 297 (Exhaust Node Resources).
metadata:
complexity: Medium
id: 297
name: Exhaust Node Resources
platform: kubernetes

# Environment to stand up and the fault to inject.
spec:
environment:
applications:
otel_demo:
enabled: true
tools:
category: sre
selected:
- prometheus
- grafana
- jaeger
# NOTE(review): the injection tasks for this fault only use kubernetes.core
# modules; confirm chaos-mesh is actually needed for this scenario.
- chaos-mesh

# Fault definition; `custom.name` selects the injection task file and
# `exhaust_node_resources` is passed to it as `spec`.
faults:
- custom:
name: exhaust-node-resources
exhaust_node_resources:
workload:
# NOTE(review): the injection tasks locate pods by the
# `app.kubernetes.io/name` label and do not read `kind` — confirm.
kind: Deployment
name: cart
# Resolved at runtime from the otel_demo Helm release settings.
namespace: "{{ applications_helm_releases.otel_demo.namespace }}"
Loading