feat: add database resource limit fault #336

base: main
Changes from 2 commits
This file was deleted.

This file was deleted.

Large diffs are not rendered by default.
New file (one line):

```
../roles
```
Modified file:

```diff
@@ -1,4 +1,13 @@
 ---
+- name: Low Mem CPU Constraints
+  ansible.builtin.import_tasks:
+    file: inject_low_mem_cpu_constraints.yaml
+  vars:
+    spec: "{{ fault.custom.misconfigured_service_port }}"
+  when:
+    - fault.custom.name == 'low-mem-cpu-constraints'
```
Suggested change:

```diff
-    - fault.custom.name == 'low-mem-cpu-constraints'
+    - fault.custom.name == 'low_resource_limits'
```
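For context, this import only fires when a scenario defines a matching custom fault. A minimal sketch of that shape, assuming the suggested `low_resource_limits` name and the `misconfigured_service_port` key the `vars` line currently reads; the namespace/workloads layout follows what the tasks file below consumes. This snippet is illustrative only and is not part of the PR:

```yaml
# Hypothetical scenario entry (illustration only, not in this PR).
# The task's `when` clause matches on custom.name, and its `vars` line passes
# the misconfigured_service_port key as the spec consumed by the tasks file.
faults:
  - custom:
      name: low_resource_limits
      misconfigured_service_port:
        namespace:
          name: otel-demo
        workloads:
          - kind: Deployment
            name: frontend
```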
New file:

```yaml
---
- name: Create ResourceQuota with hard memory limits
  kubernetes.core.k8s:
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    resource_definition:
      apiVersion: v1
      kind: ResourceQuota
      metadata:
        name: "{{ spec.namespace.name }}-memory"
        namespace: "{{ spec.namespace.name }}"
      spec:
        hard:
          requests.memory: 1Gi
          limits.memory: 2Gi
          requests.cpu: "1"
          limits.cpu: "2"
          pods: "10"
    state: present

- name: Retrieve workload replica information
  kubernetes.core.k8s_info:
    api_version: apps/v1
    kind: "{{ workload.kind }}"
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: "{{ workload.name }}"
    namespace: "{{ spec.namespace.name }}"
  register: faults_workloads_info
  loop: "{{ spec.workloads }}"
  loop_control:
    label: "{{ workload.kind | lower }}/{{ workload.name }}"
    loop_var: workload
  when:
    - workload.name != 'otel-collector'

- name: Scale down workloads to 0 replicas
  kubernetes.core.k8s_scale:
    api_version: "{{ result.resources[0].apiVersion }}"
    kind: "{{ result.resources[0].kind }}"
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: "{{ result.resources[0].metadata.name }}"
    namespace: "{{ result.resources[0].metadata.namespace }}"
    replicas: 0
    wait: true
    wait_timeout: 60
  loop: "{{ faults_workloads_info.results }}"
  loop_control:
    label: "{{ result.resources[0].kind | lower }}/{{ result.resources[0].metadata.name }}"
    loop_var: result
  when:
    - faults_workloads_info is defined
    - result.resources | length == 1

- name: Scale up workloads to original replica count
  kubernetes.core.k8s_scale:
    api_version: "{{ result.resources[0].apiVersion }}"
    kind: "{{ result.resources[0].kind }}"
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: "{{ result.resources[0].metadata.name }}"
    namespace: "{{ result.resources[0].metadata.namespace }}"
    replicas: "{{ result.resources[0].spec.replicas }}"
    wait: false
  loop: "{{ faults_workloads_info.results }}"
  loop_control:
    label: "{{ result.resources[0].kind | lower }}/{{ result.resources[0].metadata.name }}"
    loop_var: result
  when:
    - faults_workloads_info is defined
    - result.resources | length == 1
```
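Scaling each workload to zero and back up presumably forces the pods to be recreated so the new quota applies at admission time. A hypothetical follow-up check, not part of this PR, that reads the quota back using the same variables as above:

```yaml
# Hypothetical verification task (illustration only, not in this PR):
# read the ResourceQuota back and fail if it was not admitted.
# quota_info.resources[0].status.used would show consumption once pods return.
- name: Verify the ResourceQuota exists
  kubernetes.core.k8s_info:
    api_version: v1
    kind: ResourceQuota
    kubeconfig: "{{ faults_cluster.kubeconfig }}"
    name: "{{ spec.namespace.name }}-memory"
    namespace: "{{ spec.namespace.name }}"
  register: quota_info
  failed_when: quota_info.resources | length == 0
```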
New file:

```yaml
---
fault: []
alerts:
  - id: CPUSpend
    group_id: otel-demo-namespace-1
    metadata:
      description: CPU spend increased by 20 percent
  - id: MemorySpend
    group_id: otel-demo-namespace-1
    metadata:
      description: Memory spend increased by 20 percent
groups:
  - id: otel-demo-namespace-1
    kind: Namespace
    name: otel-demo
    namespace: otel-demo
    root_cause: true
    aliases:
      - - otel-demo-namespace-1
propagations: []
recommended_actions:
  - solution:
      id: "no_action"
      actions:
        - no changes are needed in the application
        - update the opencost alert to prevent false alerts
```
New file:

```yaml
---
metadata:
  complexity: Low
  id: 32
  name: Low Mem CPU Constraints
  platform: kubernetes
spec:
  environment:
    applications:
      otel_demo:
        enabled: true
    tools:
      category: sre
      selected:
        - kubernetes-topology-monitor
  faults:
    - custom:
        name: misconfigured-resource-quota
        misconfigured_network_policy:
          workload:
            kind: Deployment
            name: frontend
            namespace: "{{ applications_helm_releases.otel_demo.namespace }}"
```
Not sure why these files were deleted? @VincentCCandela
It may have been deleted when I ran `ansible-lint roles/faults roles/incidents --fix` to address the linting issues. I did not intentionally delete it. Same with sre/ansible.cfg.