
Commit 6062a2b

Fixed cleaning up namespaces in openshift (#524)
# Summary

This PR fixes cleaning up namespaces in the static OpenShift cluster:

- added resource deletion with finalizer removal on test teardown in e2e.sh, to avoid namespaces getting stuck
- updated the cluster-cleaner image, which most probably stopped working because its Docker image was using a very old kubectl version

Both cluster-cleaner and e2e.sh use custom kubectl scripts to delete namespaces. We should migrate to our reset.go, but that wasn't addressed as part of this PR.

## Proof of Work

(Green) [Manually triggered openshift tests](https://spruce.mongodb.com/version/68ea233322c441000704a94d/tasks?sorts=STATUS%3AASC%3BBASE_STATUS%3ADESC)

## Checklist

- [ ] Have you linked a jira ticket and/or is the ticket in the title?
- [ ] Have you checked whether your jira ticket required DOCSP changes?
- [ ] Have you added a changelog file?
  - use the `skip-changelog` label if not needed
  - refer to the [Changelog files and Release Notes](https://github.com/mongodb/mongodb-kubernetes/blob/master/CONTRIBUTING.md#changelog-files-and-release-notes) section in CONTRIBUTING.md for more details
1 parent 35b2a30 commit 6062a2b
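
The teardown fix follows the standard pattern for unsticking a `Terminating` namespace: delete the custom resources normally, then strip finalizers from whatever survives so the API server can complete the deletion. A minimal sketch of that pattern, as used in the scripts below (the resource type `mdb` and names `my-ns`/`my-resource` are illustrative placeholders):

```sh
# Minimal sketch of the force-deletion pattern this PR adds.
# "mdb", "my-ns" and "my-resource" are illustrative placeholders.

# 1. Try a normal deletion first; a blocking finalizer makes this time out.
kubectl delete mdb --all -n my-ns --wait=true --timeout=10s || true

# 2. Strip finalizers from anything still present, then force-delete it.
kubectl patch mdb my-resource -n my-ns -p '{"metadata":{"finalizers":null}}' --type=merge || true
kubectl delete mdb my-resource -n my-ns --force --grace-period=0 || true
```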

File tree

10 files changed (+54, −142 lines)

docker/cluster-cleaner/Dockerfile

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 FROM python:3-slim-buster
 
-ADD https://storage.googleapis.com/kubernetes-release/release/v1.13.3/bin/linux/amd64/kubectl /usr/bin
+ADD https://dl.k8s.io/release/v1.34.0/bin/linux/amd64/kubectl /usr/bin
 RUN chmod +x /usr/bin/kubectl
 
 COPY scripts/* /
```
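
The old `storage.googleapis.com/kubernetes-release` bucket is a legacy kubectl download location; `dl.k8s.io` is the project's current official endpoint and also serves SHA-256 checksums next to each binary. A possible hardening step, not part of this commit, would be to verify the download at build time, roughly:

```sh
# Hedged sketch: verify the kubectl binary against the checksum that
# dl.k8s.io publishes alongside it (not done in this Dockerfile).
VERSION=v1.34.0
curl -LO "https://dl.k8s.io/release/${VERSION}/bin/linux/amd64/kubectl"
curl -LO "https://dl.k8s.io/release/${VERSION}/bin/linux/amd64/kubectl.sha256"
echo "$(cat kubectl.sha256)  kubectl" | sha256sum --check
```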

docker/cluster-cleaner/Makefile

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,4 +1,4 @@
-IMAGE_VERSION=0.15
+IMAGE_VERSION=0.18
 
 .PHONY: all
 all: build push install
@@ -18,7 +18,7 @@ install: build push
     kubectl create namespace cluster-cleaner || true
     helm template . \
        --set cleanerVersion=$(IMAGE_VERSION) \
-       --set namespace=cluster-cleaner\
+       --set namespace=cluster-cleaner \
        --set cleanerNamespace=cluster-cleaner > cluster-cleaner.yaml
     kubectl apply -f cluster-cleaner.yaml
     rm cluster-cleaner.yaml
```

docker/cluster-cleaner/scripts/clean-failed-namespaces.sh

Lines changed: 15 additions & 8 deletions
```diff
@@ -1,21 +1,24 @@
 #!/usr/bin/env sh
 
+touch error.log
+tail -F error.log &
+
 delete_resources_safely() {
     resource_type="$1"
     namespace="$2"
 
     echo "Attempting normal deletion of $resource_type in $namespace..."
-    kubectl delete "${resource_type}" --all -n "${namespace}" --wait=true --timeout=10s || true
+    kubectl delete "${resource_type}" --all -n "${namespace}" --wait=true --timeout=10s 2>error.log || true
 
     # Check if any resources are still stuck
     # Let's not fail here and continue deletion
-    resources=$(kubectl get "$resource_type" -n "${namespace}" --no-headers -o custom-columns=":metadata.name" 2>/dev/null || true)
+    resources=$(kubectl get "$resource_type" -n "${namespace}" --no-headers -o custom-columns=":metadata.name" 2>error.log || true)
 
     for resource in ${resources}; do
         echo "${resource_type}/${resource} is still present, force deleting..."
 
-        kubectl patch "${resource_type}" "${resource}" -n "${namespace}" -p '{"metadata":{"finalizers":null}}' --type=merge || true
-        kubectl delete "${resource_type}" "${resource}" -n "${namespace}" --force --grace-period=0 || true
+        kubectl patch "${resource_type}" "${resource}" -n "${namespace}" -p '{"metadata":{"finalizers":null}}' --type=merge 2>error.log || true
+        kubectl delete "${resource_type}" "${resource}" -n "${namespace}" --force --grace-period=0 2>error.log || true
     done
 }
@@ -29,11 +32,12 @@ if [ -z ${LABELS+x} ]; then
     exit 1
 fi
 
+
 echo "Deleting namespaces for evg tasks that are older than ${DELETE_OLDER_THAN_AMOUNT} ${DELETE_OLDER_THAN_UNIT} with label ${LABELS}"
 echo "Which are:"
 kubectl get namespace -l "${LABELS}" -o name
-for namespace in $(kubectl get namespace -l "${LABELS}" -o name); do
-    creation_time=$(kubectl get "${namespace}" -o jsonpath='{.metadata.creationTimestamp}' 2>/dev/null || echo "")
+for namespace in $(kubectl get namespace -l "${LABELS}" -o name 2>error.log); do
+    creation_time=$(kubectl get "${namespace}" -o jsonpath='{.metadata.creationTimestamp}' 2>error.log || echo "")
 
     if [ -z "$creation_time" ]; then
         echo "Namespace ${namespace} does not exist or has no creation timestamp, skipping."
@@ -49,14 +53,17 @@ for namespace in $(kubectl get namespace -l "${LABELS}" -o name); do
 
     echo "Deleting ${namespace_name}"
 
-    csrs_in_namespace=$(kubectl get csr -o name | grep "${namespace_name}" || true)
+    csrs_in_namespace=$(kubectl get csr -o name 2>error.log | grep "${namespace_name}" 2>/dev/null || true)
     if [ -n "${csrs_in_namespace}" ]; then
-        kubectl delete "${csrs_in_namespace}"
+        kubectl delete "${csrs_in_namespace}" 2>error.log
     fi
 
     delete_resources_safely "mdb" "${namespace_name}"
     delete_resources_safely "mdbu" "${namespace_name}"
+    delete_resources_safely "mdbc" "${namespace_name}"
+    delete_resources_safely "mdbmc" "${namespace_name}"
     delete_resources_safely "om" "${namespace_name}"
+    delete_resources_safely "clustermongodbroles" "${namespace_name}"
 
     echo "Attempting to delete namespace: ${namespace_name}"
 
```
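
The `touch error.log` / `tail -F error.log &` addition appears to exist so that stderr redirected away from the individual kubectl calls still shows up in the CronJob pod's logs: the background tail streams the file back into stdout. A standalone sketch of that pattern, under that assumption (the demo command and cleanup are not part of the script):

```sh
#!/usr/bin/env sh
# Standalone sketch of the stderr-capture pattern above. The rationale is
# an assumption: keep kubectl noise out of the command flow while still
# surfacing it in the pod logs.
touch error.log
tail -F error.log &        # stream captured stderr back into stdout
tail_pid=$!

kubectl get pods -n no-such-namespace 2>error.log || true   # error lands in the file

sleep 1                    # give tail a moment to print the captured line
kill "${tail_pid}"         # the real script leaves the tail running for its lifetime
```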

docker/cluster-cleaner/scripts/clean-ops-manager.sh

Lines changed: 0 additions & 11 deletions
This file was deleted.

docker/cluster-cleaner/scripts/construction-site.sh

Lines changed: 0 additions & 9 deletions
This file was deleted.

docker/cluster-cleaner/scripts/delete-old-builder-pods.sh

Lines changed: 0 additions & 23 deletions
This file was deleted.

docker/cluster-cleaner/templates/job.yaml

Lines changed: 0 additions & 28 deletions
```diff
@@ -84,34 +84,6 @@ spec:
         - name: LABELS
           value: "evg=task"
 
-# Clean old builder pods
----
-apiVersion: batch/v1
-kind: CronJob
-metadata:
-  name: cluster-cleaner-delete-builder-pods
-  namespace: {{ .Values.cleanerNamespace }}
-spec:
-  # Runs every hour
-  schedule: "0 * * * *"
-  jobTemplate:
-    spec:
-      template:
-        spec:
-          serviceAccountName: cluster-cleaner
-          restartPolicy: Never
-
-          containers:
-          - name: cluster-cleaner
-            image: 268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/cluster-cleaner:{{ .Values.cleanerVersion }}
-            imagePullPolicy: Always
-            command: ["./delete-old-builder-pods.sh"]
-            env:
-            - name: DELETE_OLDER_THAN_UNIT
-              value: "minutes"
-            - name: DELETE_OLDER_THAN_AMOUNT
-              value: "20"
-
 # Clean old certificates
 ---
 apiVersion: batch/v1
```
docker/cluster-cleaner/templates/ops_manager_cleaner_job.yaml

Lines changed: 0 additions & 48 deletions
This file was deleted.

scripts/evergreen/e2e/e2e.sh

Lines changed: 1 addition & 1 deletion
```diff
@@ -123,7 +123,7 @@ echo "TEST_NAME is set to: ${TEST_NAME}"
 
 delete_operator "${NAMESPACE}"
 
-# We'll have the task running for the alloca ted time, minus the time it took us
+# We'll have the task running for the allocated time, minus the time it took us
 # to get all the way here, assuming configuring and deploying the operator can
 # take a bit of time. This is needed because Evergreen kills the process *AND*
 # Docker containers running on the host when it hits a timeout. Under these
```

scripts/funcs/kubernetes

Lines changed: 35 additions & 11 deletions
```diff
@@ -98,15 +98,15 @@ create_image_registries_secret() {
     context=$1
     namespace=$2
     secret_name=$3
-    
+
     # Detect the correct config file path based on container runtime
     local config_file
     local temp_config_file=""
     if command -v podman &> /dev/null && (podman info &> /dev/null || sudo podman info &> /dev/null); then
         # For Podman, use root's auth.json since minikube uses sudo podman
         config_file="/root/.config/containers/auth.json"
         echo "Using Podman config: ${config_file}"
-        
+
         # Create a temporary copy that the current user can read
         temp_config_file=$(mktemp)
         sudo cp "${config_file}" "${temp_config_file}"
@@ -117,7 +117,7 @@ create_image_registries_secret() {
         config_file="${HOME}/.docker/config.json"
         echo "Using Docker config: ${config_file}"
     fi
-    
+
     # shellcheck disable=SC2154
     if kubectl --context "${context}" get namespace "${namespace}"; then
         kubectl --context "${context}" -n "${namespace}" delete secret "${secret_name}" --ignore-not-found
@@ -127,7 +127,7 @@ create_image_registries_secret() {
     else
         echo "Skipping creating pull secret in ${context}/${namespace}. The namespace doesn't exist yet."
     fi
-    
+
     # Clean up temporary file
     if [[ -n "${temp_config_file}" ]] && [[ -f "${temp_config_file}" ]]; then
         rm -f "${temp_config_file}"
@@ -156,6 +156,26 @@ create_image_registries_secret() {
     fi
 }
 
+force_delete_all_resources_from_namespace() {
+    resource_type="$1"
+    namespace="$2"
+
+    echo "Attempting normal deletion of ${resource_type} in ${namespace}..."
+    kubectl delete "${resource_type}" --all -n "${namespace}" --wait=true --timeout=10s || true
+
+    # Check if any resources are still stuck
+    echo "Checking if any resources are still stuck:"
+    kubectl get "${resource_type}" -n "${namespace}" --no-headers -o custom-columns=":metadata.name" || true
+    resources=$(kubectl get "${resource_type}" -n "${namespace}" --no-headers -o custom-columns=":metadata.name" 2>/dev/null || true)
+
+    for resource in ${resources}; do
+        echo "${resource_type}/${resource} is still present, force deleting..."
+
+        kubectl patch "${resource_type}" "${resource}" -n "${namespace}" -p '{"metadata":{"finalizers":null}}' --type=merge || true
+        kubectl delete "${resource_type}" "${resource}" -n "${namespace}" --force --grace-period=0 || true
+    done
+}
+
 reset_namespace() {
     context=$1
     namespace=$2
@@ -166,19 +186,23 @@ reset_namespace() {
 
     set +e
 
-    helm uninstall --kube-context="${context}" mongodb-kubernetes-operator || true &
-    helm uninstall --kube-context="${context}" mongodb-kubernetes-operator-multi-cluster || true &
-
     # Cleans the namespace. Note, that fine-grained cleanup is performed instead of just deleting the namespace as it takes
     # considerably less time
     title "Cleaning Kubernetes resources in context: ${context}"
 
     ensure_namespace "${namespace}"
 
-    kubectl delete --context "${context}" mdb --all -n "${namespace}" || true
-    kubectl delete --context "${context}" mdbu --all -n "${namespace}" || true
-    kubectl delete --context "${context}" mdbmc --all -n "${namespace}" || true
-    kubectl delete --context "${context}" om --all -n "${namespace}" || true
+    force_delete_all_resources_from_namespace "mdb" "${namespace}"
+    force_delete_all_resources_from_namespace "mdbu" "${namespace}"
+    force_delete_all_resources_from_namespace "mdbc" "${namespace}"
+    force_delete_all_resources_from_namespace "mdbmc" "${namespace}"
+    force_delete_all_resources_from_namespace "om" "${namespace}"
+    force_delete_all_resources_from_namespace "clustermongodbroles" "${namespace}"
+
+    echo "Sleeping to allow the operator to perform cleanups"
+    sleep 10
+    helm uninstall --kube-context="${context}" mongodb-kubernetes-operator || true &
+    helm uninstall --kube-context="${context}" mongodb-kubernetes-operator-multi-cluster || true &
 
     # Openshift variant runs all tests sequentially. In order to avoid clashes between tests, we need to wait till
     # the namespace is gone. This trigger OpenShift Project deletion, which is a "Namespace on Steroids" and it takes
```