Skip to content

Commit b0fa6a8

Browse files
Merge pull request #429 from chronicleworks/feature/backport-liveness
* Introduce integrated liveness check
2 parents 37a4466 + 434ddf6 commit b0fa6a8

File tree

17 files changed

+865
-294
lines changed

17 files changed

+865
-294
lines changed

Cargo.lock

Lines changed: 488 additions & 168 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,8 @@ prost = "0.10" # common, sawtooth-protocol, sawtooth-tp: version = "0.10.0"
9898
prost-build = "0.10.0"
9999
prost-types = "0.11.2"
100100
protobuf = "2.27.1"
101+
metrics = "0.21.0"
102+
metrics-exporter-prometheus = "0.12.1"
101103
question = "0.2.2"
102104
r2d2 = "0.8.9"
103105
rand = { version = "0.8.5", features = ["getrandom"] }
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
apiVersion: v1
2+
kind: ConfigMap
3+
metadata:
4+
name: {{ include "common.names.fullname" . }}-scripts
5+
data:
6+
first_depth_charge.sh: |
7+
#!/bin/bash
8+
metrics=$(curl -s http://localhost:9000/metrics)
9+
count=$(echo "$metrics" | grep '^depth_charge_round_trip_count' | awk '{print $2}')
10+
if [[ -z "$count" ]] || [[ $count -eq 0 ]]; then
11+
exit 1
12+
fi
13+
check_timeouts.sh: |
14+
#!/bin/bash
15+
metrics=$(curl -s http://localhost:9000/metrics)
16+
timeouts=$(echo "$metrics" | grep '^depth_charge_timeouts' | awk '{print $2}' | tr -d '\r')
17+
if [[ "$timeouts" =~ ^[0-9]+$ ]] && [[ "$timeouts" -ne 0 ]]; then
18+
echo "Non-zero depth_charge_timeouts detected: $timeouts"
19+
exit 1
20+
else
21+
echo "No non-zero depth_charge_timeouts detected."
22+
exit 0
23+
fi

charts/chronicle/templates/statefulset.yaml

Lines changed: 31 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,16 @@ spec:
123123
{{- end }}
124124
serve-api \
125125
--interface 0.0.0.0:{{ .Values.port }} \
126+
{{- if .Values.livenessProbe.enabled }}
127+
--liveness-interval {{ .Values.livenessProbe.periodSeconds }} \
128+
--liveness-deadline {{ .Values.livenessProbe.timeoutSeconds }} \
129+
{{- end }}
126130
{{- if .Values.auth.required }}
127131
--require-auth \
128132
{{- end }}
129133
{{ include "chronicle.jwks-url.cli" . }}
130134
{{ include "chronicle.userinfo-url.cli" . }}
131-
{{ include "chronicle.id-claims" . }}
132-
;
135+
{{ include "chronicle.id-claims" . }};
133136
env: {{ include "lib.safeToYaml" .Values.env | nindent 12 }}
134137
- name: RUST_LOG
135138
value: {{ .Values.logLevel }}
@@ -144,6 +147,26 @@ spec:
144147
{{- end }}
145148
{{- include "lib.safeToYaml" .Values.postgres.env | nindent 12 }}
146149
resources: {{- include "lib.safeToYaml" .Values.resources | nindent 12 }}
150+
{{- if .Values.livenessProbe.enabled }}
151+
livenessProbe:
152+
exec:
153+
command:
154+
- /bin/bash
155+
- /scripts/check_timeouts.sh
156+
initialDelaySeconds: 1
157+
periodSeconds: 1
158+
failureThreshold: 1
159+
{{- end}}
160+
{{- if .Values.readinessProbe.enabled }}
161+
readinessProbe:
162+
exec:
163+
command:
164+
- /bin/bash
165+
- /scripts/first_depth_charge.sh
166+
initialDelaySeconds: 1
167+
periodSeconds: 1
168+
failureThreshold: 600
169+
{{- end}}
147170
volumeMounts:
148171
- name: chronicle-config
149172
mountPath: /etc/chronicle/config/
@@ -152,107 +175,10 @@ spec:
152175
readOnly: true
153176
- name: chronicle-data
154177
mountPath: /var/lib/chronicle/store/
178+
- name: check-metrics-available
179+
mountPath: /scripts/
180+
readOnly: true
155181
{{- include "lib.volumeMounts" .Values.extraVolumeMounts | nindent 12 }}
156-
{{- if .Values.livenessProbe.enabled }}
157-
livenessProbe:
158-
exec:
159-
command:
160-
- bash
161-
- -c
162-
- |
163-
PROBE_ID="startup_$(LC_ALL=C tr -dc A-Za-z0-9 </dev/urandom | head -c 13)" &&
164-
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") &&
165-
echo '[
166-
{
167-
"@id": "_:n1",
168-
"@type": [
169-
"http://btp.works/chronicleoperations/ns#ActivityExists"
170-
],
171-
"http://btp.works/chronicleoperations/ns#activityName": [
172-
{
173-
"@value": "'"$PROBE_ID"'"
174-
}
175-
],
176-
"http://btp.works/chronicleoperations/ns#namespaceName": [
177-
{
178-
"@value": "{{ .Values.livenessProbe.namespaceName }}"
179-
}
180-
],
181-
"http://btp.works/chronicleoperations/ns#namespaceUuid": [
182-
{
183-
"@value": "{{ .Values.livenessProbe.namespaceUuid }}"
184-
}
185-
]
186-
}
187-
]' > /tmp/import.json &&
188-
echo "Probe ID: $PROBE_ID" &&
189-
chronicle \
190-
-c /etc/chronicle/config/config.toml \
191-
--console-logging json \
192-
--sawtooth tcp://{{ include "chronicle.sawtooth.service" . }}:{{ include "chronicle.sawtooth.sawcomp" . }} \
193-
--remote-database \
194-
--database-name {{ .Values.postgres.database }} \
195-
--database-username {{ .Values.postgres.user }} \
196-
--database-host {{ .Values.postgres.host }} \
197-
{{- if not .Values.opa.enabled }}
198-
--embedded-opa-policy \
199-
{{- end }}
200-
import {{ .Values.livenessProbe.namespaceName }} {{ .Values.livenessProbe.namespaceUuid }} < /tmp/import.json
201-
initialDelaySeconds: {{ .Values.livenessProbe.initialDelaySeconds }}
202-
periodSeconds: {{ .Values.livenessProbe.periodSeconds }}
203-
timeoutSeconds: {{ .Values.livenessProbe.timeoutSeconds }}
204-
failureThreshold: {{ .Values.livenessProbe.failureThreshold }}
205-
{{- end }}
206-
{{- if .Values.startUpProbe.enabled }}
207-
startupProbe:
208-
exec:
209-
command:
210-
- bash
211-
- -c
212-
- |
213-
PROBE_ID="startup_$(LC_ALL=C tr -dc A-Za-z0-9 </dev/urandom | head -c 13)" &&
214-
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ") &&
215-
echo '[
216-
{
217-
"@id": "_:n1",
218-
"@type": [
219-
"http://btp.works/chronicleoperations/ns#ActivityExists"
220-
],
221-
"http://btp.works/chronicleoperations/ns#activityName": [
222-
{
223-
"@value": "'"$PROBE_ID"'"
224-
}
225-
],
226-
"http://btp.works/chronicleoperations/ns#namespaceName": [
227-
{
228-
"@value": "{{ .Values.startUpProbe.namespaceName }}"
229-
}
230-
],
231-
"http://btp.works/chronicleoperations/ns#namespaceUuid": [
232-
{
233-
"@value": "{{ .Values.startUpProbe.namespaceUuid }}"
234-
}
235-
]
236-
}
237-
]' > /tmp/import.json &&
238-
echo "Probe ID: $PROBE_ID" &&
239-
chronicle \
240-
-c /etc/chronicle/config/config.toml \
241-
--console-logging json \
242-
--sawtooth tcp://{{ include "chronicle.sawtooth.service" . }}:{{ include "chronicle.sawtooth.sawcomp" . }} \
243-
--remote-database \
244-
--database-name {{ .Values.postgres.database }} \
245-
--database-username {{ .Values.postgres.user }} \
246-
--database-host {{ .Values.postgres.host }} \
247-
{{- if not .Values.opa.enabled }}
248-
--embedded-opa-policy \
249-
{{- end }}
250-
import {{ .Values.startUpProbe.namespaceName }} {{ .Values.startUpProbe.namespaceUuid }} < /tmp/import.json
251-
initialDelaySeconds: {{ .Values.startUpProbe.initialDelaySeconds }}
252-
periodSeconds: {{ .Values.startUpProbe.periodSeconds }}
253-
timeoutSeconds: {{ .Values.startUpProbe.timeoutSeconds }}
254-
failureThreshold: {{ .Values.startUpProbe.failureThreshold }}
255-
{{- end }}
256182
volumes:
257183
- name: chronicle-secrets
258184
persistentVolumeClaim:
@@ -263,6 +189,9 @@ spec:
263189
- name: chronicle-config
264190
configMap:
265191
name: {{ .Release.Name }}-chronicle-config
192+
- name: check-metrics-available
193+
configMap:
194+
name: {{ include "common.names.fullname" . }}-scripts
266195
{{- if not .Values.postgres.persistence.enabled }}
267196
- name: "pgdata"
268197
emptyDir: {}

charts/chronicle/values.yaml

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,15 @@ auth:
2424

2525
## @md | `livenessProbe.enabled` | if true, enables the liveness probe | true |
2626
livenessProbe:
27-
enabled: false
27+
enabled: true
2828
## @md | `livenessProbe.timeoutSeconds` | number of seconds after which the probe times out | 20 |
2929
timeoutSeconds: 20
3030
## @md | `livenessProbe.periodSeconds` | how often (in seconds) to perform the probe | 60 |
3131
periodSeconds: 60
32-
## @md | `livenessProbe.failureThreshold` | when a probe fails, Kubernetes will try failureThreshold times before giving up | 1 |
33-
failureThreshold: 1
34-
## @md | `livenessProbe.namespaceName` | the Chronicle namespace in which the probe operates | default |
35-
namespaceName: default
36-
## @md | `livenessProbe.namespaceUuid` | the UUID of the Chronicle namespace in which the probe operates | fd717fd6-70f1-44c1-81de-287d5e101089 |
37-
namespaceUuid: fd717fd6-70f1-44c1-81de-287d5e101089
32+
33+
## @md | `readinessProbe.enabled` | if true, enables the readiness probe | true |
34+
readinessProbe:
35+
enabled: true
3836

3937
## @md | `startUpProbe.enabled` | if true, enables the startup probe | true |
4038
startUpProbe:

crates/api/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ opentelemetry = { workspace = true }
4040
parking_lot = { workspace = true }
4141
poem = { workspace = true }
4242
r2d2 = { workspace = true }
43+
metrics = { workspace = true }
44+
metrics-exporter-prometheus = { workspace = true }
4345
rand = { workspace = true }
4446
rand_core = { workspace = true }
4547
reqwest = { workspace = true }

crates/api/src/chronicle_graphql/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ use async_graphql::{
55
SimpleObject, Subscription, SubscriptionType,
66
};
77

8-
98
use async_graphql_poem::{
109
GraphQL, GraphQLBatchRequest, GraphQLBatchResponse, GraphQLProtocol, GraphQLSubscription,
1110
GraphQLWebSocket,

0 commit comments

Comments
 (0)