Skip to content

Commit c33c416

Browse files
authored
Merge pull request #1886 from HideoYamauchi/trac5267-4
Mid: storage-mon: daemonize storage_mon to deal with I/O hangs.(Add daemon/client mode)
2 parents 92f1625 + d1cf0b4 commit c33c416

File tree

5 files changed

+869
-138
lines changed

5 files changed

+869
-138
lines changed

configure.ac

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -620,6 +620,7 @@ fi
620620
PKG_CHECK_MODULES([GLIB], [$GPKGNAME])
621621
CPPFLAGS="$CPPFLAGS $GLIB_CFLAGS"
622622
LIBS="$LIBS $GLIB_LIBS"
623+
PKG_CHECK_MODULES([LIBQB], "libqb")
623624

624625
dnl ========================================================================
625626
dnl Headers

heartbeat/storage-mon.in

Lines changed: 183 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -48,20 +48,26 @@
4848
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
4949

5050
#
51-
STORAGEMON=$HA_BIN/storage_mon
52-
ATTRDUP=/usr/sbin/attrd_updater
51+
STORAGEMON=${HA_BIN}/storage_mon
52+
ATTRDUP=${HA_SBIN_DIR}/attrd_updater
53+
PIDFILE=${HA_VARRUN}/storage-mon-${OCF_RESOURCE_INSTANCE}.pid
54+
ATTRNAME="#health-${OCF_RESOURCE_INSTANCE}"
5355

5456
OCF_RESKEY_CRM_meta_interval_default="0"
5557
OCF_RESKEY_io_timeout_default="10"
58+
OCF_RESKEY_check_interval_default="30"
5659
OCF_RESKEY_inject_errors_default=""
5760
OCF_RESKEY_state_file_default="${HA_RSCTMP%%/}/storage-mon-${OCF_RESOURCE_INSTANCE}.state"
61+
OCF_RESKEY_daemonize_default="false"
5862

5963
# Explicitly list all environment variables used, to make static analysis happy
6064
: ${OCF_RESKEY_CRM_meta_interval:=${OCF_RESKEY_CRM_meta_interval_default}}
6165
: ${OCF_RESKEY_drives:=""}
6266
: ${OCF_RESKEY_io_timeout:=${OCF_RESKEY_io_timeout_default}}
67+
: ${OCF_RESKEY_check_interval:=${OCF_RESKEY_check_interval_default}}
6368
: ${OCF_RESKEY_inject_errors:=${OCF_RESKEY_inject_errors_default}}
6469
: ${OCF_RESKEY_state_file:=${OCF_RESKEY_state_file_default}}
70+
: ${OCF_RESKEY_daemonize:=${OCF_RESKEY_daemonize_default}}
6571

6672
#######################################################################
6773

@@ -106,6 +112,14 @@ Specify disk I/O timeout in seconds. Minimum 1, recommended 10 (default).
106112
<content type="integer" default="${OCF_RESKEY_io_timeout_default}" />
107113
</parameter>
108114
115+
<parameter name="check_interval" unique="0">
116+
<longdesc lang="en">
117+
Specify interval between I/O checks in seconds. (Only supported with the daemonize option.)
118+
</longdesc>
119+
<shortdesc lang="en">I/O check interval</shortdesc>
120+
<content type="integer" default="${OCF_RESKEY_check_interval_default}" />
121+
</parameter>
122+
109123
<parameter name="inject_errors" unique="0">
110124
<longdesc lang="en">
111125
Used only for testing! Specify % of I/O errors to simulate drives failures.
@@ -114,6 +128,14 @@ Used only for testing! Specify % of I/O errors to simulate drives failures.
114128
<content type="integer" default="${OCF_RESKEY_inject_errors_default}" />
115129
</parameter>
116130
131+
<parameter name="daemonize" unique="0">
132+
<longdesc lang="en">
133+
Specifies to start storage-mon as a daemon and check for devices.
134+
</longdesc>
135+
<shortdesc lang="en">start storage-mon with daemon</shortdesc>
136+
<content type="boolean" default="${OCF_RESKEY_daemonize_default}" />
137+
</parameter>
138+
117139
</parameters>
118140
119141
<actions>
@@ -146,6 +168,11 @@ storage-mon_init() {
146168
exit $OCF_ERR_INSTALLED
147169
fi
148170

171+
if [ ! -x "$ATTRDUP" ] ; then
172+
ocf_log err "${ATTRDUP} not installed."
173+
exit $OCF_ERR_INSTALLED
174+
fi
175+
149176
i=0
150177
for DRIVE in ${OCF_RESKEY_drives}; do
151178
if [ ! -e "$DRIVE" ] ; then
@@ -161,7 +188,12 @@ storage-mon_init() {
161188
fi
162189

163190
if [ "${OCF_RESKEY_io_timeout}" -lt "1" ]; then
164-
ocf_log err "Minimum timeout is 1. Recommended 10 (default)."
191+
ocf_log err "Minimum timeout is 1. Recommended ${OCF_RESKEY_io_timeout_default} (default)."
192+
exit $OCF_ERR_CONFIGURED
193+
fi
194+
195+
if [ "${OCF_RESKEY_check_interval}" -lt "1" ]; then
196+
ocf_log err "Minimum interval to check is 1. default ${OCF_RESKEY_check_interval_default}."
165197
exit $OCF_ERR_CONFIGURED
166198
fi
167199

@@ -173,77 +205,181 @@ storage-mon_init() {
173205
fi
174206
}
175207

176-
storage-mon_validate() {
177-
storage-mon_init
208+
storage-mon_update_attribute() {
178209

179-
# Is the state directory writable?
180-
state_dir=$(dirname "$OCF_RESKEY_state_file")
181-
touch "$state_dir/$$"
182-
if [ $? -ne 0 ]; then
183-
return $OCF_ERR_CONFIGURED
184-
fi
185-
rm "$state_dir/$$"
210+
while :
211+
do
212+
"$ATTRDUP" -n ${ATTRNAME} -U "$1" -d "5s"
213+
rc=$?
214+
if [ $rc -eq 0 ]; then
215+
break
216+
fi
186217

218+
ocf_log debug "${1} attribute by attrd_updater failed"
219+
if [ "$1" = "red" ]; then
220+
# If the attrd_updater fails with the red attribute, return an error to let pacemaker handle the failure immediately.
221+
return $OCF_ERR_GENERIC
222+
fi
223+
done
187224
return $OCF_SUCCESS
188225
}
189226

190227
storage-mon_monitor() {
191-
storage-mon_init
228+
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
229+
storage-mon_init
192230

193-
# Monitor _MUST!_ differentiate correctly between running
194-
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
195-
# That is THREE states, not just yes/no.
231+
# Monitor _MUST!_ differentiate correctly between running
232+
# (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING).
233+
# That is THREE states, not just yes/no.
196234

197-
if [ ! -f "${OCF_RESKEY_state_file}" ]; then
198-
return $OCF_NOT_RUNNING
199-
fi
235+
if [ ! -f "${OCF_RESKEY_state_file}" ]; then
236+
return $OCF_NOT_RUNNING
237+
fi
200238

201-
# generate command line
202-
cmdline=""
203-
for DRIVE in ${OCF_RESKEY_drives}; do
204-
cmdline="$cmdline --device $DRIVE --score 1"
205-
done
206-
cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
207-
if [ -n "${OCF_RESKEY_inject_errors}" ]; then
208-
cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
209-
fi
210-
$STORAGEMON $cmdline
211-
if [ $? -ne 0 ]; then
212-
status="red"
239+
# generate command line
240+
cmdline=""
241+
for DRIVE in ${OCF_RESKEY_drives}; do
242+
cmdline="$cmdline --device $DRIVE --score 1"
243+
done
244+
cmdline="$cmdline --timeout ${OCF_RESKEY_io_timeout}"
245+
if [ -n "${OCF_RESKEY_inject_errors}" ]; then
246+
cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
247+
fi
248+
$STORAGEMON $cmdline
249+
if [ $? -ne 0 ]; then
250+
status="red"
251+
else
252+
status="green"
253+
fi
254+
255+
storage-mon_update_attribute $status
256+
return "$?"
213257
else
214-
status="green"
215-
fi
258+
ocf_pidfile_status "${PIDFILE}" > /dev/null 2>&1
259+
case "$?" in
260+
0) rc=$OCF_SUCCESS;;
261+
1|2) rc=$OCF_NOT_RUNNING;;
262+
*) rc=$OCF_ERR_GENERIC;;
263+
esac
264+
265+
if [ $rc -ne $OCF_SUCCESS ]; then
266+
return "$rc"
267+
fi
268+
if [ "$1" = "pid_check_only" ]; then
269+
return "$rc"
270+
fi
216271

217-
"$ATTRDUP" -n "#health-${OCF_RESOURCE_INSTANCE}" -U "$status" -d "5s"
218-
return $OCF_SUCCESS
272+
# generate client command line
273+
cmdline=""
274+
cmdline="$cmdline --client --attrname ${ATTRNAME}"
275+
while :
276+
do
277+
# 0 : Normal.
278+
# greater than 0 : monitoring error.
279+
# 255(-1) : communication system error.
280+
# 254(-2) : Not all checks completed for first device in daemon mode.
281+
$STORAGEMON $cmdline
282+
rc=$?
283+
case "$rc" in
284+
254|255)
285+
# If there is a communication error or the initial check of all devices has not been completed,
286+
# it will loop and try to reconnect.
287+
# If every attempt ends with a communication error, this loop never exits and the monitor operation eventually times out.
288+
ocf_log debug "client monitor error : $rc"
289+
;;
290+
0)
291+
status="green"
292+
break
293+
;;
294+
*)
295+
status="red"
296+
break
297+
;;
298+
esac
299+
done
300+
301+
storage-mon_update_attribute $status
302+
return "$?"
303+
fi
219304
}
220305

221306
storage-mon_start() {
222-
storage-mon_monitor
223-
if [ $? -eq $OCF_SUCCESS ]; then
224-
return $OCF_SUCCESS
307+
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
308+
storage-mon_monitor
309+
if [ $? -eq $OCF_SUCCESS ]; then
310+
return $OCF_SUCCESS
311+
fi
312+
touch "${OCF_RESKEY_state_file}"
313+
else
314+
storage-mon_init
315+
# generate command line
316+
cmdline=""
317+
for DRIVE in ${OCF_RESKEY_drives}; do
318+
cmdline="$cmdline --device $DRIVE --score 1"
319+
done
320+
cmdline="$cmdline --daemonize --timeout ${OCF_RESKEY_io_timeout} --interval ${OCF_RESKEY_check_interval} --pidfile ${PIDFILE} --attrname ${ATTRNAME}"
321+
if [ -n "${OCF_RESKEY_inject_errors}" ]; then
322+
cmdline="$cmdline --inject-errors-percent ${OCF_RESKEY_inject_errors}"
323+
fi
324+
$STORAGEMON $cmdline
325+
if [ "$?" -ne 0 ]; then
326+
return $OCF_ERR_GENERIC
327+
fi
225328
fi
226-
touch "${OCF_RESKEY_state_file}"
227329
}
228330

229331
storage-mon_stop() {
230332
storage-mon_monitor
231-
if [ $? -eq $OCF_SUCCESS ]; then
232-
rm "${OCF_RESKEY_state_file}"
333+
rc=$?
334+
335+
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
336+
if [ $rc -eq $OCF_SUCCESS ]; then
337+
rm "${OCF_RESKEY_state_file}"
338+
fi
339+
else
340+
case "$rc" in
341+
$OCF_SUCCESS)
342+
;;
343+
$OCF_NOT_RUNNING)
344+
return "$OCF_SUCCESS";;
345+
*)
346+
return "$rc";;
347+
esac
348+
349+
kill -TERM $(cat "${PIDFILE}")
350+
if [ "$?" -ne 0 ]; then
351+
return $OCF_ERR_GENERIC
352+
fi
353+
354+
while true; do
355+
storage-mon_monitor pid_check_only
356+
rc="$?"
357+
case "$rc" in
358+
$OCF_SUCCESS)
359+
;;
360+
$OCF_NOT_RUNNING)
361+
return "$OCF_SUCCESS";;
362+
*)
363+
return "$rc";;
364+
esac
365+
sleep 1
366+
done
233367
fi
234368
return $OCF_SUCCESS
235369
}
236370

237371
storage-mon_validate() {
238372
storage-mon_init
239373

240-
# Is the state directory writable?
241-
state_dir=$(dirname "${OCF_RESKEY_state_file}")
242-
touch "$state_dir/$$"
243-
if [ $? -ne 0 ]; then
244-
return $OCF_ERR_CONFIGURED
374+
if ! ocf_is_true "$OCF_RESKEY_daemonize"; then
375+
# Is the state directory writable?
376+
state_dir=$(dirname "${OCF_RESKEY_state_file}")
377+
touch "$state_dir/$$"
378+
if [ $? -ne 0 ]; then
379+
return $OCF_ERR_CONFIGURED
380+
fi
381+
rm "$state_dir/$$"
245382
fi
246-
rm "$state_dir/$$"
247383

248384
return $OCF_SUCCESS
249385
}

resource-agents.spec.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ Provides: heartbeat-resources = %{version}
5555
BuildRequires: make
5656
BuildRequires: automake autoconf pkgconfig gcc
5757
BuildRequires: perl
58-
BuildRequires: libxslt glib2-devel
58+
BuildRequires: libxslt glib2-devel libqb-devel
5959
BuildRequires: systemd
6060
BuildRequires: which
6161

tools/Makefile.am

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,8 @@ sfex_stat_LDADD = $(GLIBLIB) -lplumb -lplumbgpl
7474
findif_SOURCES = findif.c
7575

7676
storage_mon_SOURCES = storage_mon.c
77-
storage_mon_CFLAGS = -D_GNU_SOURCE
77+
storage_mon_CFLAGS = -D_GNU_SOURCE ${LIBQB_CFLAGS}
78+
storage_mon_LDADD = ${LIBQB_LIBS}
7879

7980
if BUILD_TICKLE
8081
halib_PROGRAMS += tickle_tcp

0 commit comments

Comments
 (0)