Skip to content

Commit 44c353d

Browse files
authored
Merge pull request #6801 from grondo/issue#6795
job-manager: add `stop-queues-on-restart` configuration parameter
2 parents bc7b3c9 + a4d5d8e commit 44c353d

File tree

3 files changed

+54
-3
lines changed

3 files changed

+54
-3
lines changed

doc/man5/flux-config-job-manager.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,17 @@ inactive-age-limit
2727
inactive-num-limit
2828
(optional) Integer maximum number of inactive jobs retained in the KVS.
2929

30+
stop-queues-on-restart
31+
(optional) Boolean value indicating whether the job manager should automatically
32+
stop any started queues during a restart. Queues stopped in this manner will
33+
have their stop reason set to ::
34+
35+
Automatically stopped due to restart
36+
37+
while queues stopped before a shutdown will remain stopped with their
38+
original stop reason. The default value is ``false``, which means that
39+
started queues will remain started upon restart.
40+
3041
plugins
3142
(optional) An array of objects defining a list of jobtap plugin directives.
3243
Each directive follows the format defined in the :ref:`plugin_directive`

src/modules/job-manager/queue.c

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ struct queue_ctx {
8282
zhashx_t *named;
8383
};
8484
bool have_named_queues;
85+
bool stop_on_restart; // stop started queues on restart
8586
};
8687

8788
static void dequeue_jobs (struct queue_ctx *qctx, const char *name);
@@ -403,7 +404,18 @@ static int restore_state_v1 (struct queue_ctx *qctx, json_t *entry)
403404
return -1;
404405
}
405406
if (start) {
406-
if (queue_start (q, false) < 0)
407+
/* If job-manager.stop-queues-on-restart is enabled, then
408+
* stop this started queue with an automated message, otherwise
409+
* leave the queue started.
410+
*/
411+
if (qctx->stop_on_restart) {
412+
if (queue_stop_one (qctx,
413+
q,
414+
"Automatically stopped due to restart",
415+
false) < 0)
416+
return -1;
417+
}
418+
else if (queue_start (q, false) < 0)
407419
return -1;
408420
}
409421
else {
@@ -493,7 +505,12 @@ static int queue_configure (const flux_conf_t *conf,
493505
struct queue_ctx *qctx = arg;
494506
json_t *queues;
495507

496-
if (flux_conf_unpack (conf, NULL, "{s:o}", "queues", &queues) == 0
508+
if (flux_conf_unpack (conf,
509+
NULL,
510+
"{s?{s?b} s:o}",
511+
"job-manager",
512+
"stop-queues-on-restart", &qctx->stop_on_restart,
513+
"queues", &queues) == 0
497514
&& json_object_size (queues) > 0) {
498515
const char *name;
499516
json_t *value;
@@ -1025,6 +1042,8 @@ struct queue_ctx *queue_ctx_create (struct job_manager *ctx)
10251042
if (!(qctx = calloc (1, sizeof (*qctx))))
10261043
return NULL;
10271044
qctx->ctx = ctx;
1045+
qctx->stop_on_restart = false;
1046+
10281047
if (!(qctx->anon = queue_create (NULL, NULL)))
10291048
goto error;
10301049
if (flux_msg_handler_addvec (ctx->h,

t/t2219-job-manager-restart.t

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,28 @@ test_expect_success 'verify that named queue start/stop persists across restart'
159159
grep "^batch: Scheduling is stopped: xyzzy" dump_queue_start_3.out &&
160160
grep "^SCHED" dump_queue_start_3.out
161161
'
162-
162+
test_expect_success 'verify that stop-queues-on-restart config works' '
163+
mkdir -p conf.d &&
164+
cat >conf.d/queues.toml <<-EOT &&
165+
[job-manager]
166+
stop-queues-on-restart = true
167+
[queues.debug]
168+
[queues.batch]
169+
EOT
170+
flux start --config-path=$(pwd)/conf.d \
171+
-Scontent.dump=dump_queue_stop.tar \
172+
sh -c "flux queue start --all; flux queue stop -m xxyyzz batch" &&
173+
flux start --config-path=$(pwd)/conf.d \
174+
-Scontent.restore=dump_queue_stop.tar \
175+
flux queue status -v > dump_stop.out &&
176+
test_debug "cat dump_stop.out" &&
177+
grep \
178+
"batch: Scheduling is stopped: xxyyzz" \
179+
dump_stop.out &&
180+
grep \
181+
"debug: Scheduling is stopped: Automatically stopped due to restart" \
182+
dump_stop.out
183+
'
163184
test_expect_success 'checkpointed queue no longer configured on restart is ignored' '
164185
mkdir -p conf.d &&
165186
cat >conf.d/queues.toml <<-EOT &&

0 commit comments

Comments
 (0)