Skip to content

Commit b88c0b4

Browse files
authored
Merge pull request #6765 from Martchus/reject-if-broken
Reject jobs if worker is broken when receiving a new job
2 parents e5c0222 + bc373d2 commit b88c0b4

File tree

4 files changed

+24
-14
lines changed

4 files changed

+24
-14
lines changed

lib/OpenQA/Worker/CommandHandler.pm

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,19 +134,24 @@ sub _handle_command_developer_session_start {
134134
$current_job->developer_session_running(1);
135135
}
136136

137+
sub _reevaluate_and_return_current_error ($client, $worker) {
138+
# invoke send_status which will re-evaluate the availability updating $worker->current_error
139+
# note: This will not work if there's currently a job but then we will not accept a new job anyway.
140+
$client->send_status(on_error => 1) unless $worker->current_job;
141+
return $worker->current_error;
142+
}
143+
137144
sub _can_grab_job {
138145
my ($client, $worker, $webui_host, $current_job, $job_ids_to_grab) = @_;
139146
my $reason_to_reject_job;
140147

141-
# refuse new job(s) if the worker is
142-
# * in an error state
143-
# * stopping
148+
# refuse new job(s) if the worker is in an error state or stopping
144149
if ($worker->is_stopping) {
145150
$reason_to_reject_job = 'currently stopping';
146151
# note: Not rejecting the job here; declaring the worker as offline which is done in any case
147152
# should be sufficient.
148153
}
149-
elsif (my $current_error = $worker->current_error) {
154+
elsif (my $current_error = _reevaluate_and_return_current_error($client, $worker)) {
150155
$reason_to_reject_job = $current_error;
151156
$client->reject_jobs($job_ids_to_grab, $reason_to_reject_job);
152157
}

lib/OpenQA/Worker/WebUIConnection.pm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ sub calculate_status_update_interval ($self) {
360360
}
361361

362362
# sends the overall worker status
363-
sub send_status ($self) {
363+
sub send_status ($self, %args) {
364364
# ensure an ongoing timer is cancelled in case send_status has been called manually
365365
if (my $send_status_timer = delete $self->{_send_status_timer}) {
366366
Mojo::IOLoop->remove($send_status_timer);
@@ -370,7 +370,7 @@ sub send_status ($self) {
370370
my $websocket_connection = $self->websocket_connection;
371371
return undef unless $websocket_connection;
372372
my $status = $self->{_last_worker_status} = $self->worker->status;
373-
$websocket_connection->send({json => $status});
373+
$websocket_connection->send({json => $status}) if !$args{on_error} || defined $status->{reason};
374374
}
375375

376376
sub send_status_delayed ($self) {

t/24-worker-webui-connection.t

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,7 @@ subtest 'retry delay configurable' => sub {
366366

367367
subtest 'send status' => sub {
368368
my $ws = OpenQA::Test::FakeWebSocketTransaction->new;
369+
$client->worker->current_error('some error');
369370
$client->websocket_connection($ws);
370371
$client->send_status();
371372
is_deeply($ws->sent_messages, [{json => {fake_status => 1, reason => 'some error'}}], 'status sent')
@@ -424,6 +425,7 @@ subtest 'command handler' => sub {
424425
my $command_handler = OpenQA::Worker::CommandHandler->new($client);
425426
my $ws = OpenQA::Test::FakeWebSocketTransaction->new;
426427
my $worker = $client->worker;
428+
$worker->current_error('some error');
427429
$client->websocket_connection($ws);
428430

429431
# test at least some of the error cases
@@ -437,13 +439,13 @@ qr/Ignoring WS message from http:\/\/test-host with type livelog_stop and job ID
437439
my %job = (id => 42, settings => {});
438440
combined_like { $command_handler->handle_command(undef, {type => WORKER_COMMAND_GRAB_JOB, job => \%job}) }
439441
qr/Refusing to grab job.*: some error/, 'ignoring grab_job while in error-state';
440-
442+
is $worker->current_job, undef, 'no job has been accepted while in error-state';
441443
my %job_info = (sequence => [$job{id}], data => {42 => \%job});
442444
combined_like {
443445
$command_handler->handle_command(undef, {type => WORKER_COMMAND_GRAB_JOBS, job_info => \%job_info})
444446
}
445447
qr/Refusing to grab job.*: some error/, 'ignoring grab_jobs while in error-state';
446-
is($worker->current_job, undef, 'no job has been accepted while in error-state');
448+
is $worker->current_job, undef, 'no jobs have been accepted while in error-state';
447449

448450
$worker->current_error(undef);
449451
$worker->is_stopping(1);
@@ -497,11 +499,12 @@ qr/Ignoring WS message from http:\/\/test-host with type livelog_stop and job ID
497499
$worker->current_job_ids([]);
498500
$worker->is_busy(0);
499501
my $rejection = sub { {json => {job_ids => shift, reason => shift, type => 'rejected'}} };
502+
my %fake_error_status = (json => {fake_status => 1, reason => 'some error'});
500503
is_deeply(
501504
$ws->sent_messages,
502505
[
503-
$rejection->([42], 'some error'),
504-
$rejection->([42], 'some error'),
506+
\%fake_error_status, $rejection->([42], 'some error'), # status + rejection due to error state
507+
\%fake_error_status, $rejection->([42], 'some error'), # status + rejection due to error state
505508
$rejection->(['but no settings'], 'the provided job is invalid'),
506509
$rejection->(['42'], 'job info lacks execution sequence'),
507510
$rejection->(['42'], 'already busy with a job from foo'),
@@ -512,8 +515,8 @@ qr/Ignoring WS message from http:\/\/test-host with type livelog_stop and job ID
512515

513516
# test accepting a job
514517
is($worker->current_job, undef, 'no job accepted so far');
515-
$command_handler->handle_command(undef,
516-
{type => WORKER_COMMAND_GRAB_JOB, job => {id => 25, settings => {FOO => 'bar'}}});
518+
my %cmd = (type => WORKER_COMMAND_GRAB_JOB, job => {id => 25, settings => {FOO => 'bar'}});
519+
$command_handler->handle_command(undef, \%cmd);
517520
my $accepted_job = $worker->current_job;
518521
is_deeply($accepted_job->info, {id => 25, settings => {FOO => 'bar'}}, 'job accepted');
519522

@@ -587,8 +590,10 @@ qr/Ignoring WS message from http:\/\/test-host with type livelog_stop and job ID
587590
is_deeply $worker->skipped_jobs, [[43, 'quit']], 'pending job is going to be skipped';
588591

589592
# reacting to info message
593+
$worker->current_error('yet another error');
594+
$client->send_status;
590595
ok !$client->{_send_status_timer}, 'sending status update not scheduled yet';
591-
combined_like { $command_handler->handle_command(undef, {type => 'info', seen => 1}) } qr/some error/,
596+
combined_like { $command_handler->handle_command(undef, {type => 'info', seen => 1}) } qr/yet another error/,
592597
'status update logged';
593598
ok $client->{_send_status_timer}, 'sending status update with delay after receiving "seen" message';
594599
};

t/lib/OpenQA/Test/FakeWorker.pm

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ has is_executing_single_job => 1;
2828

2929
sub stop_current_job ($self, $reason) { $self->stop_current_job_called($reason) }
3030
sub stop ($self) { $self->is_stopping(1) }
31-
sub status ($self) { {fake_status => 1, reason => 'some error'} }
31+
sub status ($self) { {fake_status => 1, reason => $self->current_error} }
3232

3333
sub accept_job ($self, $client, $job_info) {
3434
$self->current_job(OpenQA::Worker::Job->new($self, $client, $job_info));

0 commit comments

Comments
 (0)