Skip to content

Commit 9f589c5

Browse files
authored
Configurable startup timeout for supervisor; better exit detection (#10042)
This change does not fix any known problems, but is intended to help us get better diagnostics on issues like #8000 wherein the supervisor appears to be timing out at startup. Specifically, it seems possible that the supervisor is actually exited in these cases or is taking extra long to start, so: - make the timeout configurable so that if there actually is a situation in which it takes a super long time for first boot (AV scan? insanely slow hardware? something else?) it is survivable - move the close listener handler to _before_ we open the terminal, so that if the terminal closes immediately after starting we'll catch it - periodically while polling, check to see if the process is alive; it's possible it is exiting but the terminal isn't closing or the close listener isn't getting triggered
1 parent f5b24bd commit 9f589c5

File tree

3 files changed

+49
-26
lines changed

3 files changed

+49
-26
lines changed

extensions/positron-supervisor/package.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@
4040
"default": 30,
4141
"description": "%configuration.connectionTimeout.description%"
4242
},
43+
"kernelSupervisor.startupTimeout": {
44+
"scope": "window",
45+
"type": "number",
46+
"default": 10,
47+
"minimum": 1,
48+
"description": "%configuration.startupTimeout.description%"
49+
},
4350
"kernelSupervisor.logLevel": {
4451
"scope": "window",
4552
"type": "string",

extensions/positron-supervisor/package.nls.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"configuration.shutdownTimeout.indefinitely.description": "Leave kernels running indefinitely",
1919
"configuration.showTerminal.description": "Show the host terminal for the Positron kernel supervisor",
2020
"configuration.connectionTimeout.description": "Timeout in seconds for connecting to the kernel's sockets",
21+
"configuration.startupTimeout.description": "Timeout in seconds for starting the kernel supervisor",
2122
"configuration.attachOnStartup.description": "Run <f5> before starting up Jupyter kernel (when supported)",
2223
"configuration.sleepOnStartup.description": "Sleep for n seconds before starting up Jupyter kernel (when supported)",
2324
"configuration.runInShell.description": "Run kernels in a login shell (on Unix-like systems)",

extensions/positron-supervisor/src/KallichoreAdapterApi.ts

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,7 @@ export class KCApi implements PositronSupervisorApi {
289289
// Consult configuration to see if we should show this terminal
290290
const config = vscode.workspace.getConfiguration('kernelSupervisor');
291291
const showTerminal = config.get<boolean>('showTerminal', false);
292+
const startupTimeout = config.get<number>('startupTimeout', 10) * 1000;
292293

293294
// Create a temporary file with a random name to use for logs
294295
const logFile = path.join(os.tmpdir(), `kallichore-${sessionId}.log`);
@@ -367,25 +368,11 @@ export class KCApi implements PositronSupervisorApi {
367368
}
368369
}
369370

370-
// Start the server in a new terminal
371-
this.log(`Starting Kallichore server ${shellPath} with connection file ${connectionFile}`);
372-
const terminal = vscode.window.createTerminal({
373-
name: 'Kallichore',
374-
shellPath: wrapperPath,
375-
shellArgs,
376-
message: `*** Kallichore Server (${shellPath}) ***`,
377-
hideFromUser: !showTerminal,
378-
isTransient: false
379-
} satisfies vscode.TerminalOptions);
380-
381-
// Flag to track if the terminal exited before the start barrier opened
382-
let exited = false;
383-
384371
// Listen for the terminal to close. If it closes unexpectedly before
385372
// the start barrier opens, provide some feedback.
386373
const closeListener = vscode.window.onDidCloseTerminal(async (closedTerminal) => {
387374
// Ignore closed terminals that aren't the one we started
388-
if (closedTerminal !== terminal) {
375+
if (closedTerminal !== this._terminal) {
389376
return;
390377
}
391378

@@ -405,8 +392,8 @@ export class KCApi implements PositronSupervisorApi {
405392

406393
// Read the contents of the output file and log it
407394
const contents = fs.readFileSync(outFile, 'utf8');
408-
if (terminal.exitStatus && terminal.exitStatus.code) {
409-
this.log(`Supervisor terminal closed with exit code ${terminal.exitStatus.code}; output:\n${contents}`);
395+
if (this._terminal.exitStatus && this._terminal.exitStatus.code) {
396+
this.log(`Supervisor terminal closed with exit code ${this._terminal.exitStatus.code}; output:\n${contents}`);
410397
} else {
411398
this.log(`Supervisor terminal closed unexpectedly; output:\n${contents}`);
412399
}
@@ -424,6 +411,24 @@ export class KCApi implements PositronSupervisorApi {
424411
}
425412
});
426413

414+
415+
// Start the server in a new terminal
416+
this.log(`Starting Kallichore server ${shellPath} ` +
417+
`with connection file ${connectionFile} and ` +
418+
`${startupTimeout}ms startup timeout`);
419+
420+
this._terminal = vscode.window.createTerminal({
421+
name: 'Kallichore',
422+
shellPath: wrapperPath,
423+
shellArgs,
424+
message: `*** Kallichore Server (${shellPath}) ***`,
425+
hideFromUser: !showTerminal,
426+
isTransient: false
427+
} satisfies vscode.TerminalOptions);
428+
429+
// Flag to track if the terminal exited before the start barrier opened
430+
let exited = false;
431+
427432
// Ensure this listener is disposed when the API is disposed
428433
this._disposables.push(closeListener);
429434

@@ -437,7 +442,7 @@ export class KCApi implements PositronSupervisorApi {
437442
}));
438443

439444
// Wait for the terminal to start and get the PID
440-
let processId = await terminal.processId;
445+
let processId = await this._terminal.processId;
441446

442447
// Wait for the connection file to be written by the server
443448
let connectionData: KallichoreServerState | undefined = undefined;
@@ -481,6 +486,16 @@ export class KCApi implements PositronSupervisorApi {
481486
this.log(`Error reading connection file (attempt ${retry}): ${err}`);
482487
}
483488

489+
// Every 10 retries, check to see if the server process has exited.
490+
if (!exited && processId && retry % 10 === 0) {
491+
try {
492+
process.kill(processId, 0);
493+
} catch (err) {
494+
this.log(`Kallichore server PID ${processId} is not running`);
495+
exited = true;
496+
}
497+
}
498+
484499
// Has the terminal exited? if it has, there's no point in continuing to retry.
485500
if (exited) {
486501
let message = `The supervisor process exited unexpectedly during startup`;
@@ -497,7 +512,7 @@ export class KCApi implements PositronSupervisorApi {
497512
}
498513

499514
const elapsed = Date.now() - startTime;
500-
if (elapsed > 10000) {
515+
if (elapsed > startupTimeout) {
501516
let message = `Connection file was not created after ${elapsed}ms`;
502517

503518
// Include any output from the server process to help diagnose the problem
@@ -517,7 +532,7 @@ export class KCApi implements PositronSupervisorApi {
517532

518533
if (!connectionData) {
519534
let message = `Timed out waiting for connection file to be ` +
520-
`created at ${connectionFile} after 10 seconds`;
535+
`created at ${connectionFile} after ${startupTimeout}ms`;
521536

522537
// Include any output from the server process to help diagnose the problem
523538
if (fs.existsSync(outFile)) {
@@ -571,10 +586,10 @@ export class KCApi implements PositronSupervisorApi {
571586

572587
// ECONNREFUSED (for TCP) and ENOENT (for sockets) are normal
573588
// conditions during startup; the server isn't ready yet. Keep
574-
// trying up to 10 seconds from the time we got a process ID
575-
// established.
589+
// trying up to the startup timeout from the time we got a
590+
// process ID established.
576591
if (err.code === 'ECONNREFUSED' || err.code === 'ENOENT') {
577-
if (elapsed < 10000) {
592+
if (elapsed < startupTimeout) {
578593
// Log every few attempts. We don't want to overwhelm
579594
// the logs, and it's normal for us to encounter a few
580595
// connection refusals before the server is ready.
@@ -607,9 +622,9 @@ export class KCApi implements PositronSupervisorApi {
607622
}
608623

609624
// If the request times out, go ahead and try again as long as
610-
// it hasn't been more than 10 seconds since we started. This
611-
// can happen if the server is slow to start.
612-
if (err.code === 'ETIMEDOUT' && elapsed < 10000) {
625+
// the startup timeout hasn't been reached. This can happen if
626+
// the server is slow to start.
627+
if (err.code === 'ETIMEDOUT' && elapsed < startupTimeout) {
613628
this.log(`Request for server status timed out; retrying (attempt ${retry + 1}, ${elapsed}ms)`);
614629
continue;
615630
}

0 commit comments

Comments
 (0)