|
1 | 1 | import json
|
2 | 2 | import os
|
3 | 3 | import platform
|
| 4 | +import signal |
4 | 5 | import sys
|
| 6 | +import threading |
5 | 7 | import time
|
6 | 8 |
|
7 | 9 | from metaflow import current
|
|
37 | 39 | from metaflow.unbounded_foreach import UBF_CONTROL
|
38 | 40 |
|
39 | 41 | from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
|
40 |
| -from .kubernetes import KubernetesException |
| 42 | +from .kubernetes import KubernetesException, SPOT_INTERRUPT_EXITCODE |
41 | 43 | from .kube_utils import validate_kube_labels, parse_kube_keyvalue_list
|
42 | 44 |
|
43 | 45 | try:
|
@@ -548,6 +550,29 @@ def task_pre_step(
|
548 | 550 | self._save_logs_sidecar = Sidecar("save_logs_periodically")
|
549 | 551 | self._save_logs_sidecar.start()
|
550 | 552 |
|
| 553 | + # Set up signal handling for spot termination |
| 554 | + main_pid = os.getpid() |
| 555 | + |
def _termination_timer():
    """Wait out the grace period, then signal the main process.

    Runs on a background thread started by the SIGUSR1 handler. After
    sleeping 30 seconds it sends SIGALRM to ``main_pid`` (the pid captured
    via ``os.getpid()`` in the enclosing scope), which triggers the
    SIGALRM handler registered there.
    """
    grace_period_seconds = 30
    time.sleep(grace_period_seconds)
    os.kill(main_pid, signal.SIGALRM)
| 559 | + |
def _spot_term_signal_handler(*args, **kwargs):
    """React to SIGUSR1 by scheduling the end of the task.

    The signal is only acted upon when the file at
    ``current.spot_termination_notice`` exists; otherwise it is ignored.
    Positional and keyword arguments supplied by the signal machinery
    are accepted and unused.
    """
    notice_path = current.spot_termination_notice
    if not os.path.isfile(notice_path):
        # No termination notice on disk: nothing to do for this signal.
        return

    print(
        "Spot instance termination detected. Starting a timer to end the Metaflow task."
    )
    # Daemon thread, so a pending countdown never keeps the process alive.
    threading.Thread(target=_termination_timer, daemon=True).start()
| 569 | + |
def _curtain_call(*args, **kwargs):
    """Exit the process with the dedicated Spot-interruption exit code.

    Registered as the SIGALRM handler in the enclosing scope. Arguments
    supplied by the signal machinery are accepted and unused.

    Raises:
        SystemExit: always, carrying ``SPOT_INTERRUPT_EXITCODE``.
    """
    # Custom exit code in case of Spot termination.
    raise SystemExit(SPOT_INTERRUPT_EXITCODE)
| 573 | + |
| 574 | + signal.signal(signal.SIGUSR1, _spot_term_signal_handler) |
| 575 | + signal.signal(signal.SIGALRM, _curtain_call) |
551 | 576 | # Start spot termination monitor sidecar.
|
552 | 577 | # TODO: Find a nicer way to pass the main process id to a Sidecar, so that sidecars can send signals back to the main process.
|
553 | 578 | os.environ["MF_MAIN_PID"] = str(os.getpid())
|
|
0 commit comments