@@ -22,6 +22,8 @@
 
 from service_configuration_lib import utils
 from service_configuration_lib.text_colors import TextColors
+from service_configuration_lib.utils import EPHEMERAL_PORT_END
+from service_configuration_lib.utils import EPHEMERAL_PORT_START
 
 AWS_CREDENTIALS_DIR = '/etc/boto_cfg/'
 AWS_ENV_CREDENTIALS_PROVIDER = 'com.amazonaws.auth.EnvironmentVariableCredentialsProvider'
@@ -32,6 +32,7 @@
 GPUS_HARD_LIMIT = 15
 CLUSTERMAN_METRICS_YAML_FILE_PATH = '/nail/srv/configs/clusterman_metrics.yaml'
 CLUSTERMAN_YAML_FILE_PATH = '/nail/srv/configs/clusterman.yaml'
+SPARK_TRON_JOB_USER = 'TRON'
 
 NON_CONFIGURABLE_SPARK_OPTS = {
     'spark.master',
@@ -295,7 +298,7 @@ def _get_k8s_spark_env(
     paasta_service: str,
     paasta_instance: str,
     docker_img: str,
-    pod_template_path: str,
+    pod_template_path: Optional[str],
     volumes: Optional[List[Mapping[str, str]]],
     paasta_pool: str,
     driver_ui_port: int,
@@ -335,9 +338,12 @@ def _get_k8s_spark_env(
         'spark.kubernetes.executor.label.yelp.com/pool': paasta_pool,
         'spark.kubernetes.executor.label.paasta.yelp.com/pool': paasta_pool,
         'spark.kubernetes.executor.label.yelp.com/owner': 'core_ml',
-        'spark.kubernetes.executor.podTemplateFile': pod_template_path,
         **_get_k8s_docker_volumes_conf(volumes),
     }
+
+    if pod_template_path is not None:
+        spark_env['spark.kubernetes.executor.podTemplateFile'] = pod_template_path
+
     if service_account_name is not None:
         spark_env.update(
             {
@@ -419,12 +425,13 @@ def get_total_driver_memory_mb(spark_conf: Dict[str, str]) -> int:
 
 class SparkConfBuilder:
 
-    def __init__(self):
-        self.spark_srv_conf = dict()
-        self.spark_constants = dict()
-        self.default_spark_srv_conf = dict()
-        self.mandatory_default_spark_srv_conf = dict()
-        self.spark_costs = dict()
+    def __init__(self, is_driver_on_k8s_tron: bool = False):
+        self.is_driver_on_k8s_tron = is_driver_on_k8s_tron
+        self.spark_srv_conf: Dict[str, Any] = dict()
+        self.spark_constants: Dict[str, Any] = dict()
+        self.default_spark_srv_conf: Dict[str, Any] = dict()
+        self.mandatory_default_spark_srv_conf: Dict[str, Any] = dict()
+        self.spark_costs: Dict[str, Dict[str, float]] = dict()
 
         try:
             (
@@ -628,7 +635,7 @@ def compute_executor_instances_k8s(self, user_spark_opts: Dict[str, str]) -> int
         )
 
         # Deprecation message
-        if 'spark.cores.max' in user_spark_opts:
+        if not self.is_driver_on_k8s_tron and 'spark.cores.max' in user_spark_opts:
             log.warning(
                 f'spark.cores.max is DEPRECATED. Replace with '
                 f'spark.executor.instances={executor_instances} in --spark-args and in your service code '
@@ -1102,23 +1109,27 @@ def get_spark_conf(
             spark_app_base_name
         )
 
-        # Pick a port from a pre-defined port range, which will then be used by our Jupyter
-        # server metric aggregator API. The aggregator API collects Prometheus metrics from multiple
-        # Spark sessions and exposes them through a single endpoint.
-        try:
-            ui_port = int(
-                (spark_opts_from_env or {}).get('spark.ui.port') or
-                utils.ephemeral_port_reserve_range(
-                    self.spark_constants.get('preferred_spark_ui_port_start'),
-                    self.spark_constants.get('preferred_spark_ui_port_end'),
-                ),
-            )
-        except Exception as e:
-            log.warning(
-                f'Could not get an available port using srv-config port range: {e}. '
-                'Using default port range to get an available port.',
-            )
-            ui_port = utils.ephemeral_port_reserve_range()
+        if self.is_driver_on_k8s_tron:
+            # For Tron-launched driver on k8s, we use a static Spark UI port
+            ui_port: int = self.spark_constants.get('preferred_spark_ui_port_start', EPHEMERAL_PORT_START)
+        else:
+            # Pick a port from a pre-defined port range, which will then be used by our Jupyter
+            # server metric aggregator API. The aggregator API collects Prometheus metrics from multiple
+            # Spark sessions and exposes them through a single endpoint.
+            try:
+                ui_port = int(
+                    (spark_opts_from_env or {}).get('spark.ui.port') or
+                    utils.ephemeral_port_reserve_range(
+                        self.spark_constants.get('preferred_spark_ui_port_start', EPHEMERAL_PORT_START),
+                        self.spark_constants.get('preferred_spark_ui_port_end', EPHEMERAL_PORT_END),
+                    ),
+                )
+            except Exception as e:
+                log.warning(
+                    f'Could not get an available port using srv-config port range: {e}. '
+                    'Using default port range to get an available port.',
+                )
+                ui_port = utils.ephemeral_port_reserve_range()
 
         spark_conf = {**(spark_opts_from_env or {}), **_filter_user_spark_opts(user_spark_opts)}
         random_postfix = utils.get_random_string(4)
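A simplified, self-contained sketch of the port-selection branch above; the constant values are assumed defaults, and `reserve_any_port` stands in for `utils.ephemeral_port_reserve_range`, which reserves a free port from a preferred range:

```python
import socket
from typing import Dict, Optional

# Assumed defaults; the real constants come from service_configuration_lib.utils.
EPHEMERAL_PORT_START = 49152
EPHEMERAL_PORT_END = 65535


def reserve_any_port() -> int:
    # Stand-in for utils.ephemeral_port_reserve_range(): ask the OS for a free port.
    with socket.socket() as s:
        s.bind(('127.0.0.1', 0))
        return s.getsockname()[1]


def pick_ui_port(
    is_driver_on_k8s_tron: bool,
    spark_constants: Dict[str, int],
    spark_opts_from_env: Optional[Dict[str, str]] = None,
) -> int:
    if is_driver_on_k8s_tron:
        # Tron drivers get a deterministic port, so the UI address is known
        # up front instead of depending on runtime port reservation.
        return spark_constants.get('preferred_spark_ui_port_start', EPHEMERAL_PORT_START)
    # Otherwise honor an explicit spark.ui.port, then fall back to reserving one.
    explicit = (spark_opts_from_env or {}).get('spark.ui.port')
    return int(explicit) if explicit else reserve_any_port()
```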
@@ -1157,12 +1168,14 @@ def get_spark_conf(
         )
 
         # Add pod template file
-        pod_template_path = utils.generate_pod_template_path()
-        try:
-            utils.create_pod_template(pod_template_path, app_base_name)
-        except Exception as e:
-            log.error(f'Failed to generate Spark executor pod template: {e}')
-            pod_template_path = ''
+        pod_template_path: Optional[str] = None
+        if not self.is_driver_on_k8s_tron:
+            pod_template_path = utils.generate_pod_template_path()
+            try:
+                utils.create_pod_template(pod_template_path, app_base_name)
+            except Exception as e:
+                log.error(f'Failed to generate Spark executor pod template: {e}')
+                pod_template_path = None
 
         if cluster_manager == 'kubernetes':
             spark_conf.update(_get_k8s_spark_env(
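Taken together, the pod-template hunks switch the "no template" sentinel from `''` to `None`, matching the new `Optional[str]` parameter on `_get_k8s_spark_env`. A small sketch of the guarded generation step, with hypothetical local stand-ins for the `utils` helpers:

```python
import logging
import os
import tempfile
from typing import Optional

log = logging.getLogger(__name__)


def _generate_pod_template_path() -> str:
    # Hypothetical stand-in for utils.generate_pod_template_path().
    fd, path = tempfile.mkstemp(prefix='spark-pod-template-', suffix='.yaml')
    os.close(fd)
    return path


def _create_pod_template(path: str, app_base_name: str) -> None:
    # Hypothetical stand-in for utils.create_pod_template().
    with open(path, 'w') as f:
        f.write(f'# pod template for {app_base_name}\n')


def maybe_generate_pod_template(is_driver_on_k8s_tron: bool, app_base_name: str) -> Optional[str]:
    # Tron-launched drivers on k8s skip executor pod templates entirely.
    if is_driver_on_k8s_tron:
        return None
    path = _generate_pod_template_path()
    try:
        _create_pod_template(path, app_base_name)
    except Exception as e:
        # Returning None (rather than '') lets downstream code use an
        # identity check before setting podTemplateFile.
        log.error(f'Failed to generate Spark executor pod template: {e}')
        return None
    return path
```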