@@ -111,7 +111,7 @@ static int mca_pml_ubcl_export_local_endpoint_handle(const int type)
111111
112112 err = ubcl_export_local_endpoint_handle (type , endpoint_h , & remote_rank_u64 );
113113 if (UBCL_SUCCESS != err ) {
114- return OMPI_ERROR ;
114+ return ubcl_error_to_ompi ( err ) ;
115115 }
116116
117117 mca_pml_ubcl_endpoint_modex_put (type , (void * ) endpoint_h , size );
@@ -120,10 +120,10 @@ static int mca_pml_ubcl_export_local_endpoint_handle(const int type)
120120 * The actual recv rank will be allocated during add_procs calls */
121121 err = ubcl_close_local_endpoint_channel (type , remote_rank_u64 );
122122 if (UBCL_SUCCESS != err ) {
123- mca_pml_ubcl_warn (OMPI_ERROR ,
123+ mca_pml_ubcl_warn (ubcl_error_to_ompi ( err ) ,
124124 "PML/UBCL failed to clean local endpoint (very unlikely error)."
125125 " For safety reason PML will be disabled." );
126- return OMPI_ERROR ;
126+ return ubcl_error_to_ompi ( err ) ;
127127 }
128128
129129 return OMPI_SUCCESS ;
@@ -133,35 +133,31 @@ int mca_pml_ubcl_create_local_endpoint(void)
133133{
134134 int type ;
135135 ubcl_error_t err ;
136- int ompi_error ;
137136
138137 type = UBCL_ENDPOINT_TYPE_SELF ;
139138 err = ubcl_create_local_endpoint (type );
140139 if (UBCL_SUCCESS != err ) {
141- mca_pml_ubcl_error ( OMPI_ERROR , "Failed ubcl_create_local_endpoint %d (%d)" , type , err );
140+ mca_pml_ubcl_warn ( ubcl_error_to_ompi ( err ) , "Failed ubcl_create_local_endpoint %d (%d)" , type , err );
142141 }
143142
144- /* UBCL_ENDPOINT_SHM */
145143 if (!mca_pml_ubcl_component .force_intranode_bxi ) {
146144 type = UBCL_ENDPOINT_TYPE_SHMEM ;
147145 err = ubcl_create_local_endpoint (type );
148- if (UBCL_SUCCESS ! = err ) {
149- mca_pml_ubcl_error ( OMPI_ERROR , "Failed ubcl_create_local_endpoint %d (%d)" , type , err );
146+ if (UBCL_SUCCESS = = err ) {
147+ err = mca_pml_ubcl_export_local_endpoint_handle ( type );
150148 }
151- ompi_error = mca_pml_ubcl_export_local_endpoint_handle (type );
152- if (OMPI_SUCCESS != ompi_error ) {
153- return ompi_error ;
149+ if (UBCL_SUCCESS != err ) {
150+ mca_pml_ubcl_warn (ubcl_error_to_ompi (err ), "Failed ubcl_create_local_endpoint %d (%d)" , type , err );
154151 }
155152 }
156153
157154 type = UBCL_ENDPOINT_TYPE_BXI ;
158155 err = ubcl_create_local_endpoint (type );
159- if (UBCL_SUCCESS ! = err ) {
160- mca_pml_ubcl_error ( OMPI_ERROR , "Failed ubcl_create_local_endpoint %d (%d)" , type , err );
156+ if (UBCL_SUCCESS = = err ) {
157+ err = mca_pml_ubcl_export_local_endpoint_handle ( type );
161158 }
162- ompi_error = mca_pml_ubcl_export_local_endpoint_handle (type );
163- if (OMPI_SUCCESS != ompi_error ) {
164- return ompi_error ;
159+ if (UBCL_SUCCESS != err ) {
160+ mca_pml_ubcl_warn (ubcl_error_to_ompi (err ), "Failed ubcl_create_local_endpoint %d (%d)" , type , err );
165161 }
166162
167163 return OMPI_SUCCESS ;
@@ -170,20 +166,23 @@ int mca_pml_ubcl_create_local_endpoint(void)
170166int mca_pml_ubcl_free_local_endpoints ()
171167{
172168 int ret ;
173- /* Finalize BXI */
174169 ret = ubcl_free_local_endpoint (UBCL_ENDPOINT_TYPE_BXI );
175- if (UBCL_SUCCESS != ret ) {
176- return OMPI_ERROR ;
170+ if (UBCL_SUCCESS != ret && UBCL_ERR_NOT_AVAILABLE != ret ) {
171+ /* If the transport was unavailable we silence the error,
172+ * we're closing it anyway */
173+ return ubcl_error_to_ompi (ret );
177174 }
175+
178176 if (!mca_pml_ubcl_component .force_intranode_bxi ) {
179177 ret = ubcl_free_local_endpoint (UBCL_ENDPOINT_TYPE_SHMEM );
180- if (UBCL_SUCCESS != ret ) {
181- return OMPI_ERROR ;
178+ if (UBCL_SUCCESS != ret && UBCL_ERR_NOT_AVAILABLE != ret ) {
179+ return ubcl_error_to_ompi ( ret ) ;
182180 }
183181 }
182+
184183 ret = ubcl_free_local_endpoint (UBCL_ENDPOINT_TYPE_SELF );
185- if (UBCL_SUCCESS != ret ) {
186- return OMPI_ERROR ;
184+ if (UBCL_SUCCESS != ret && UBCL_ERR_NOT_AVAILABLE != ret ) {
185+ return ubcl_error_to_ompi ( ret ) ;
187186 }
188187
189188 return OMPI_SUCCESS ;
@@ -255,7 +254,7 @@ static int mca_pml_ubcl_create_recv_endpoint(uint64_t sender_rank, const int typ
255254
256255 err = ubcl_export_local_endpoint_handle (type , endpoint_h , & remote_rank_u64 );
257256 if (UBCL_SUCCESS != err ) {
258- return OMPI_ERROR ;
257+ return ubcl_error_to_ompi ( err ) ;
259258 }
260259
261260 return OMPI_SUCCESS ;
@@ -270,11 +269,11 @@ static int mca_pml_ubcl_create_self_endpoints(uint64_t remote_rank)
270269
271270 err = ubcl_export_local_endpoint_handle (type , endpoint_h , & my_rank );
272271 if (UBCL_SUCCESS != err ) {
273- return OMPI_ERROR ;
272+ return ubcl_error_to_ompi ( err ) ;
274273 }
275274 err = ubcl_create_remote_endpoint (my_rank , my_rank , type , endpoint_h );
276275 if (UBCL_SUCCESS != err ) {
277- return OMPI_ERROR ;
276+ return ubcl_error_to_ompi ( err ) ;
278277 }
279278
280279 return OMPI_SUCCESS ;
@@ -296,6 +295,25 @@ static int get_endpoint_type(ompi_proc_t *proc)
296295 }
297296}
298297
298+ static enum ubcl_endpoint_type_t mca_pml_ubcl_get_higher_transport (
299+ enum ubcl_endpoint_type_t type )
300+ { /* Returns the transport to try after 'type', or UBCL_ENDPOINT_TYPE_NONE when there is none */
301+ switch ((int ) type ) {
302+ case UBCL_ENDPOINT_TYPE_SELF :
303+ case UBCL_ENDPOINT_TYPE_SHMEM :
304+ type ++ ; /* next enumerator; presumably SELF -> SHMEM -> BXI, confirm against the enum declaration */
305+ break ;
306+ /* BXI has no valid higher transport: report UBCL_ENDPOINT_TYPE_NONE */
307+ case UBCL_ENDPOINT_TYPE_BXI :
308+ default :
309+ type = UBCL_ENDPOINT_TYPE_NONE ;
310+ /* The default case covers values that are not a valid transport */
311+ break ;
312+ }
313+
314+ return type ;
315+ }
316+
299317void mca_pml_ubcl_endpoint_retain (ompi_proc_t * proc )
300318{
301319 mca_common_ubcl_endpoint_t * endpoint = NULL ;
@@ -312,6 +330,7 @@ void mca_pml_ubcl_endpoint_retain(ompi_proc_t *proc)
312330static int mca_pml_ubcl_create_endpoints (ompi_proc_t * proc )
313331{
314332 int err = OMPI_SUCCESS ;
333+ enum ubcl_endpoint_type_t type ;
315334 mca_common_ubcl_endpoint_t * new_endpoint ;
316335
317336 new_endpoint = malloc (sizeof (mca_common_ubcl_endpoint_t ));
@@ -322,29 +341,58 @@ static int mca_pml_ubcl_create_endpoints(ompi_proc_t *proc)
322341
323342 new_endpoint -> refcount = 0 ; //we increment it to 1 in endpoint_retain
324343 new_endpoint -> rank = mca_pml_forge_rank (proc );
325- new_endpoint -> type = get_endpoint_type (proc );
344+ type = get_endpoint_type (proc );
326345
327- if (UBCL_ENDPOINT_TYPE_SELF == new_endpoint -> type ) {
346+ if (UBCL_ENDPOINT_TYPE_SELF == type ) {
328347 err = mca_pml_ubcl_create_self_endpoints ((uint64_t ) new_endpoint -> rank );
329- goto end ;
330- }
331348
332- err = mca_pml_ubcl_create_recv_endpoint (new_endpoint -> rank , new_endpoint -> type );
333- if (OMPI_SUCCESS != err ) {
334- mca_pml_ubcl_error (err , "Failed to create recv endpoint for rank %zu\n" ,
335- new_endpoint -> rank );
349+ /* If the transport is unavailable (either explicitly disabled,
350+ * or just unavailable) we do not return any error
351+ * If UBCL encountered another error we return it */
352+ if (OMPI_SUCCESS == err ) {
353+ goto end ;
354+ } else if (OMPI_ERR_NOT_AVAILABLE != err ) {
355+ goto error ;
356+ }
336357 }
337358
338- err = mca_pml_ubcl_create_send_endpoint (proc , new_endpoint -> rank , new_endpoint -> type );
359+ /* If a transport is unavailable only a higher transport can take its place,
360+ * i.e. if SHM is unavailable, SELF cannot replace it but BXI can */
361+ do {
362+ err = mca_pml_ubcl_create_recv_endpoint (new_endpoint -> rank , type );
363+
364+ if (OMPI_ERR_NOT_AVAILABLE == err ) {
365+ type = mca_pml_ubcl_get_higher_transport (type );
366+ if (UBCL_ENDPOINT_TYPE_NONE == type ) {
367+ mca_pml_ubcl_warn (err , "Failed to create recv endpoint for rank %zu\n" ,
368+ new_endpoint -> rank );
369+ goto error ;
370+ }
371+ } else if (OMPI_SUCCESS != err ) {
372+ mca_pml_ubcl_warn (err , "Failed to create recv endpoint for rank %zu\n" ,
373+ new_endpoint -> rank );
374+ goto error ;
375+ }
376+ } while (OMPI_SUCCESS != err );
377+
378+ /* No need to loop again, if the transport became unavailable between
380+ * the last operation and this one we can consider this an error */
380+ err = mca_pml_ubcl_create_send_endpoint (proc , new_endpoint -> rank , type );
339381 if (OMPI_SUCCESS != err ) {
340- mca_pml_ubcl_error (err , "Failed to create send endpoint for rank %zu\n" ,
341- new_endpoint -> rank );
382+ mca_pml_ubcl_warn (err , "Failed to create send endpoint for rank %zu\n" ,
383+ new_endpoint -> rank );
384+ goto error ;
342385 }
343386
344387end :
388+ new_endpoint -> type = type ;
345389 (proc )-> proc_endpoints [OMPI_PROC_ENDPOINT_TAG_PML ] = new_endpoint ;
346390 mca_pml_ubcl_endpoint_retain (proc );
347391
392+ return UBCL_SUCCESS ;
393+
394+ error :
395+ free (new_endpoint );
348396 return err ;
349397}
350398
0 commit comments