@@ -277,40 +277,25 @@ def _should_replace_connection(self, peer_id: ID, conn: INetConn) -> bool:
277277 async def _replace_unhealthy_connection (
278278 self , peer_id : ID , old_conn : INetConn
279279 ) -> None :
280- """Replace an unhealthy connection with a new one."""
280+ """
281+ Replace an unhealthy connection with a new one.
282+
283+ This method attempts to dial a replacement BEFORE closing the old one;
284+ it keeps the old one if that fails at or below the minimum threshold.
285+ """
281286 try :
282287 logger .info (f"Replacing unhealthy connection for peer { peer_id } " )
283288
284- # Check if we have enough connections remaining
289+ # Check current connection count
285290 current_connections = self .swarm .connections .get (peer_id , [])
286- remaining_after_removal = len (current_connections ) - 1
287-
288- # Only remove if we have more than the minimum required
289- if remaining_after_removal < self .config .min_connections_per_peer :
290- logger .warning (
291- f"Not replacing connection to { peer_id } : would go below minimum "
292- f"({ remaining_after_removal } < "
293- f"{ self .config .min_connections_per_peer } )"
294- )
295- return
296-
297- # Clean up health tracking first
298- self .swarm .cleanup_connection_health (peer_id , old_conn )
299-
300- # Remove from active connections
301- if (
302- peer_id in self .swarm .connections
303- and old_conn in self .swarm .connections [peer_id ]
304- ):
305- self .swarm .connections [peer_id ].remove (old_conn )
291+ current_count = len (current_connections )
306292
307- # Close the unhealthy connection
308- try :
309- await old_conn .close ()
310- except Exception as e :
311- logger .debug (f"Error closing unhealthy connection: { e } " )
293+ # Strategy: Try to establish new connection first, then close
294+ # old one. This prevents us from being stuck with a bad
295+ # connection at minimum threshold
312296
313- # Try to establish a new connection to maintain connectivity
297+ # First, try to establish a new connection
298+ new_conn = None
314299 try :
315300 logger .info (f"Attempting to dial replacement connection to { peer_id } " )
316301 new_conn = await self .swarm .dial_peer_replacement (peer_id )
@@ -322,12 +307,39 @@ async def _replace_unhealthy_connection(
322307 logger .warning (
323308 f"Failed to establish replacement connection to { peer_id } "
324309 )
325-
326310 except Exception as e :
327311 logger .error (
328312 f"Error establishing replacement connection to { peer_id } : { e } "
329313 )
330314
315+ # If we successfully established a new connection, or if we have enough
316+ # connections to safely remove the bad one, proceed with cleanup
317+ if new_conn or current_count > self .config .min_connections_per_peer :
318+ # Clean up health tracking
319+ self .swarm .cleanup_connection_health (peer_id , old_conn )
320+
321+ # Remove from active connections
322+ if (
323+ peer_id in self .swarm .connections
324+ and old_conn in self .swarm .connections [peer_id ]
325+ ):
326+ self .swarm .connections [peer_id ].remove (old_conn )
327+
328+ # Close the unhealthy connection
329+ try :
330+ await old_conn .close ()
331+ logger .info (f"Closed unhealthy connection to { peer_id } " )
332+ except Exception as e :
333+ logger .debug (f"Error closing unhealthy connection: { e } " )
334+ else :
335+ # We couldn't establish a new connection and we're at or below the minimum.
336+ # Keep the unhealthy connection rather than risk having no connection.
337+ logger .warning (
338+ f"Keeping unhealthy connection to { peer_id } : "
339+ f"failed to establish replacement and at minimum threshold "
340+ f"({ current_count } connections)"
341+ )
342+
331343 except Exception as e :
332344 logger .error (f"Error replacing connection to { peer_id } : { e } " )
333345
0 commit comments