@@ -2,8 +2,11 @@ package cluster
22
33import (
44 "sync"
5+ "time"
56
67 log "github.com/Sirupsen/logrus"
8+ "github.com/docker/engine-api/types/network"
9+ "golang.org/x/net/context"
710)
811
912// Watchdog listens to cluster events and handles container rescheduling
@@ -61,6 +64,7 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
6164 defer w .Unlock ()
6265
6366 log .Debugf ("Node %s failed - rescheduling containers" , e .ID )
67+
6468 for _ , c := range e .Containers () {
6569
6670 // Skip containers which don't have an "on-node-failure" reschedule policy.
@@ -75,23 +79,76 @@ func (w *Watchdog) rescheduleContainers(e *Engine) {
7579 // will abort because the name is already taken.
7680 c .Engine .removeContainer (c )
7781
78- newContainer , err := w .cluster .CreateContainer (c .Config , c .Info .Name , nil )
82+ // keep track of all global networks this container is connected to
83+ globalNetworks := make (map [string ]* network.EndpointSettings )
84+ // if the existing container has global network endpoints,
85+ // they need to be removed with force option
86+ // "docker network disconnect -f network containername" only takes containername
87+ name := c .Info .Name
88+ if len (name ) == 0 || len (name ) == 1 && name [0 ] == '/' {
89+ log .Errorf ("container %s has no name" , c .ID )
90+ continue
91+ }
92+ // cut preceding '/'
93+ if name [0 ] == '/' {
94+ name = name [1 :]
95+ }
96+
97+ if c .NetworkSettings != nil && len (c .NetworkSettings .Networks ) > 0 {
98+ // find an engine to do disconnect work
99+ randomEngine , err := w .cluster .RANDOMENGINE ()
100+ if err != nil {
101+ log .Errorf ("Failed to find an engine to do network cleanup for container %s: %v" , c .ID , err )
102+ // add the container back, so we can retry later
103+ c .Engine .AddContainer (c )
104+ continue
105+ }
106+
107+ clusterNetworks := w .cluster .Networks ().Uniq ()
108+ for networkName , endpoint := range c .NetworkSettings .Networks {
109+ net := clusterNetworks .Get (endpoint .NetworkID )
110+ if net != nil && net .Scope == "global" {
111+ // record the network; it should be reconstructed on the new container
112+ globalNetworks [networkName ] = endpoint
113+ ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
114+ defer cancel ()
115+ err = randomEngine .apiClient .NetworkDisconnect (ctx , networkName , name , true )
116+ if err != nil {
117+ // do not abort here as this endpoint might have been removed before
118+ log .Warnf ("Failed to remove network endpoint from old container %s: %v" , name , err )
119+ }
120+ }
121+ }
122+ }
79123
124+ newContainer , err := w .cluster .CreateContainer (c .Config , c .Info .Name , nil )
80125 if err != nil {
81126 log .Errorf ("Failed to reschedule container %s: %v" , c .ID , err )
82127 // add the container back, so we can retry later
83128 c .Engine .AddContainer (c )
84- } else {
85- log .Infof ("Rescheduled container %s from %s to %s as %s" , c .ID , c .Engine .Name , newContainer .Engine .Name , newContainer .ID )
86- if c .Info .State .Running {
87- log .Infof ("Container %s was running, starting container %s" , c .ID , newContainer .ID )
88- if err := w .cluster .StartContainer (newContainer , nil ); err != nil {
89- log .Errorf ("Failed to start rescheduled container %s: %v" , newContainer .ID , err )
90- }
129+ continue
130+ }
131+
132+ // Docker create command cannot create a container with multiple networks
133+ // see https://github.com/docker/docker/issues/17750
134+ // Add the global networks one by one
135+ for networkName , endpoint := range globalNetworks {
136+ ctx , cancel := context .WithTimeout (context .Background (), 10 * time .Second )
137+ defer cancel ()
138+ err = newContainer .Engine .apiClient .NetworkConnect (ctx , networkName , name , endpoint )
139+ if err != nil {
140+ log .Warnf ("Failed to connect network %s to container %s: %v" , networkName , name , err )
91141 }
92142 }
93- }
94143
144+ log .Infof ("Rescheduled container %s from %s to %s as %s" , c .ID , c .Engine .Name , newContainer .Engine .Name , newContainer .ID )
145+ if c .Info .State .Running {
146+ log .Infof ("Container %s was running, starting container %s" , c .ID , newContainer .ID )
147+ if err := w .cluster .StartContainer (newContainer , nil ); err != nil {
148+ log .Errorf ("Failed to start rescheduled container %s: %v" , newContainer .ID , err )
149+ }
150+ }
151+ }
95152}
96153
97154// NewWatchdog creates a new watchdog
0 commit comments