8000 Fix 'failed to get network during CreateEndpoint' by xinfengliu · Pull Request #41011 · moby/moby · GitHub
[go: up one dir, main page]

Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions daemon/cluster/executor/container/attachment.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,7 @@ func (nc *networkAttacherController) Terminate(ctx context.Context) error {
}

func (nc *networkAttacherController) Remove(ctx context.Context) error {
// Try removing the network referenced in this task in case this
// task is the last one referencing it
return nc.adapter.removeNetworks(ctx)
return nil
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks scary; shouldn't these be removed, or is the code in tryDetachContainerFromClusterNetwork meant to replace this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is the code tryDetachContainerFromClusterNetwork meant to replace this. If removeNetworks is in controller's method, then it is asynchronized with container-start, and will cause test case in docker/for-linux#888 (comment) to fail.

Another way may be adding a delay in controller's Start(), but it is not reliable.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

instead of doing this, what are your thoughts on applying a retry logic below in Start ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@arkodg I'm afraid we are not able to do retry logic in Start, nc.adapter.networkAttach(ctx) in Start just sends a message to trigger container starting to continue.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

trying to understand why

// Retry network attach again if we failed to
won't help now that you have fixed the error codes with moby/libnetwork#2554

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@arkodg
See:

n, config, err := daemon.findAndAttachNetwork(container, idOrName, endpointConfig)
if err != nil {
return err
}
if n == nil {
return nil
}
var operIPAM bool
if config != nil {
if epConfig, ok := config.EndpointsConfig[n.Name()]; ok {
if endpointConfig.IPAMConfig == nil ||
(endpointConfig.IPAMConfig.IPv4Address == "" &&
endpointConfig.IPAMConfig.IPv6Address == "" &&
len(endpointConfig.IPAMConfig.LinkLocalIPs) == 0) {
operIPAM = true
}
// copy IPAMConfig and NetworkID from epConfig via AttachNetwork
endpointConfig.IPAMConfig = epConfig.IPAMConfig
endpointConfig.NetworkID = epConfig.NetworkID
}
}
if err := daemon.updateNetworkConfig(container, n, endpointConfig, updateSettings); err != nil {
return err
}
controller := daemon.netController
sb := daemon.getNetworkSandbox(container)
createOptions, err := buildCreateEndpointOptions(container, n, endpointConfig, sb, daemon.configStore.DNS)
if err != nil {
return err
}
endpointName := strings.TrimPrefix(container.Name, "/")
ep, err := n.CreateEndpoint(endpointName, createOptions...)

The original problem occurs at L784, the retry logic inside L748 does not help.

}

func (nc *networkAttacherController) Close() error {
Expand Down
1 change: 1 addition & 0 deletions daemon/cluster/executor/container/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ func (r *controller) Start(ctx context.Context) error {
// Retry network creation again if we
// failed because some of the networks
// were not found.
log.G(ctx).WithError(lnErr).Debugf("Network not found when starting container %s, will recreate the networks", r.adapter.container.name())
if err := r.adapter.createNetworks(ctx); err != nil {
return err
}
Expand Down
4 changes: 4 additions & 0 deletions daemon/container_operations.go
Original file line number Diff line number Diff line change
Expand Up @@ -940,12 +940,16 @@ func (daemon *Daemon) disconnectFromNetwork(container *container.Container, n li

func (daemon *Daemon) tryDetachContainerFromClusterNetwork(network libnetwork.Network, container *container.Container) {
if daemon.clusterProvider != nil && network.Info().Dynamic() && !container.Managed {
// For container created from `docker run --net <attachable-overlay-network>`
if err := daemon.clusterProvider.DetachNetwork(network.Name(), container.ID); err != nil {
logrus.Warnf("error detaching from network %s: %v", network.Name(), err)
if err := daemon.clusterProvider.DetachNetwork(network.ID(), container.ID); err != nil {
logrus.Warnf("error detaching from network %s: %v", network.ID(), err)
}
}
if err := daemon.DeleteManagedNetwork(network.ID()); err != nil {
logrus.WithError(err).Warnf("error deleting network %s", network.ID())
}
}
attributes := map[string]string{
"container": container.ID,
Expand Down
33 changes: 33 additions & 0 deletions integration/service/network_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package service // import "github.com/docker/docker/integration/service"
import (
"context"
"testing"
"time"

"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/network"
Expand Down Expand Up @@ -115,3 +116,35 @@ func TestDockerNetworkReConnect(t *testing.T) {
assert.NilError(t, err)
assert.Check(t, is.DeepEqual(n1, n2))
}

func TestDockerRestartWithAttachbleNetwork(t *testing.T) {
skip.If(t, testEnv.DaemonInfo.OSType == "windows")
defer setupTest(t)()
d := swarm.NewSwarm(t, testEnv)
defer d.Stop(t)
client := d.NewClientT(t)
defer client.Close()
ctx := context.Background()

name := t.Name() + "dummyNet"
net.CreateNoError(ctx, t, client, name,
net.WithDriver("overlay"),
net.WithAttachable(),
)

c1 := container.Create(ctx, t, client, func(c *container.TestContainerConfig) {
c.NetworkingConfig = &network.NetworkingConfig{
EndpointsConfig: map[string]*network.EndpointSettings{
name: {},
},
}
})

err := client.ContainerStart(ctx, c1, types.ContainerStartOptions{})
assert.NilError(t, err)

var timeout time.Duration = 20 * time.Second
err = client.ContainerRestart(ctx, c1, &timeout)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does this work with a smaller restart value , default in compose and cli is 10s

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@arkodg Stopping busybox container need at least 10 seconds since busybox does not have signal handler for SIGTERM. So I feel the timeout here needs to be longer than 10 seconds.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah how about 11 :trollface:

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm afraid setting timeout to 11s will cause CI flaky.

assert.NilError(t, err)

}
0