diff --git a/libnetwork/drivers/overlay/joinleave.go b/libnetwork/drivers/overlay/joinleave.go index a07838b760..11edf43765 100644 --- a/libnetwork/drivers/overlay/joinleave.go +++ b/libnetwork/drivers/overlay/joinleave.go @@ -200,7 +200,7 @@ func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key stri } if etype == driverapi.Delete { - d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep) + d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep, false) return } @@ -234,9 +234,11 @@ func (d *driver) Leave(nid, eid string) error { n.leaveSandbox() - if err := d.checkEncryption(nid, nil, 0, true, false); err != nil { - logrus.Warn(err) - } + // if err := d.checkEncryption(nid, nil, 0, true, false); err != nil { + // logrus.Warn(err) + // } + + d.peerDelete(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), true) return nil } diff --git a/libnetwork/drivers/overlay/ov_network.go b/libnetwork/drivers/overlay/ov_network.go index 0e9ca77866..419f22d03d 100644 --- a/libnetwork/drivers/overlay/ov_network.go +++ b/libnetwork/drivers/overlay/ov_network.go @@ -760,12 +760,11 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { continue } - logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac) - if neigh.State&(netlink.NUD_STALE|netlink.NUD_INCOMPLETE) == 0 { continue } + logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac) if n.driver.isSerfAlive() { mac, IPmask, vtep, err := n.driver.resolvePeer(n.id, ip) if err != nil { @@ -775,43 +774,17 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) { n.driver.peerAdd(n.id, "dummy", ip, IPmask, mac, vtep, l2Miss, l3Miss, false) } else { // If the gc_thresh values are lower kernel might knock off the neighor entries. - // When we get a L3 miss check if its a valid peer and reprogram the neighbor - // entry again. Rate limit it to once attempt every 500ms, just in case a faulty - // container sends a flood of packets to invalid peers - if !l3Miss { - continue - } + // This case can happen only for local entries, from the documentation (http://man7.org/linux/man-pages/man7/arp.7.html): + // Entries which are marked as permanent are never deleted by the garbage-collector. if time.Since(t) > 500*time.Millisecond { t = time.Now() - n.programNeighbor(ip) + logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresholds", neigh, l3Miss, l2Miss) } } } } } -func (n *network) programNeighbor(ip net.IP) { - peerMac, _, _, err := n.driver.peerDbSearch(n.id, ip) - if err != nil { - logrus.Errorf("Reprogramming on L3 miss failed for %s, no peer entry", ip) - return - } - s := n.getSubnetforIPAddr(ip) - if s == nil { - logrus.Errorf("Reprogramming on L3 miss failed for %s, not a valid subnet", ip) - return - } - sbox := n.sandbox() - if sbox == nil { - logrus.Errorf("Reprogramming on L3 miss failed for %s, overlay sandbox missing", ip) - return - } - if err := sbox.AddNeighbor(ip, peerMac, true, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil { - logrus.Errorf("Reprogramming on L3 miss failed for %s: %v", ip, err) - return - } -} - func (d *driver) addNetwork(n *network) { d.Lock() d.networks[n.id] = n diff --git a/libnetwork/drivers/overlay/ov_serf.go b/libnetwork/drivers/overlay/ov_serf.go index 6e034ada46..f644799afd 100644 --- a/libnetwork/drivers/overlay/ov_serf.go +++ b/libnetwork/drivers/overlay/ov_serf.go @@ -122,7 +122,7 @@ func (d *driver) processEvent(u serf.UserEvent) { case "join": d.peerAdd(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false, false, false) case "leave": - d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr)) + d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false) } } @@ -135,13 +135,13 @@ func (d *driver) processQuery(q *serf.Query) { fmt.Printf("Failed to scan query payload string: %v\n", err) } - peerMac, peerIPMask, vtep, err := d.peerDbSearch(nid, net.ParseIP(ipStr)) + pKey, pEntry, err := d.peerDbSearch(nid, net.ParseIP(ipStr)) if err != nil { return } - logrus.Debugf("Sending peer query resp mac %s, mask %s, vtep %s", peerMac, net.IP(peerIPMask), vtep) - q.Respond([]byte(fmt.Sprintf("%s %s %s", peerMac.String(), net.IP(peerIPMask).String(), vtep.String()))) + logrus.Debugf("Sending peer query resp mac %v, mask %s, vtep %s", pKey.peerMac, net.IP(pEntry.peerIPMask).String(), pEntry.vtep) + q.Respond([]byte(fmt.Sprintf("%s %s %s", pKey.peerMac.String(), net.IP(pEntry.peerIPMask).String(), pEntry.vtep.String()))) } func (d *driver) resolvePeer(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) { diff --git a/libnetwork/drivers/overlay/overlay.go b/libnetwork/drivers/overlay/overlay.go index 2bae0823e1..f029c5cce4 100644 --- a/libnetwork/drivers/overlay/overlay.go +++ b/libnetwork/drivers/overlay/overlay.go @@ -262,7 +262,7 @@ func (d *driver) nodeJoin(advertiseAddress, bindAddress string, self bool) { d.Unlock() // If containers are already running on this network update the - // advertiseaddress in the peerDB + // advertise address in the peerDB d.localJoinOnce.Do(func() { d.peerDBUpdateSelf() }) diff --git a/libnetwork/drivers/overlay/overlay.proto b/libnetwork/drivers/overlay/overlay.proto index 45b8c9de7e..3133386e03 100644 --- a/libnetwork/drivers/overlay/overlay.proto +++ b/libnetwork/drivers/overlay/overlay.proto @@ -24,4 +24,4 @@ message PeerRecord { // which this container is running and can be reached by // building a tunnel to that host IP. string tunnel_endpoint_ip = 3 [(gogoproto.customname) = "TunnelEndpointIP"]; -} \ No newline at end of file +} diff --git a/libnetwork/drivers/overlay/peerdb.go b/libnetwork/drivers/overlay/peerdb.go index f953e3c872..7779f3fd59 100644 --- a/libnetwork/drivers/overlay/peerdb.go +++ b/libnetwork/drivers/overlay/peerdb.go @@ -8,6 +8,7 @@ import ( "syscall" "github.com/docker/libnetwork/common" + "github.com/docker/libnetwork/osl" "github.com/sirupsen/logrus" ) @@ -22,12 +23,42 @@ type peerEntry struct { eid string vtep net.IP peerIPMask net.IPMask - inSandbox bool isLocal bool } +func (p *peerEntry) MarshalDB() peerEntryDB { + ones, bits := p.peerIPMask.Size() + return peerEntryDB{ + eid: p.eid, + vtep: p.vtep.String(), + isLocal: p.isLocal, + peerIPMaskOnes: ones, + peerIPMaskBits: bits, + } +} + +// This the structure saved into the set (SetMatrix), due to the implementation of it +// the value inserted in the set has to be Hashable so the []byte had to be converted into +// strings +type peerEntryDB struct { + eid string + vtep string + peerIPMaskOnes int + peerIPMaskBits int + isLocal bool +} + +func (p *peerEntryDB) UnMarshalDB() peerEntry { + return peerEntry{ + eid: p.eid, + vtep: net.ParseIP(p.vtep), + peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits), + isLocal: p.isLocal, + } +} + type peerMap struct { - mp map[string]peerEntry + mp common.SetMatrix sync.Mutex } @@ -54,11 +85,7 @@ func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error { } pKey.peerMac, err = net.ParseMAC(string(macB)) - if err != nil { - return err - } - - return nil + return err } func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error { @@ -87,10 +114,13 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool } mp := map[string]peerEntry{} - pMap.Lock() - for pKeyStr, pEntry := range pMap.mp { - mp[pKeyStr] = pEntry + for _, pKeyStr := range pMap.mp.Keys() { + entryDBList, ok := pMap.mp.Get(pKeyStr) + if ok { + peerEntryDB := entryDBList[0].(peerEntryDB) + mp[pKeyStr] = peerEntryDB.UnMarshalDB() + } } pMap.Unlock() @@ -107,45 +137,38 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool return nil } -func (d *driver) peerDbSearch(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) { - var ( - peerMac net.HardwareAddr - vtep net.IP - peerIPMask net.IPMask - found bool - ) - +func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) { + var pKeyMatched *peerKey + var pEntryMatched *peerEntry err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool { if pKey.peerIP.Equal(peerIP) { - peerMac = pKey.peerMac - peerIPMask = pEntry.peerIPMask - vtep = pEntry.vtep - found = true - return found + pKeyMatched = pKey + pEntryMatched = pEntry + return true } - return found + return false }) if err != nil { - return nil, nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err) + return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err) } - if !found { - return nil, nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP) + if pKeyMatched == nil || pEntryMatched == nil { + return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP) } - return peerMac, peerIPMask, vtep, nil + return pKeyMatched, pEntryMatched, nil } func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP, isLocal bool) { + peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { d.peerDb.Lock() pMap, ok := d.peerDb.mp[nid] if !ok { d.peerDb.mp[nid] = &peerMap{ - mp: make(map[string]peerEntry), + mp: common.NewSetMatrix(), } pMap = d.peerDb.mp[nid] @@ -165,18 +188,24 @@ func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask } pMap.Lock() - pMap.mp[pKey.String()] = pEntry - pMap.Unlock() + defer pMap.Unlock() + b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB()) + if i != 1 { + // Transient case, there is more than one endpoint that is using the same IP,MAC pair + s, _ := pMap.mp.String(pKey.String()) + logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) + } + return b, i } func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP) peerEntry { + peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) { d.peerDb.Lock() pMap, ok := d.peerDb.mp[nid] if !ok { d.peerDb.Unlock() - return peerEntry{} + return false, 0 } d.peerDb.Unlock() @@ -185,22 +214,22 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPM peerMac: peerMac, } - pMap.Lock() - - pEntry, ok := pMap.mp[pKey.String()] - if ok { - // Mismatched endpoint ID(possibly outdated). Do not - // delete peerdb - if pEntry.eid != eid { - pMap.Unlock() - return pEntry - } + pEntry := peerEntry{ + eid: eid, + vtep: vtep, + peerIPMask: peerIPMask, + isLocal: isLocal, } - delete(pMap.mp, pKey.String()) - pMap.Unlock() - - return pEntry + pMap.Lock() + defer pMap.Unlock() + b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB()) + if i != 0 { + // Transient case, there is more than one endpoint that is using the same IP,MAC pair + s, _ := pMap.mp.String(pKey.String()) + logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s) + } + return b, i } // The overlay uses a lazy initialization approach, this means that when a network is created @@ -253,7 +282,7 @@ func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) { case peerOperationADD: err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer) case peerOperationDELETE: - err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP) + err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer) } if err != nil { logrus.Warnf("Peer operation failed:%s op:%v", err, op) @@ -303,19 +332,27 @@ func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, } func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, updateOnlyDB bool) error { + peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error { if err := validateID(nid, eid); err != nil { return err } + var dbEntries int + var inserted bool if updateDB { - d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, false) - if updateOnlyDB { - return nil + inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) + if !inserted { + logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", + nid, eid, peerIP, peerMac, localPeer, vtep) } } + // Local peers do not need any further configuration + if localPeer { + return nil + } + n := d.network(nid) if n == nil { return nil @@ -353,20 +390,26 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask // Add neighbor entry for the peer IP if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil { - return fmt.Errorf("could not add neighbor entry into the sandbox: %v", err) + if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 { + // We are in the transient case so only the first configuration is programmed into the kernel + // Upon deletion if the active configuration is deleted the next one from the database will be restored + // Note we are skipping also the next configuration + return nil + } + return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) } // Add fdb entry to the bridge for the peer mac if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName), sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil { - return fmt.Errorf("could not add fdb entry into the sandbox: %v", err) + return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) } return nil } func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP) { + peerMac net.HardwareAddr, vtep net.IP, localPeer bool) { callerName := common.CallerName(1) d.peerOpCh <- &peerOperation{ opType: peerOperationDELETE, @@ -377,17 +420,22 @@ func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMas peerMac: peerMac, vtepIP: vtep, callerName: callerName, + localPeer: localPeer, } } func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask, - peerMac net.HardwareAddr, vtep net.IP) error { + peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error { if err := validateID(nid, eid); err != nil { return err } - pEntry := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep) + deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer) + if !deleted { + logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v", + nid, eid, peerIP, peerMac, localPeer, vtep) + } n := d.network(nid) if n == nil { @@ -399,31 +447,38 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPM return nil } - // Delete fdb entry to the bridge for the peer mac only if the - // entry existed in local peerdb. If it is a stale delete - // request, still call DeleteNeighbor but only to cleanup any - // leftover sandbox neighbor cache and not actually delete the - // kernel state. - if (eid == pEntry.eid && vtep.Equal(pEntry.vtep)) || - (eid != pEntry.eid && !vtep.Equal(pEntry.vtep)) { - if err := sbox.DeleteNeighbor(vtep, peerMac, - eid == pEntry.eid && vtep.Equal(pEntry.vtep)); err != nil { - return fmt.Errorf("could not delete fdb entry into the sandbox: %v", err) - } - } - - // Delete neighbor entry for the peer IP - if eid == pEntry.eid { - if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { - return fmt.Errorf("could not delete neighbor entry into the sandbox: %v", err) - } - } - if err := d.checkEncryption(nid, vtep, 0, false, false); err != nil { logrus.Warn(err) } - return nil + // Remove fdb entry to the bridge for the peer mac + if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil { + if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 { + // We fall in here if there is a transient state and if the neighbor that is being deleted + // was never been configured into the kernel (we allow only 1 configuration at the time per mapping) + return nil + } + return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) + } + + // Delete neighbor entry for the peer IP + if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil { + return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err) + } + + if dbEntries == 0 { + return nil + } + + // If there is still an entry into the database and the deletion went through without errors means that there is now no + // configuration active in the kernel. + // Restore one configuration for the directly from the database, note that is guaranteed that there is one + peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP) + if err != nil { + logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err) + return err + } + return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal) } func (d *driver) pushLocalDb() { diff --git a/libnetwork/osl/neigh_linux.go b/libnetwork/osl/neigh_linux.go index 4e479489fa..fa5ac9af04 100644 --- a/libnetwork/osl/neigh_linux.go +++ b/libnetwork/osl/neigh_linux.go @@ -9,6 +9,17 @@ import ( "github.com/vishvananda/netlink" ) +// NeighborSearchError indicates that the neighbor is already present +type NeighborSearchError struct { + ip net.IP + mac net.HardwareAddr + present bool +} + +func (n NeighborSearchError) Error() string { + return fmt.Sprintf("Search neighbor failed for IP %v, mac %v, present in db:%t", n.ip, n.mac, n.present) +} + // NeighOption is a function option type to set interface options type NeighOption func(nh *neigh) @@ -41,7 +52,7 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr, nh := n.findNeighbor(dstIP, dstMac) if nh == nil { - return fmt.Errorf("could not find the neighbor entry to delete") + return NeighborSearchError{dstIP, dstMac, false} } if osDelete { @@ -103,26 +114,27 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr, } } n.Unlock() - logrus.Debugf("Neighbor entry deleted for IP %v, mac %v", dstIP, dstMac) + logrus.Debugf("Neighbor entry deleted for IP %v, mac %v osDelete:%t", dstIP, dstMac, osDelete) return nil } func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, force bool, options ...NeighOption) error { var ( - iface netlink.Link - err error + iface netlink.Link + err error + neighborAlreadyPresent bool ) // If the namespace already has the neighbor entry but the AddNeighbor is called // because of a miss notification (force flag) program the kernel anyway. nh := n.findNeighbor(dstIP, dstMac) if nh != nil { + neighborAlreadyPresent = true + logrus.Warnf("Neighbor entry already present for IP %v, mac %v neighbor:%+v forceUpdate:%t", dstIP, dstMac, nh, force) if !force { - logrus.Warnf("Neighbor entry already present for IP %v, mac %v", dstIP, dstMac) - return nil + return NeighborSearchError{dstIP, dstMac, true} } - logrus.Warnf("Force kernel update, Neighbor entry already present for IP %v, mac %v", dstIP, dstMac) } nh = &neigh{ @@ -146,8 +158,7 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo if nh.linkDst != "" { iface, err = nlh.LinkByName(nh.linkDst) if err != nil { - return fmt.Errorf("could not find interface with destination name %s: %v", - nh.linkDst, err) + return fmt.Errorf("could not find interface with destination name %s: %v", nh.linkDst, err) } } @@ -167,7 +178,11 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo } if err := nlh.NeighSet(nlnh); err != nil { - return fmt.Errorf("could not add neighbor entry: %v", err) + return fmt.Errorf("could not add neighbor entry:%+v error:%v", nlnh, err) + } + + if neighborAlreadyPresent { + return nil } n.Lock()