1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00

Handle IP reuse in overlay

In case of IP reuse locally there was a race condition
that was leaving the overlay namespace with wrong configuration
causing connectivity issues.
This commit introduces the use of setMatrix to handle the transient
state and make sure that the proper configuration is maintained

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
This commit is contained in:
Flavio Crisciani 2017-09-05 10:43:20 -07:00
parent ef2e91707d
commit 711d033757
7 changed files with 175 additions and 130 deletions

View file

@ -200,7 +200,7 @@ func (d *driver) EventNotify(etype driverapi.EventType, nid, tableName, key stri
}
if etype == driverapi.Delete {
d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep)
d.peerDelete(nid, eid, addr.IP, addr.Mask, mac, vtep, false)
return
}
@ -234,9 +234,11 @@ func (d *driver) Leave(nid, eid string) error {
n.leaveSandbox()
if err := d.checkEncryption(nid, nil, 0, true, false); err != nil {
logrus.Warn(err)
}
// if err := d.checkEncryption(nid, nil, 0, true, false); err != nil {
// logrus.Warn(err)
// }
d.peerDelete(nid, eid, ep.addr.IP, ep.addr.Mask, ep.mac, net.ParseIP(d.advertiseAddress), true)
return nil
}

View file

@ -760,12 +760,11 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) {
continue
}
logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac)
if neigh.State&(netlink.NUD_STALE|netlink.NUD_INCOMPLETE) == 0 {
continue
}
logrus.Debugf("miss notification: dest IP %v, dest MAC %v", ip, mac)
if n.driver.isSerfAlive() {
mac, IPmask, vtep, err := n.driver.resolvePeer(n.id, ip)
if err != nil {
@ -775,43 +774,17 @@ func (n *network) watchMiss(nlSock *nl.NetlinkSocket) {
n.driver.peerAdd(n.id, "dummy", ip, IPmask, mac, vtep, l2Miss, l3Miss, false)
} else {
// If the gc_thresh values are lower kernel might knock off the neighor entries.
// When we get a L3 miss check if its a valid peer and reprogram the neighbor
// entry again. Rate limit it to once attempt every 500ms, just in case a faulty
// container sends a flood of packets to invalid peers
if !l3Miss {
continue
}
// This case can happen only for local entries, from the documentation (http://man7.org/linux/man-pages/man7/arp.7.html):
// Entries which are marked as permanent are never deleted by the garbage-collector.
if time.Since(t) > 500*time.Millisecond {
t = time.Now()
n.programNeighbor(ip)
logrus.Warnf("miss notification for peer:%+v l3Miss:%t l2Miss:%t, if the problem persist check the gc_thresholds", neigh, l3Miss, l2Miss)
}
}
}
}
}
func (n *network) programNeighbor(ip net.IP) {
peerMac, _, _, err := n.driver.peerDbSearch(n.id, ip)
if err != nil {
logrus.Errorf("Reprogramming on L3 miss failed for %s, no peer entry", ip)
return
}
s := n.getSubnetforIPAddr(ip)
if s == nil {
logrus.Errorf("Reprogramming on L3 miss failed for %s, not a valid subnet", ip)
return
}
sbox := n.sandbox()
if sbox == nil {
logrus.Errorf("Reprogramming on L3 miss failed for %s, overlay sandbox missing", ip)
return
}
if err := sbox.AddNeighbor(ip, peerMac, true, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
logrus.Errorf("Reprogramming on L3 miss failed for %s: %v", ip, err)
return
}
}
func (d *driver) addNetwork(n *network) {
d.Lock()
d.networks[n.id] = n

View file

@ -122,7 +122,7 @@ func (d *driver) processEvent(u serf.UserEvent) {
case "join":
d.peerAdd(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false, false, false)
case "leave":
d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr))
d.peerDelete(nid, eid, net.ParseIP(ipStr), net.IPMask(net.ParseIP(maskStr).To4()), mac, net.ParseIP(vtepStr), false)
}
}
@ -135,13 +135,13 @@ func (d *driver) processQuery(q *serf.Query) {
fmt.Printf("Failed to scan query payload string: %v\n", err)
}
peerMac, peerIPMask, vtep, err := d.peerDbSearch(nid, net.ParseIP(ipStr))
pKey, pEntry, err := d.peerDbSearch(nid, net.ParseIP(ipStr))
if err != nil {
return
}
logrus.Debugf("Sending peer query resp mac %s, mask %s, vtep %s", peerMac, net.IP(peerIPMask), vtep)
q.Respond([]byte(fmt.Sprintf("%s %s %s", peerMac.String(), net.IP(peerIPMask).String(), vtep.String())))
logrus.Debugf("Sending peer query resp mac %v, mask %s, vtep %s", pKey.peerMac, net.IP(pEntry.peerIPMask).String(), pEntry.vtep)
q.Respond([]byte(fmt.Sprintf("%s %s %s", pKey.peerMac.String(), net.IP(pEntry.peerIPMask).String(), pEntry.vtep.String())))
}
func (d *driver) resolvePeer(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) {

View file

@ -262,7 +262,7 @@ func (d *driver) nodeJoin(advertiseAddress, bindAddress string, self bool) {
d.Unlock()
// If containers are already running on this network update the
// advertiseaddress in the peerDB
// advertise address in the peerDB
d.localJoinOnce.Do(func() {
d.peerDBUpdateSelf()
})

View file

@ -24,4 +24,4 @@ message PeerRecord {
// which this container is running and can be reached by
// building a tunnel to that host IP.
string tunnel_endpoint_ip = 3 [(gogoproto.customname) = "TunnelEndpointIP"];
}
}

View file

@ -8,6 +8,7 @@ import (
"syscall"
"github.com/docker/libnetwork/common"
"github.com/docker/libnetwork/osl"
"github.com/sirupsen/logrus"
)
@ -22,12 +23,42 @@ type peerEntry struct {
eid string
vtep net.IP
peerIPMask net.IPMask
inSandbox bool
isLocal bool
}
func (p *peerEntry) MarshalDB() peerEntryDB {
ones, bits := p.peerIPMask.Size()
return peerEntryDB{
eid: p.eid,
vtep: p.vtep.String(),
isLocal: p.isLocal,
peerIPMaskOnes: ones,
peerIPMaskBits: bits,
}
}
// This the structure saved into the set (SetMatrix), due to the implementation of it
// the value inserted in the set has to be Hashable so the []byte had to be converted into
// strings
type peerEntryDB struct {
eid string
vtep string
peerIPMaskOnes int
peerIPMaskBits int
isLocal bool
}
func (p *peerEntryDB) UnMarshalDB() peerEntry {
return peerEntry{
eid: p.eid,
vtep: net.ParseIP(p.vtep),
peerIPMask: net.CIDRMask(p.peerIPMaskOnes, p.peerIPMaskBits),
isLocal: p.isLocal,
}
}
type peerMap struct {
mp map[string]peerEntry
mp common.SetMatrix
sync.Mutex
}
@ -54,11 +85,7 @@ func (pKey *peerKey) Scan(state fmt.ScanState, verb rune) error {
}
pKey.peerMac, err = net.ParseMAC(string(macB))
if err != nil {
return err
}
return nil
return err
}
func (d *driver) peerDbWalk(f func(string, *peerKey, *peerEntry) bool) error {
@ -87,10 +114,13 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool
}
mp := map[string]peerEntry{}
pMap.Lock()
for pKeyStr, pEntry := range pMap.mp {
mp[pKeyStr] = pEntry
for _, pKeyStr := range pMap.mp.Keys() {
entryDBList, ok := pMap.mp.Get(pKeyStr)
if ok {
peerEntryDB := entryDBList[0].(peerEntryDB)
mp[pKeyStr] = peerEntryDB.UnMarshalDB()
}
}
pMap.Unlock()
@ -107,45 +137,38 @@ func (d *driver) peerDbNetworkWalk(nid string, f func(*peerKey, *peerEntry) bool
return nil
}
func (d *driver) peerDbSearch(nid string, peerIP net.IP) (net.HardwareAddr, net.IPMask, net.IP, error) {
var (
peerMac net.HardwareAddr
vtep net.IP
peerIPMask net.IPMask
found bool
)
func (d *driver) peerDbSearch(nid string, peerIP net.IP) (*peerKey, *peerEntry, error) {
var pKeyMatched *peerKey
var pEntryMatched *peerEntry
err := d.peerDbNetworkWalk(nid, func(pKey *peerKey, pEntry *peerEntry) bool {
if pKey.peerIP.Equal(peerIP) {
peerMac = pKey.peerMac
peerIPMask = pEntry.peerIPMask
vtep = pEntry.vtep
found = true
return found
pKeyMatched = pKey
pEntryMatched = pEntry
return true
}
return found
return false
})
if err != nil {
return nil, nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
return nil, nil, fmt.Errorf("peerdb search for peer ip %q failed: %v", peerIP, err)
}
if !found {
return nil, nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
if pKeyMatched == nil || pEntryMatched == nil {
return nil, nil, fmt.Errorf("peer ip %q not found in peerdb", peerIP)
}
return peerMac, peerIPMask, vtep, nil
return pKeyMatched, pEntryMatched, nil
}
func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
peerMac net.HardwareAddr, vtep net.IP, isLocal bool) {
peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
d.peerDb.Lock()
pMap, ok := d.peerDb.mp[nid]
if !ok {
d.peerDb.mp[nid] = &peerMap{
mp: make(map[string]peerEntry),
mp: common.NewSetMatrix(),
}
pMap = d.peerDb.mp[nid]
@ -165,18 +188,24 @@ func (d *driver) peerDbAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
}
pMap.Lock()
pMap.mp[pKey.String()] = pEntry
pMap.Unlock()
defer pMap.Unlock()
b, i := pMap.mp.Insert(pKey.String(), pEntry.MarshalDB())
if i != 1 {
// Transient case, there is more than one endpoint that is using the same IP,MAC pair
s, _ := pMap.mp.String(pKey.String())
logrus.Warnf("peerDbAdd transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
}
return b, i
}
func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
peerMac net.HardwareAddr, vtep net.IP) peerEntry {
peerMac net.HardwareAddr, vtep net.IP, isLocal bool) (bool, int) {
d.peerDb.Lock()
pMap, ok := d.peerDb.mp[nid]
if !ok {
d.peerDb.Unlock()
return peerEntry{}
return false, 0
}
d.peerDb.Unlock()
@ -185,22 +214,22 @@ func (d *driver) peerDbDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPM
peerMac: peerMac,
}
pMap.Lock()
pEntry, ok := pMap.mp[pKey.String()]
if ok {
// Mismatched endpoint ID(possibly outdated). Do not
// delete peerdb
if pEntry.eid != eid {
pMap.Unlock()
return pEntry
}
pEntry := peerEntry{
eid: eid,
vtep: vtep,
peerIPMask: peerIPMask,
isLocal: isLocal,
}
delete(pMap.mp, pKey.String())
pMap.Unlock()
return pEntry
pMap.Lock()
defer pMap.Unlock()
b, i := pMap.mp.Remove(pKey.String(), pEntry.MarshalDB())
if i != 0 {
// Transient case, there is more than one endpoint that is using the same IP,MAC pair
s, _ := pMap.mp.String(pKey.String())
logrus.Warnf("peerDbDelete transient condition - Key:%s cardinality:%d db state:%s", pKey.String(), i, s)
}
return b, i
}
// The overlay uses a lazy initialization approach, this means that when a network is created
@ -253,7 +282,7 @@ func (d *driver) peerOpRoutine(ctx context.Context, ch chan *peerOperation) {
case peerOperationADD:
err = d.peerAddOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.l2Miss, op.l3Miss, true, op.localPeer)
case peerOperationDELETE:
err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP)
err = d.peerDeleteOp(op.networkID, op.endpointID, op.peerIP, op.peerIPMask, op.peerMac, op.vtepIP, op.localPeer)
}
if err != nil {
logrus.Warnf("Peer operation failed:%s op:%v", err, op)
@ -303,19 +332,27 @@ func (d *driver) peerAdd(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
}
func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, updateOnlyDB bool) error {
peerMac net.HardwareAddr, vtep net.IP, l2Miss, l3Miss, updateDB, localPeer bool) error {
if err := validateID(nid, eid); err != nil {
return err
}
var dbEntries int
var inserted bool
if updateDB {
d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, false)
if updateOnlyDB {
return nil
inserted, dbEntries = d.peerDbAdd(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
if !inserted {
logrus.Warnf("Entry already present in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
nid, eid, peerIP, peerMac, localPeer, vtep)
}
}
// Local peers do not need any further configuration
if localPeer {
return nil
}
n := d.network(nid)
if n == nil {
return nil
@ -353,20 +390,26 @@ func (d *driver) peerAddOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask
// Add neighbor entry for the peer IP
if err := sbox.AddNeighbor(peerIP, peerMac, l3Miss, sbox.NeighborOptions().LinkName(s.vxlanName)); err != nil {
return fmt.Errorf("could not add neighbor entry into the sandbox: %v", err)
if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 1 {
// We are in the transient case so only the first configuration is programmed into the kernel
// Upon deletion if the active configuration is deleted the next one from the database will be restored
// Note we are skipping also the next configuration
return nil
}
return fmt.Errorf("could not add neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
}
// Add fdb entry to the bridge for the peer mac
if err := sbox.AddNeighbor(vtep, peerMac, l2Miss, sbox.NeighborOptions().LinkName(s.vxlanName),
sbox.NeighborOptions().Family(syscall.AF_BRIDGE)); err != nil {
return fmt.Errorf("could not add fdb entry into the sandbox: %v", err)
return fmt.Errorf("could not add fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
}
return nil
}
func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
peerMac net.HardwareAddr, vtep net.IP) {
peerMac net.HardwareAddr, vtep net.IP, localPeer bool) {
callerName := common.CallerName(1)
d.peerOpCh <- &peerOperation{
opType: peerOperationDELETE,
@ -377,17 +420,22 @@ func (d *driver) peerDelete(nid, eid string, peerIP net.IP, peerIPMask net.IPMas
peerMac: peerMac,
vtepIP: vtep,
callerName: callerName,
localPeer: localPeer,
}
}
func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPMask,
peerMac net.HardwareAddr, vtep net.IP) error {
peerMac net.HardwareAddr, vtep net.IP, localPeer bool) error {
if err := validateID(nid, eid); err != nil {
return err
}
pEntry := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep)
deleted, dbEntries := d.peerDbDelete(nid, eid, peerIP, peerIPMask, peerMac, vtep, localPeer)
if !deleted {
logrus.Warnf("Entry was not in db: nid:%s eid:%s peerIP:%v peerMac:%v isLocal:%t vtep:%v",
nid, eid, peerIP, peerMac, localPeer, vtep)
}
n := d.network(nid)
if n == nil {
@ -399,31 +447,38 @@ func (d *driver) peerDeleteOp(nid, eid string, peerIP net.IP, peerIPMask net.IPM
return nil
}
// Delete fdb entry to the bridge for the peer mac only if the
// entry existed in local peerdb. If it is a stale delete
// request, still call DeleteNeighbor but only to cleanup any
// leftover sandbox neighbor cache and not actually delete the
// kernel state.
if (eid == pEntry.eid && vtep.Equal(pEntry.vtep)) ||
(eid != pEntry.eid && !vtep.Equal(pEntry.vtep)) {
if err := sbox.DeleteNeighbor(vtep, peerMac,
eid == pEntry.eid && vtep.Equal(pEntry.vtep)); err != nil {
return fmt.Errorf("could not delete fdb entry into the sandbox: %v", err)
}
}
// Delete neighbor entry for the peer IP
if eid == pEntry.eid {
if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil {
return fmt.Errorf("could not delete neighbor entry into the sandbox: %v", err)
}
}
if err := d.checkEncryption(nid, vtep, 0, false, false); err != nil {
logrus.Warn(err)
}
return nil
// Remove fdb entry to the bridge for the peer mac
if err := sbox.DeleteNeighbor(vtep, peerMac, true); err != nil {
if _, ok := err.(osl.NeighborSearchError); ok && dbEntries > 0 {
// We fall in here if there is a transient state and if the neighbor that is being deleted
// was never been configured into the kernel (we allow only 1 configuration at the time per <ip,mac> mapping)
return nil
}
return fmt.Errorf("could not delete fdb entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
}
// Delete neighbor entry for the peer IP
if err := sbox.DeleteNeighbor(peerIP, peerMac, true); err != nil {
return fmt.Errorf("could not delete neighbor entry for nid:%s eid:%s into the sandbox:%v", nid, eid, err)
}
if dbEntries == 0 {
return nil
}
// If there is still an entry into the database and the deletion went through without errors means that there is now no
// configuration active in the kernel.
// Restore one configuration for the <ip,mac> directly from the database, note that is guaranteed that there is one
peerKey, peerEntry, err := d.peerDbSearch(nid, peerIP)
if err != nil {
logrus.Errorf("peerDeleteOp unable to restore a configuration for nid:%s ip:%v mac:%v err:%s", nid, peerIP, peerMac, err)
return err
}
return d.peerAddOp(nid, peerEntry.eid, peerIP, peerEntry.peerIPMask, peerKey.peerMac, peerEntry.vtep, false, false, false, peerEntry.isLocal)
}
func (d *driver) pushLocalDb() {

View file

@ -9,6 +9,17 @@ import (
"github.com/vishvananda/netlink"
)
// NeighborSearchError indicates that the neighbor is already present
type NeighborSearchError struct {
ip net.IP
mac net.HardwareAddr
present bool
}
func (n NeighborSearchError) Error() string {
return fmt.Sprintf("Search neighbor failed for IP %v, mac %v, present in db:%t", n.ip, n.mac, n.present)
}
// NeighOption is a function option type to set interface options
type NeighOption func(nh *neigh)
@ -41,7 +52,7 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr,
nh := n.findNeighbor(dstIP, dstMac)
if nh == nil {
return fmt.Errorf("could not find the neighbor entry to delete")
return NeighborSearchError{dstIP, dstMac, false}
}
if osDelete {
@ -103,26 +114,27 @@ func (n *networkNamespace) DeleteNeighbor(dstIP net.IP, dstMac net.HardwareAddr,
}
}
n.Unlock()
logrus.Debugf("Neighbor entry deleted for IP %v, mac %v", dstIP, dstMac)
logrus.Debugf("Neighbor entry deleted for IP %v, mac %v osDelete:%t", dstIP, dstMac, osDelete)
return nil
}
func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, force bool, options ...NeighOption) error {
var (
iface netlink.Link
err error
iface netlink.Link
err error
neighborAlreadyPresent bool
)
// If the namespace already has the neighbor entry but the AddNeighbor is called
// because of a miss notification (force flag) program the kernel anyway.
nh := n.findNeighbor(dstIP, dstMac)
if nh != nil {
neighborAlreadyPresent = true
logrus.Warnf("Neighbor entry already present for IP %v, mac %v neighbor:%+v forceUpdate:%t", dstIP, dstMac, nh, force)
if !force {
logrus.Warnf("Neighbor entry already present for IP %v, mac %v", dstIP, dstMac)
return nil
return NeighborSearchError{dstIP, dstMac, true}
}
logrus.Warnf("Force kernel update, Neighbor entry already present for IP %v, mac %v", dstIP, dstMac)
}
nh = &neigh{
@ -146,8 +158,7 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo
if nh.linkDst != "" {
iface, err = nlh.LinkByName(nh.linkDst)
if err != nil {
return fmt.Errorf("could not find interface with destination name %s: %v",
nh.linkDst, err)
return fmt.Errorf("could not find interface with destination name %s: %v", nh.linkDst, err)
}
}
@ -167,7 +178,11 @@ func (n *networkNamespace) AddNeighbor(dstIP net.IP, dstMac net.HardwareAddr, fo
}
if err := nlh.NeighSet(nlnh); err != nil {
return fmt.Errorf("could not add neighbor entry: %v", err)
return fmt.Errorf("could not add neighbor entry:%+v error:%v", nlnh, err)
}
if neighborAlreadyPresent {
return nil
}
n.Lock()