From d8143a5e186faf722d2cae703f0a618c37e588ea Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Tue, 5 Jan 2016 16:41:54 -0800 Subject: Implement first pass on rapid dead path detection, and increment version to 1.1.3 (dev) --- node/Peer.cpp | 55 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 14 deletions(-) (limited to 'node/Peer.cpp') diff --git a/node/Peer.cpp b/node/Peer.cpp index 340f0c10..0f72be9f 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -46,8 +46,8 @@ namespace ZeroTier { // Used to send varying values for NAT keepalive static uint32_t _natKeepaliveBuf = 0; -Peer::Peer(const Identity &myIdentity,const Identity &peerIdentity) - throw(std::runtime_error) : +Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Identity &peerIdentity) : + RR(renv), _lastUsed(0), _lastReceive(0), _lastUnicastFrame(0), @@ -72,7 +72,6 @@ Peer::Peer(const Identity &myIdentity,const Identity &peerIdentity) } void Peer::received( - const RuntimeEnvironment *RR, const InetAddress &localAddr, const InetAddress &remoteAddr, unsigned int hops, @@ -199,7 +198,7 @@ void Peer::received( outp.armor(_key,true); RR->node->putPacket(localAddr,remoteAddr,outp.data(),outp.size()); } else { - sendHELLO(RR,localAddr,remoteAddr,now); + sendHELLO(localAddr,remoteAddr,now); } } @@ -214,7 +213,7 @@ void Peer::received( } } -void Peer::sendHELLO(const RuntimeEnvironment *RR,const InetAddress &localAddr,const InetAddress &atAddress,uint64_t now,unsigned int ttl) +void Peer::sendHELLO(const InetAddress &localAddr,const InetAddress &atAddress,uint64_t now,unsigned int ttl) { // _lock not required here since _id is immutable and nothing else is accessed @@ -234,7 +233,7 @@ void Peer::sendHELLO(const RuntimeEnvironment *RR,const InetAddress &localAddr,c RR->node->putPacket(localAddr,atAddress,outp.data(),outp.size(),ttl); } -bool Peer::doPingAndKeepalive(const RuntimeEnvironment *RR,uint64_t now,int inetAddressFamily) +bool 
Peer::doPingAndKeepalive(uint64_t now,int inetAddressFamily) { Path *p = (Path *)0; @@ -248,7 +247,7 @@ bool Peer::doPingAndKeepalive(const RuntimeEnvironment *RR,uint64_t now,int inet if (p) { if ((now - p->lastReceived()) >= ZT_PEER_DIRECT_PING_DELAY) { //TRACE("PING %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); - sendHELLO(RR,p->localAddress(),p->address(),now); + sendHELLO(p->localAddress(),p->address(),now); p->sent(now); } else if (((now - p->lastSend()) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { //TRACE("NAT keepalive %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); @@ -264,7 +263,7 @@ bool Peer::doPingAndKeepalive(const RuntimeEnvironment *RR,uint64_t now,int inet return false; } -void Peer::pushDirectPaths(const RuntimeEnvironment *RR,Path *path,uint64_t now,bool force) +void Peer::pushDirectPaths(Path *path,uint64_t now,bool force) { #ifdef ZT_ENABLE_CLUSTER // Cluster mode disables normal PUSH_DIRECT_PATHS in favor of cluster-based peer redirection @@ -332,7 +331,7 @@ void Peer::pushDirectPaths(const RuntimeEnvironment *RR,Path *path,uint64_t now, } } -bool Peer::resetWithinScope(const RuntimeEnvironment *RR,InetAddress::IpScope scope,uint64_t now) +bool Peer::resetWithinScope(InetAddress::IpScope scope,uint64_t now) { Mutex::Lock _l(_lock); unsigned int np = _numPaths; @@ -340,7 +339,7 @@ bool Peer::resetWithinScope(const RuntimeEnvironment *RR,InetAddress::IpScope sc unsigned int y = 0; while (x < np) { if (_paths[x].address().ipScope() == scope) { - sendHELLO(RR,_paths[x].localAddress(),_paths[x].address(),now); + sendHELLO(_paths[x].localAddress(),_paths[x].address(),now); } else { _paths[y++] = _paths[x]; } @@ -383,7 +382,7 @@ bool Peer::networkMembershipCertificatesAgree(uint64_t nwid,const CertificateOfM return 
false; } -bool Peer::validateAndSetNetworkMembershipCertificate(const RuntimeEnvironment *RR,uint64_t nwid,const CertificateOfMembership &com) +bool Peer::validateAndSetNetworkMembershipCertificate(uint64_t nwid,const CertificateOfMembership &com) { // Sanity checks if ((!com)||(com.issuedTo() != _id.address())) @@ -448,7 +447,7 @@ bool Peer::needsOurNetworkMembershipCertificate(uint64_t nwid,uint64_t now,bool return ((now - tmp) >= (ZT_NETWORK_AUTOCONF_DELAY / 2)); } -void Peer::clean(const RuntimeEnvironment *RR,uint64_t now) +void Peer::clean(uint64_t now) { Mutex::Lock _l(_lock); @@ -485,6 +484,34 @@ void Peer::clean(const RuntimeEnvironment *RR,uint64_t now) } } +bool Peer::_checkPath(Path &p,const uint64_t now) +{ + // assumes _lock is locked + + if (!p.active(now)) + return false; + + if (p.lastSend() > p.lastReceived()) { + if ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) { + TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); + + if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) { + // 1.1.1 and newer nodes support ECHO, which is smaller -- but 1.1.0 has a bug so use HELLO there too + Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO); + outp.armor(_key,true); + p.send(RR,outp.data(),outp.size(),now); + } else { + sendHELLO(p.localAddress(),p.address(),now); + p.sent(now); + } + + p.increaseProbation(); + } + } + + return true; +} + Path *Peer::_getBestPath(const uint64_t now) { // assumes _lock is locked @@ -492,7 +519,7 @@ Path *Peer::_getBestPath(const uint64_t now) uint64_t bestPathScore = 0; for(unsigned int i=0;i<_numPaths;++i) { const uint64_t score = _paths[i].score(); - if ((score >= bestPathScore)&&(_paths[i].active(now))) { + if ((score >= bestPathScore)&&(_checkPath(_paths[i],now))) { bestPathScore = score; bestPath = &(_paths[i]); } @@ -507,7 +534,7 @@ Path 
*Peer::_getBestPath(const uint64_t now,int inetAddressFamily) uint64_t bestPathScore = 0; for(unsigned int i=0;i<_numPaths;++i) { const uint64_t score = _paths[i].score(); - if (((int)_paths[i].address().ss_family == inetAddressFamily)&&(score >= bestPathScore)&&(_paths[i].active(now))) { + if (((int)_paths[i].address().ss_family == inetAddressFamily)&&(score >= bestPathScore)&&(_checkPath(_paths[i],now))) { bestPathScore = score; bestPath = &(_paths[i]); } -- cgit v1.2.3 From 4d94ae77b45e16272fe0c5c685cc20ece5057c32 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Tue, 5 Jan 2016 16:48:35 -0800 Subject: simplify if --- node/Peer.cpp | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'node/Peer.cpp') diff --git a/node/Peer.cpp b/node/Peer.cpp index 0f72be9f..a98d94c4 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -491,22 +491,20 @@ bool Peer::_checkPath(Path &p,const uint64_t now) if (!p.active(now)) return false; - if (p.lastSend() > p.lastReceived()) { - if ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) { - TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); - - if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) { - // 1.1.1 and newer nodes support ECHO, which is smaller -- but 1.1.0 has a bug so use HELLO there too - Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO); - outp.armor(_key,true); - p.send(RR,outp.data(),outp.size(),now); - } else { - sendHELLO(p.localAddress(),p.address(),now); - p.sent(now); - } - - p.increaseProbation(); + if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { + TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); + + if ( (_vProto >= 5) 
&& ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) { + // 1.1.1 and newer nodes support ECHO, which is smaller -- but 1.1.0 has a bug so use HELLO there too + Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO); + outp.armor(_key,true); + p.send(RR,outp.data(),outp.size(),now); + } else { + sendHELLO(p.localAddress(),p.address(),now); + p.sent(now); } + + p.increaseProbation(); } return true; -- cgit v1.2.3 From 05b2c0743f1733d2725266ad3249496ed09383a5 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Wed, 6 Jan 2016 10:00:03 -0800 Subject: Tighten up dead path detection. Should now auto-detect dead paths in less than 10 seconds at a very small cost in ECHO requests (or HELLOs for older peers). GitHub issue #272 --- node/Constants.hpp | 2 +- node/Path.hpp | 43 +++++++++++++++++++++++++++++++++++++------ node/Peer.cpp | 31 +++++++++++++++++++++++++++---- 3 files changed, 65 insertions(+), 11 deletions(-) (limited to 'node/Peer.cpp') diff --git a/node/Constants.hpp b/node/Constants.hpp index 1a47112a..a2ba1c1a 100644 --- a/node/Constants.hpp +++ b/node/Constants.hpp @@ -274,7 +274,7 @@ /** * No answer timeout to trigger dead path detection */ -#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 3000 +#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 2500 /** * Probation threshold after which a path becomes dead diff --git a/node/Path.hpp b/node/Path.hpp index 23e47408..7e0a8ed1 100644 --- a/node/Path.hpp +++ b/node/Path.hpp @@ -66,6 +66,8 @@ class Path public: Path() : _lastSend(0), + _lastPing(0), + _lastKeepalive(0), _lastReceived(0), _addr(), _localAddress(), @@ -76,6 +78,8 @@ public: Path(const InetAddress &localAddress,const InetAddress &addr) : _lastSend(0), + _lastPing(0), + _lastKeepalive(0), _lastReceived(0), _addr(addr), _localAddress(localAddress), @@ -98,10 +102,21 @@ public: * * @param t Time of send */ - inline void sent(uint64_t t) - { - _lastSend = t; - } + inline void sent(uint64_t t) { _lastSend = t; } + + /** + * 
Called when we've sent a ping or echo + * + * @param t Time of send + */ + inline void pinged(uint64_t t) { _lastPing = t; } + + /** + * Called when we send a NAT keepalive + * + * @param t Time of send + */ + inline void sentKeepalive(uint64_t t) { _lastKeepalive = t; } /** * Called when a packet is received from this remote path @@ -145,6 +160,16 @@ public: */ inline uint64_t lastSend() const throw() { return _lastSend; } + /** + * @return Time we last pinged or dead path checked this link + */ + inline uint64_t lastPing() const throw() { return _lastPing; } + + /** + * @return Time of last keepalive + */ + inline uint64_t lastKeepalive() const throw() { return _lastKeepalive; } + /** * @return Time of last receive from this path */ @@ -260,8 +285,10 @@ public: template inline void serialize(Buffer &b) const { - b.append((uint8_t)1); // version + b.append((uint8_t)2); // version b.append((uint64_t)_lastSend); + b.append((uint64_t)_lastPing); + b.append((uint64_t)_lastKeepalive); b.append((uint64_t)_lastReceived); _addr.serialize(b); _localAddress.serialize(b); @@ -273,9 +300,11 @@ public: inline unsigned int deserialize(const Buffer &b,unsigned int startAt = 0) { unsigned int p = startAt; - if (b[p++] != 1) + if (b[p++] != 2) throw std::invalid_argument("invalid serialized Path"); _lastSend = b.template at(p); p += 8; + _lastPing = b.template at(p); p += 8; + _lastKeepalive = b.template at(p); p += 8; _lastReceived = b.template at(p); p += 8; p += _addr.deserialize(b,p); p += _localAddress.deserialize(b,p); @@ -290,6 +319,8 @@ public: private: uint64_t _lastSend; + uint64_t _lastPing; + uint64_t _lastKeepalive; uint64_t _lastReceived; InetAddress _addr; InetAddress _localAddress; diff --git a/node/Peer.cpp b/node/Peer.cpp index a98d94c4..aff610d5 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -168,7 +168,10 @@ void Peer::received( } else { uint64_t slotLRmin = 0xffffffffffffffffULL; for(unsigned int p=0;paddress().toString().c_str(),now - p->lastSend(),now - 
p->lastReceived()); sendHELLO(p->localAddress(),p->address(),now); p->sent(now); - } else if (((now - p->lastSend()) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { + p->pinged(now); + } else if (((now - std::max(p->lastSend(),p->lastKeepalive())) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { //TRACE("NAT keepalive %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); _natKeepaliveBuf += (uint32_t)((now * 0x9e3779b1) >> 1); // tumble this around to send constantly varying (meaningless) payloads RR->node->putPacket(p->localAddress(),p->address(),&_natKeepaliveBuf,sizeof(_natKeepaliveBuf)); - p->sent(now); + p->sentKeepalive(now); } else { //TRACE("no PING or NAT keepalive: addr==%s reliable==%d %llums/%llums send/receive inactivity",p->address().toString().c_str(),(int)p->reliable(),now - p->lastSend(),now - p->lastReceived()); } @@ -339,6 +343,8 @@ bool Peer::resetWithinScope(InetAddress::IpScope scope,uint64_t now) unsigned int y = 0; while (x < np) { if (_paths[x].address().ipScope() == scope) { + // Resetting a path means sending a HELLO and then forgetting it. If we + // get OK(HELLO) then it will be re-learned. sendHELLO(_paths[x].localAddress(),_paths[x].address(),now); } else { _paths[y++] = _paths[x]; @@ -491,7 +497,22 @@ bool Peer::_checkPath(Path &p,const uint64_t now) if (!p.active(now)) return false; - if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { + /* Dead path detection: if we have sent something to this peer and have not + * yet received a reply, double check this path. The majority of outbound + * packets including Ethernet frames do generate some kind of reply either + * immediately or at some point in the near future. 
This will occasionally + * (every NO_ANSWER_TIMEOUT ms) check paths unnecessarily if traffic that + * does not generate a response is being sent such as multicast announcements + * or frames belonging to unidirectional UDP protocols, but the cost is very + * tiny and the benefit in reliability is very large. This takes care of many + * failure modes including crap NATs that forget links and spurious changes + * to physical network topology that cannot be otherwise detected. + * + * Each time we do this we increment a probation counter in the path. This + * counter is reset on any packet receive over this path. If it reaches the + * MAX_PROBATION threshold the path is considred dead. */ + + if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) && ((now - p.lastPing()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) { @@ -499,9 +520,11 @@ bool Peer::_checkPath(Path &p,const uint64_t now) Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO); outp.armor(_key,true); p.send(RR,outp.data(),outp.size(),now); + p.pinged(now); } else { sendHELLO(p.localAddress(),p.address(),now); p.sent(now); + p.pinged(now); } p.increaseProbation(); -- cgit v1.2.3 From 9aee72099e518636acb243237042049c50dcf483 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Wed, 6 Jan 2016 10:59:39 -0800 Subject: AntiRecursion cleanup and some other minor things. 
--- node/AntiRecursion.hpp | 6 ++++-- node/Path.cpp | 2 +- node/Peer.cpp | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'node/Peer.cpp') diff --git a/node/AntiRecursion.hpp b/node/AntiRecursion.hpp index 4d9df465..78ee95c2 100644 --- a/node/AntiRecursion.hpp +++ b/node/AntiRecursion.hpp @@ -105,8 +105,9 @@ public: const _ArItem *const end = i + ZT_ANTIRECURSION_HISTORY_SIZE; while (i != end) { #ifdef ZT_NO_TYPE_PUNNING - if (!memcmp(pp,i->tail,32)) + if (!memcmp(pp,i->tail,32)) { return false; + } #else const uint64_t *t = i->tail; const uint64_t *p = reinterpret_cast(pp); @@ -114,8 +115,9 @@ public: bits |= *(t++) ^ *(p++); bits |= *(t++) ^ *(p++); bits |= *t ^ *p; - if (!bits) + if (!bits) { return false; + } #endif ++i; } diff --git a/node/Path.cpp b/node/Path.cpp index e2475751..c67352de 100644 --- a/node/Path.cpp +++ b/node/Path.cpp @@ -34,9 +34,9 @@ namespace ZeroTier { bool Path::send(const RuntimeEnvironment *RR,const void *data,unsigned int len,uint64_t now) { + RR->antiRec->logOutgoingZT(data,len); if (RR->node->putPacket(_localAddress,address(),data,len)) { sent(now); - RR->antiRec->logOutgoingZT(data,len); return true; } return false; diff --git a/node/Peer.cpp b/node/Peer.cpp index aff610d5..1914da97 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -253,7 +253,7 @@ bool Peer::doPingAndKeepalive(uint64_t now,int inetAddressFamily) sendHELLO(p->localAddress(),p->address(),now); p->sent(now); p->pinged(now); - } else if (((now - std::max(p->lastSend(),p->lastKeepalive())) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { + } else if ( ((now - std::max(p->lastSend(),p->lastKeepalive())) >= ZT_NAT_KEEPALIVE_DELAY) && (!p->reliable()) ) { //TRACE("NAT keepalive %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); _natKeepaliveBuf += (uint32_t)((now * 0x9e3779b1) >> 1); // tumble this around to send constantly varying 
(meaningless) payloads RR->node->putPacket(p->localAddress(),p->address(),&_natKeepaliveBuf,sizeof(_natKeepaliveBuf)); @@ -513,7 +513,7 @@ bool Peer::_checkPath(Path &p,const uint64_t now) * MAX_PROBATION threshold the path is considred dead. */ if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) && ((now - p.lastPing()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { - TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); + TRACE("%s(%s) does not seem to be answering in a timely manner, checking if dead (probation == %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) { // 1.1.1 and newer nodes support ECHO, which is smaller -- but 1.1.0 has a bug so use HELLO there too -- cgit v1.2.3