diff options
author | Adam Ierymenko <adam.ierymenko@gmail.com> | 2016-01-06 10:00:03 -0800 |
---|---|---|
committer | Adam Ierymenko <adam.ierymenko@gmail.com> | 2016-01-06 10:00:03 -0800 |
commit | 05b2c0743f1733d2725266ad3249496ed09383a5 (patch) | |
tree | 900fee82abb16b6ea95fbf9b690e4ecf32a66bb9 /node | |
parent | 4d94ae77b45e16272fe0c5c685cc20ece5057c32 (diff) | |
download | infinitytier-05b2c0743f1733d2725266ad3249496ed09383a5.tar.gz infinitytier-05b2c0743f1733d2725266ad3249496ed09383a5.zip |
Tighten up dead path detection. Should now auto-detect dead paths in less than 10 seconds at a very small cost in ECHO requests (or HELLOs for older peers). GitHib issue #272
Diffstat (limited to 'node')
-rw-r--r-- | node/Constants.hpp | 2 | ||||
-rw-r--r-- | node/Path.hpp | 43 | ||||
-rw-r--r-- | node/Peer.cpp | 31 |
3 files changed, 65 insertions, 11 deletions
diff --git a/node/Constants.hpp b/node/Constants.hpp index 1a47112a..a2ba1c1a 100644 --- a/node/Constants.hpp +++ b/node/Constants.hpp @@ -274,7 +274,7 @@ /** * No answer timeout to trigger dead path detection */ -#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 3000 +#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 2500 /** * Probation threshold after which a path becomes dead diff --git a/node/Path.hpp b/node/Path.hpp index 23e47408..7e0a8ed1 100644 --- a/node/Path.hpp +++ b/node/Path.hpp @@ -66,6 +66,8 @@ class Path public: Path() : _lastSend(0), + _lastPing(0), + _lastKeepalive(0), _lastReceived(0), _addr(), _localAddress(), @@ -76,6 +78,8 @@ public: Path(const InetAddress &localAddress,const InetAddress &addr) : _lastSend(0), + _lastPing(0), + _lastKeepalive(0), _lastReceived(0), _addr(addr), _localAddress(localAddress), @@ -98,10 +102,21 @@ public: * * @param t Time of send */ - inline void sent(uint64_t t) - { - _lastSend = t; - } + inline void sent(uint64_t t) { _lastSend = t; } + + /** + * Called when we've sent a ping or echo + * + * @param t Time of send + */ + inline void pinged(uint64_t t) { _lastPing = t; } + + /** + * Called when we send a NAT keepalive + * + * @param t Time of send + */ + inline void sentKeepalive(uint64_t t) { _lastKeepalive = t; } /** * Called when a packet is received from this remote path @@ -146,6 +161,16 @@ public: inline uint64_t lastSend() const throw() { return _lastSend; } /** + * @return Time we last pinged or dead path checked this link + */ + inline uint64_t lastPing() const throw() { return _lastPing; } + + /** + * @return Time of last keepalive + */ + inline uint64_t lastKeepalive() const throw() { return _lastKeepalive; } + + /** * @return Time of last receive from this path */ inline uint64_t lastReceived() const throw() { return _lastReceived; } @@ -260,8 +285,10 @@ public: template<unsigned int C> inline void serialize(Buffer<C> &b) const { - b.append((uint8_t)1); // version + b.append((uint8_t)2); // version b.append((uint64_t)_lastSend); + b.append((uint64_t)_lastPing); + b.append((uint64_t)_lastKeepalive); b.append((uint64_t)_lastReceived); _addr.serialize(b); _localAddress.serialize(b); @@ -273,9 +300,11 @@ public: inline unsigned int deserialize(const Buffer<C> &b,unsigned int startAt = 0) { unsigned int p = startAt; - if (b[p++] != 1) + if (b[p++] != 2) throw std::invalid_argument("invalid serialized Path"); _lastSend = b.template at<uint64_t>(p); p += 8; + _lastPing = b.template at<uint64_t>(p); p += 8; + _lastKeepalive = b.template at<uint64_t>(p); p += 8; _lastReceived = b.template at<uint64_t>(p); p += 8; p += _addr.deserialize(b,p); p += _localAddress.deserialize(b,p); @@ -290,6 +319,8 @@ public: private: uint64_t _lastSend; + uint64_t _lastPing; + uint64_t _lastKeepalive; uint64_t _lastReceived; InetAddress _addr; InetAddress _localAddress; diff --git a/node/Peer.cpp b/node/Peer.cpp index a98d94c4..aff610d5 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -168,7 +168,10 @@ void Peer::received( } else { uint64_t slotLRmin = 0xffffffffffffffffULL; for(unsigned int p=0;p<ZT_MAX_PEER_NETWORK_PATHS;++p) { - if (_paths[p].lastReceived() <= slotLRmin) { + if (!_paths[p].active(now)) { + slot = &(_paths[p]); + break; + } else if (_paths[p].lastReceived() <= slotLRmin) { slotLRmin = _paths[p].lastReceived(); slot = &(_paths[p]); } @@ -249,11 +252,12 @@ bool Peer::doPingAndKeepalive(uint64_t now,int inetAddressFamily) //TRACE("PING %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); sendHELLO(p->localAddress(),p->address(),now); p->sent(now); - } else if (((now - p->lastSend()) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { + p->pinged(now); + } else if (((now - std::max(p->lastSend(),p->lastKeepalive())) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { //TRACE("NAT keepalive %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); _natKeepaliveBuf += (uint32_t)((now * 0x9e3779b1) >> 1); // tumble this around to send constantly varying (meaningless) payloads RR->node->putPacket(p->localAddress(),p->address(),&_natKeepaliveBuf,sizeof(_natKeepaliveBuf)); - p->sent(now); + p->sentKeepalive(now); } else { //TRACE("no PING or NAT keepalive: addr==%s reliable==%d %llums/%llums send/receive inactivity",p->address().toString().c_str(),(int)p->reliable(),now - p->lastSend(),now - p->lastReceived()); } @@ -339,6 +343,8 @@ bool Peer::resetWithinScope(InetAddress::IpScope scope,uint64_t now) unsigned int y = 0; while (x < np) { if (_paths[x].address().ipScope() == scope) { + // Resetting a path means sending a HELLO and then forgetting it. If we + // get OK(HELLO) then it will be re-learned. sendHELLO(_paths[x].localAddress(),_paths[x].address(),now); } else { _paths[y++] = _paths[x]; @@ -491,7 +497,22 @@ bool Peer::_checkPath(Path &p,const uint64_t now) if (!p.active(now)) return false; - if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { + /* Dead path detection: if we have sent something to this peer and have not + * yet received a reply, double check this path. The majority of outbound + * packets including Ethernet frames do generate some kind of reply either + * immediately or at some point in the near future. This will occasionally + * (every NO_ANSWER_TIMEOUT ms) check paths unnecessarily if traffic that + * does not generate a response is being sent such as multicast announcements + * or frames belonging to unidirectional UDP protocols, but the cost is very + * tiny and the benefit in reliability is very large. This takes care of many + * failure modes including crap NATs that forget links and spurious changes + * to physical network topology that cannot be otherwise detected. + * + * Each time we do this we increment a probation counter in the path. This + * counter is reset on any packet receive over this path. If it reaches the + * MAX_PROBATION threshold the path is considred dead. */ + + if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) && ((now - p.lastPing()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) { @@ -499,9 +520,11 @@ bool Peer::_checkPath(Path &p,const uint64_t now) Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO); outp.armor(_key,true); p.send(RR,outp.data(),outp.size(),now); + p.pinged(now); } else { sendHELLO(p.localAddress(),p.address(),now); p.sent(now); + p.pinged(now); } p.increaseProbation(); |