From d8143a5e186faf722d2cae703f0a618c37e588ea Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Tue, 5 Jan 2016 16:41:54 -0800 Subject: Implement first pass on rapid dead path detection, and increment version to 1.1.3 (dev) --- node/Path.hpp | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'node/Path.hpp') diff --git a/node/Path.hpp b/node/Path.hpp index c6de6612..23e47408 100644 --- a/node/Path.hpp +++ b/node/Path.hpp @@ -98,14 +98,21 @@ public: * * @param t Time of send */ - inline void sent(uint64_t t) { _lastSend = t; } + inline void sent(uint64_t t) + { + _lastSend = t; + } /** * Called when a packet is received from this remote path * * @param t Time of receive */ - inline void received(uint64_t t) { _lastReceived = t; } + inline void received(uint64_t t) + { + _lastReceived = t; + _probation = 0; + } /** * @param now Current time @@ -114,7 +121,7 @@ public: inline bool active(uint64_t now) const throw() { - return ((now - _lastReceived) < ZT_PEER_ACTIVITY_TIMEOUT); + return (((now - _lastReceived) < ZT_PEER_ACTIVITY_TIMEOUT)&&(_probation < ZT_PEER_DEAD_PATH_DETECTION_MAX_PROBATION)); } /** @@ -240,28 +247,40 @@ public: inline bool isClusterSuboptimal() const { return ((_flags & ZT_PATH_FLAG_CLUSTER_SUBOPTIMAL) != 0); } #endif + /** + * @return Current path probation count (for dead path detect) + */ + inline unsigned int probation() const { return _probation; } + + /** + * Increase this path's probation violation count (for dead path detect) + */ + inline void increaseProbation() { ++_probation; } + template inline void serialize(Buffer &b) const { - b.append((uint8_t)0); // version + b.append((uint8_t)1); // version b.append((uint64_t)_lastSend); b.append((uint64_t)_lastReceived); _addr.serialize(b); _localAddress.serialize(b); b.append((uint16_t)_flags); + b.append((uint16_t)_probation); } template inline unsigned int deserialize(const Buffer &b,unsigned int startAt = 0) { unsigned int p = startAt; - if 
(b[p++] != 0) + if (b[p++] != 1) throw std::invalid_argument("invalid serialized Path"); _lastSend = b.template at(p); p += 8; _lastReceived = b.template at(p); p += 8; p += _addr.deserialize(b,p); p += _localAddress.deserialize(b,p); _flags = b.template at(p); p += 2; + _probation = b.template at(p); p += 2; _ipScope = _addr.ipScope(); return (p - startAt); } @@ -275,6 +294,7 @@ private: InetAddress _addr; InetAddress _localAddress; unsigned int _flags; + unsigned int _probation; InetAddress::IpScope _ipScope; // memoize this since it's a computed value checked often }; -- cgit v1.2.3 From 05b2c0743f1733d2725266ad3249496ed09383a5 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Wed, 6 Jan 2016 10:00:03 -0800 Subject: Tighten up dead path detection. Should now auto-detect dead paths in less than 10 seconds at a very small cost in ECHO requests (or HELLOs for older peers). GitHub issue #272 --- node/Constants.hpp | 2 +- node/Path.hpp | 43 +++++++++++++++++++++++++++++++++++++------ node/Peer.cpp | 31 +++++++++++++++++++++++++++---- 3 files changed, 65 insertions(+), 11 deletions(-) (limited to 'node/Path.hpp') diff --git a/node/Constants.hpp b/node/Constants.hpp index 1a47112a..a2ba1c1a 100644 --- a/node/Constants.hpp +++ b/node/Constants.hpp @@ -274,7 +274,7 @@ /** * No answer timeout to trigger dead path detection */ -#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 3000 +#define ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT 2500 /** * Probation threshold after which a path becomes dead diff --git a/node/Path.hpp b/node/Path.hpp index 23e47408..7e0a8ed1 100644 --- a/node/Path.hpp +++ b/node/Path.hpp @@ -66,6 +66,8 @@ class Path public: Path() : _lastSend(0), + _lastPing(0), + _lastKeepalive(0), _lastReceived(0), _addr(), _localAddress(), @@ -76,6 +78,8 @@ public: Path(const InetAddress &localAddress,const InetAddress &addr) : _lastSend(0), + _lastPing(0), + _lastKeepalive(0), _lastReceived(0), _addr(addr), _localAddress(localAddress), @@ -98,10 +102,21 @@
public: * * @param t Time of send */ - inline void sent(uint64_t t) - { - _lastSend = t; - } + inline void sent(uint64_t t) { _lastSend = t; } + + /** + * Called when we've sent a ping or echo + * + * @param t Time of send + */ + inline void pinged(uint64_t t) { _lastPing = t; } + + /** + * Called when we send a NAT keepalive + * + * @param t Time of send + */ + inline void sentKeepalive(uint64_t t) { _lastKeepalive = t; } /** * Called when a packet is received from this remote path @@ -145,6 +160,16 @@ public: */ inline uint64_t lastSend() const throw() { return _lastSend; } + /** + * @return Time we last pinged or dead path checked this link + */ + inline uint64_t lastPing() const throw() { return _lastPing; } + + /** + * @return Time of last keepalive + */ + inline uint64_t lastKeepalive() const throw() { return _lastKeepalive; } + /** * @return Time of last receive from this path */ @@ -260,8 +285,10 @@ public: template inline void serialize(Buffer &b) const { - b.append((uint8_t)1); // version + b.append((uint8_t)2); // version b.append((uint64_t)_lastSend); + b.append((uint64_t)_lastPing); + b.append((uint64_t)_lastKeepalive); b.append((uint64_t)_lastReceived); _addr.serialize(b); _localAddress.serialize(b); @@ -273,9 +300,11 @@ public: inline unsigned int deserialize(const Buffer &b,unsigned int startAt = 0) { unsigned int p = startAt; - if (b[p++] != 1) + if (b[p++] != 2) throw std::invalid_argument("invalid serialized Path"); _lastSend = b.template at(p); p += 8; + _lastPing = b.template at(p); p += 8; + _lastKeepalive = b.template at(p); p += 8; _lastReceived = b.template at(p); p += 8; p += _addr.deserialize(b,p); p += _localAddress.deserialize(b,p); @@ -290,6 +319,8 @@ public: private: uint64_t _lastSend; + uint64_t _lastPing; + uint64_t _lastKeepalive; uint64_t _lastReceived; InetAddress _addr; InetAddress _localAddress; diff --git a/node/Peer.cpp b/node/Peer.cpp index a98d94c4..aff610d5 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -168,7 +168,10 
@@ void Peer::received( } else { uint64_t slotLRmin = 0xffffffffffffffffULL; for(unsigned int p=0;paddress().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); sendHELLO(p->localAddress(),p->address(),now); p->sent(now); - } else if (((now - p->lastSend()) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { + p->pinged(now); + } else if (((now - std::max(p->lastSend(),p->lastKeepalive())) >= ZT_NAT_KEEPALIVE_DELAY)&&(!p->reliable())) { //TRACE("NAT keepalive %s(%s) after %llums/%llums send/receive inactivity",_id.address().toString().c_str(),p->address().toString().c_str(),now - p->lastSend(),now - p->lastReceived()); _natKeepaliveBuf += (uint32_t)((now * 0x9e3779b1) >> 1); // tumble this around to send constantly varying (meaningless) payloads RR->node->putPacket(p->localAddress(),p->address(),&_natKeepaliveBuf,sizeof(_natKeepaliveBuf)); - p->sent(now); + p->sentKeepalive(now); } else { //TRACE("no PING or NAT keepalive: addr==%s reliable==%d %llums/%llums send/receive inactivity",p->address().toString().c_str(),(int)p->reliable(),now - p->lastSend(),now - p->lastReceived()); } @@ -339,6 +343,8 @@ bool Peer::resetWithinScope(InetAddress::IpScope scope,uint64_t now) unsigned int y = 0; while (x < np) { if (_paths[x].address().ipScope() == scope) { + // Resetting a path means sending a HELLO and then forgetting it. If we + // get OK(HELLO) then it will be re-learned. sendHELLO(_paths[x].localAddress(),_paths[x].address(),now); } else { _paths[y++] = _paths[x]; @@ -491,7 +497,22 @@ bool Peer::_checkPath(Path &p,const uint64_t now) if (!p.active(now)) return false; - if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { + /* Dead path detection: if we have sent something to this peer and have not + * yet received a reply, double check this path. 
The majority of outbound + * packets including Ethernet frames do generate some kind of reply either + * immediately or at some point in the near future. This will occasionally + * (every NO_ANSWER_TIMEOUT ms) check paths unnecessarily if traffic that + * does not generate a response is being sent such as multicast announcements + * or frames belonging to unidirectional UDP protocols, but the cost is very + * tiny and the benefit in reliability is very large. This takes care of many + * failure modes including crap NATs that forget links and spurious changes + * to physical network topology that cannot be otherwise detected. + * + * Each time we do this we increment a probation counter in the path. This + * counter is reset on any packet receive over this path. If it reaches the + * MAX_PROBATION threshold the path is considered dead. */ + + if ( (p.lastSend() > p.lastReceived()) && ((p.lastSend() - p.lastReceived()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) && ((now - p.lastPing()) >= ZT_PEER_DEAD_PATH_DETECTION_NO_ANSWER_TIMEOUT) ) { TRACE("%s(%s) has not answered, checking if dead (probation: %u)",_id.address().toString().c_str(),p.address().toString().c_str(),p.probation()); if ( (_vProto >= 5) && ( !((_vMajor == 1)&&(_vMinor == 1)&&(_vRevision == 0)) ) ) { @@ -499,9 +520,11 @@ bool Peer::_checkPath(Path &p,const uint64_t now) Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ECHO); outp.armor(_key,true); p.send(RR,outp.data(),outp.size(),now); + p.pinged(now); } else { sendHELLO(p.localAddress(),p.address(),now); p.sent(now); + p.pinged(now); } p.increaseProbation(); -- cgit v1.2.3