From 6a2ba4baca326272c45930208b70cfedf8cb1638 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 1 May 2018 16:32:15 -0700 Subject: Introduced basic multipath support --- node/Node.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'node/Node.cpp') diff --git a/node/Node.cpp b/node/Node.cpp index db511430..71e5b6a7 100644 --- a/node/Node.cpp +++ b/node/Node.cpp @@ -234,7 +234,7 @@ public: } if ((!contacted)&&(_bestCurrentUpstream)) { - const SharedPtr up(_bestCurrentUpstream->getBestPath(_now,true)); + const SharedPtr up(_bestCurrentUpstream->getAppropriatePath(_now,true)); if (up) p->sendHELLO(_tPtr,up->localSocket(),up->address(),_now); } @@ -465,7 +465,7 @@ ZT_PeerList *Node::peers() const p->role = RR->topology->role(pi->second->identity().address()); std::vector< SharedPtr > paths(pi->second->paths(_now)); - SharedPtr bestp(pi->second->getBestPath(_now,false)); + SharedPtr bestp(pi->second->getAppropriatePath(_now,false)); p->pathCount = 0; for(std::vector< SharedPtr >::iterator path(paths.begin());path!=paths.end();++path) { ZT_FAST_MEMCPY(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage)); -- cgit v1.2.3 From 17fbb020e733fab8d8be933bb4981927015a10f5 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Thu, 14 Jun 2018 16:34:45 -0700 Subject: Added multipath field to zerotier-cli status output. Adjusted how path estimates are computed and cached --- include/ZeroTierOne.h | 57 +++++++++++++++++++++++++++++++----- node/Node.cpp | 11 +++++++ node/Path.hpp | 79 ++++++++++++++++++++++++++++++-------------------- node/Peer.cpp | 4 +-- node/Peer.hpp | 14 +++++---- service/OneService.cpp | 51 +++++++++++++++++++++++++++++++- 6 files changed, 169 insertions(+), 47 deletions(-) (limited to 'node/Node.cpp') diff --git a/include/ZeroTierOne.h b/include/ZeroTierOne.h index b3927c2e..a100afd9 100644 --- a/include/ZeroTierOne.h +++ b/include/ZeroTierOne.h @@ -449,13 +449,6 @@ enum ZT_MultipathMode * Will cease sending traffic over links that appear to be stale. */ ZT_MULTIPATH_PROPORTIONALLY_BALANCED = 2, - - /** - * Traffic is allocated across a user-defined interface/allocation - * - * Will cease sending traffic over links that appear to be stale. - */ - ZT_MULTIPATH_MANUALLY_BALANCED = 3 }; /** @@ -1221,6 +1214,56 @@ typedef struct */ uint64_t trustedPathId; + /** + * One-way latency + */ + float latency; + + /** + * How much latency varies over time + */ + float packetDelayVariance; + + /** + * How much observed throughput varies over time + */ + float throughputDisturbCoeff; + + /** + * Packet Error Ratio (PER) + */ + float packetErrorRatio; + + /** + * Packet Loss Ratio (PLR) + */ + float packetLossRatio; + + /** + * Stability of the path + */ + float stability; + + /** + * Current throughput (moving average) + */ + uint64_t throughput; + + /** + * Maximum observed throughput for this path + */ + uint64_t maxThroughput; + + /** + * Percentage of traffic allocated to this path + */ + float allocation; + + /** + * Name of physical interface (for monitoring) + */ + char *ifname; + /** * Is path expired? */ diff --git a/node/Node.cpp b/node/Node.cpp index 71e5b6a7..24deeae2 100644 --- a/node/Node.cpp +++ b/node/Node.cpp @@ -474,6 +474,17 @@ ZT_PeerList *Node::peers() const p->paths[p->pathCount].trustedPathId = RR->topology->getOutboundPathTrust((*path)->address()); p->paths[p->pathCount].expired = 0; p->paths[p->pathCount].preferred = ((*path) == bestp) ? 1 : 0; + p->paths[p->pathCount].latency = (*path)->latency(); + p->paths[p->pathCount].packetDelayVariance = (*path)->packetDelayVariance(); + p->paths[p->pathCount].throughputDisturbCoeff = (*path)->throughputDisturbanceCoefficient(); + p->paths[p->pathCount].packetErrorRatio = (*path)->packetErrorRatio(); + p->paths[p->pathCount].packetLossRatio = (*path)->packetLossRatio(); + p->paths[p->pathCount].stability = (*path)->lastComputedStability(); + p->paths[p->pathCount].throughput = (*path)->meanThroughput(); + p->paths[p->pathCount].maxThroughput = (*path)->maxLifetimeThroughput(); + p->paths[p->pathCount].allocation = (*path)->allocation(); + p->paths[p->pathCount].ifname = (*path)->getName(); + ++p->pathCount; } } diff --git a/node/Path.hpp b/node/Path.hpp index 30655877..6162be20 100644 --- a/node/Path.hpp +++ b/node/Path.hpp @@ -111,15 +111,16 @@ public: _expectingAckAsOf(0), _packetsReceivedSinceLastAck(0), _packetsReceivedSinceLastQoS(0), - _meanThroughput(0.0), _maxLifetimeThroughput(0), + _lastComputedMeanThroughput(0), _bytesAckedSinceLastThroughputEstimation(0), - _meanLatency(0.0), - _packetDelayVariance(0.0), - _packetErrorRatio(0.0), - _packetLossRatio(0), + _lastComputedMeanLatency(0.0), + _lastComputedPacketDelayVariance(0.0), + _lastComputedPacketErrorRatio(0.0), + _lastComputedPacketLossRatio(0), _lastComputedStability(0.0), _lastComputedRelativeQuality(0), + _lastComputedThroughputDistCoeff(0.0), _lastAllocation(0.0) { prepareBuffers(); @@ -142,15 +143,16 @@ public: _expectingAckAsOf(0), _packetsReceivedSinceLastAck(0), _packetsReceivedSinceLastQoS(0), - _meanThroughput(0.0), _maxLifetimeThroughput(0), + _lastComputedMeanThroughput(0), _bytesAckedSinceLastThroughputEstimation(0), - _meanLatency(0.0), - _packetDelayVariance(0.0), - _packetErrorRatio(0.0), - _packetLossRatio(0), + _lastComputedMeanLatency(0.0), + _lastComputedPacketDelayVariance(0.0), + _lastComputedPacketErrorRatio(0.0), + _lastComputedPacketLossRatio(0), _lastComputedStability(0.0), _lastComputedRelativeQuality(0), + _lastComputedThroughputDistCoeff(0.0), _lastAllocation(0.0) { prepareBuffers(); @@ -162,9 +164,11 @@ public: delete _throughputSamples; delete _latencySamples; delete _packetValiditySamples; + delete _throughputDisturbanceSamples; _throughputSamples = NULL; _latencySamples = NULL; _packetValiditySamples = NULL; + _throughputDisturbanceSamples = NULL; } /** @@ -311,7 +315,7 @@ public: inline void recordOutgoingPacket(int64_t now, int64_t packetId, uint16_t payloadLength, Packet::Verb verb) { Mutex::Lock _l(_statistics_m); - if (verb == Packet::VERB_FRAME || verb == Packet::VERB_EXT_FRAME) { + if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) { if (packetId % 2 == 0) { // even -> use for ACK _unackedBytes += payloadLength; // Take note that we're expecting a VERB_ACK on this path as of a specific time @@ -336,7 +340,7 @@ public: inline void recordIncomingPacket(int64_t now, int64_t packetId, uint16_t payloadLength, Packet::Verb verb) { Mutex::Lock _l(_statistics_m); - if (verb == Packet::VERB_FRAME || verb == Packet::VERB_EXT_FRAME) { + if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) { if (packetId % 2 == 0) { // even -> use for ACK _inACKRecords[packetId] = payloadLength; _packetsReceivedSinceLastAck++; @@ -497,14 +501,14 @@ public: inline int64_t ackAge(int64_t now) { return _expectingAckAsOf ? now - _expectingAckAsOf : 0; } /** - * The maximum observed throughput for this path + * The maximum observed throughput (in bits/s) for this path */ inline uint64_t maxLifetimeThroughput() { return _maxLifetimeThroughput; } /** * @return The mean throughput (in bits/s) of this link */ - inline float meanThroughput() { return _meanThroughput; } + inline uint64_t meanThroughput() { return _lastComputedMeanThroughput; } /** * Assign a new relative quality value for this path in the aggregate link @@ -543,22 +547,22 @@ public: /** * @return Packet delay variance */ - inline float packetDelayVariance() { return _packetDelayVariance; } + inline float packetDelayVariance() { return _lastComputedPacketDelayVariance; } /** * @return Previously-computed mean latency */ - inline float meanLatency() { return _meanLatency; } + inline float meanLatency() { return _lastComputedMeanLatency; } /** * @return Packet loss rate (PLR) */ - inline float packetLossRatio() { return _packetLossRatio; } + inline float packetLossRatio() { return _lastComputedPacketLossRatio; } /** * @return Packet error ratio (PER) */ - inline float packetErrorRatio() { return _packetErrorRatio; } + inline float packetErrorRatio() { return _lastComputedPacketErrorRatio; } /** * Record an invalid incoming packet. This packet failed MAC/compression/cipher checks and will now @@ -571,38 +575,46 @@ public: */ inline char *getAddressString() { return _addrString; } + /** + * @return The current throughput disturbance coefficient + */ + inline float throughputDisturbanceCoefficient() { return _lastComputedThroughputDistCoeff; } + /** * Compute and cache stability and performance metrics. The resultant stability coefficient is a measure of how "well behaved" * this path is. This figure is substantially different from (but required for the estimation of the path's overall "quality". * * @param now Current time */ - inline void processBackgroundPathMeasurements(int64_t now, const int64_t peerId) { + inline void processBackgroundPathMeasurements(int64_t now) { if (now - _lastPathQualityComputeTime > ZT_PATH_QUALITY_COMPUTE_INTERVAL) { Mutex::Lock _l(_statistics_m); _lastPathQualityComputeTime = now; address().toString(_addrString); - _meanThroughput = _throughputSamples->mean(); - _meanLatency = _latencySamples->mean(); - _packetDelayVariance = _latencySamples->stddev(); // Similar to "jitter" (SEE: RFC 3393, RFC 4689) + _lastComputedMeanLatency = _latencySamples->mean(); + _lastComputedPacketDelayVariance = _latencySamples->stddev(); // Similar to "jitter" (SEE: RFC 3393, RFC 4689) + _lastComputedMeanThroughput = (uint64_t)_throughputSamples->mean(); // If no packet validity samples, assume PER==0 - _packetErrorRatio = 1 - (_packetValiditySamples->count() ? _packetValiditySamples->mean() : 1); + _lastComputedPacketErrorRatio = 1 - (_packetValiditySamples->count() ? _packetValiditySamples->mean() : 1); // Compute path stability // Normalize measurements with wildly different ranges into a reasonable range - float normalized_pdv = Utils::normalize(_packetDelayVariance, 0, ZT_PATH_MAX_PDV, 0, 10); - float normalized_la = Utils::normalize(_meanLatency, 0, ZT_PATH_MAX_MEAN_LATENCY, 0, 10); + float normalized_pdv = Utils::normalize(_lastComputedPacketDelayVariance, 0, ZT_PATH_MAX_PDV, 0, 10); + float normalized_la = Utils::normalize(_lastComputedMeanLatency, 0, ZT_PATH_MAX_MEAN_LATENCY, 0, 10); float throughput_cv = _throughputSamples->mean() > 0 ? _throughputSamples->stddev() / _throughputSamples->mean() : 1; // Form an exponential cutoff and apply contribution weights float pdv_contrib = exp((-1)*normalized_pdv) * ZT_PATH_CONTRIB_PDV; float latency_contrib = exp((-1)*normalized_la) * ZT_PATH_CONTRIB_LATENCY; + // Throughput Disturbance Coefficient float throughput_disturbance_contrib = exp((-1)*throughput_cv) * ZT_PATH_CONTRIB_THROUGHPUT_DISTURBANCE; + _throughputDisturbanceSamples->push(throughput_cv); + _lastComputedThroughputDistCoeff = _throughputDisturbanceSamples->mean(); // Obey user-defined ignored contributions pdv_contrib = ZT_PATH_CONTRIB_PDV > 0.0 ? pdv_contrib : 1; latency_contrib = ZT_PATH_CONTRIB_LATENCY > 0.0 ? latency_contrib : 1; throughput_disturbance_contrib = ZT_PATH_CONTRIB_THROUGHPUT_DISTURBANCE > 0.0 ? throughput_disturbance_contrib : 1; - // Compute the quality product + // Stability _lastComputedStability = pdv_contrib + latency_contrib + throughput_disturbance_contrib; - _lastComputedStability *= 1 - _packetErrorRatio; + _lastComputedStability *= 1 - _lastComputedPacketErrorRatio; // Prevent QoS records from sticking around for too long std::map::iterator it = _outQoSRecords.begin(); while (it != _outQoSRecords.end()) { @@ -646,6 +658,7 @@ public: _throughputSamples = new RingBuffer(ZT_PATH_QUALITY_METRIC_WIN_SZ); _latencySamples = new RingBuffer(ZT_PATH_QUALITY_METRIC_WIN_SZ); _packetValiditySamples = new RingBuffer(ZT_PATH_QUALITY_METRIC_WIN_SZ); + _throughputDisturbanceSamples = new RingBuffer(ZT_PATH_QUALITY_METRIC_WIN_SZ); memset(_ifname, 0, 16); memset(_addrString, 0, sizeof(_addrString)); } @@ -677,19 +690,20 @@ private: int16_t _packetsReceivedSinceLastAck; int16_t _packetsReceivedSinceLastQoS; - float _meanThroughput; uint64_t _maxLifetimeThroughput; + uint64_t _lastComputedMeanThroughput; uint64_t _bytesAckedSinceLastThroughputEstimation; - volatile float _meanLatency; - float _packetDelayVariance; + float _lastComputedMeanLatency; + float _lastComputedPacketDelayVariance; - float _packetErrorRatio; - float _packetLossRatio; + float _lastComputedPacketErrorRatio; + float _lastComputedPacketLossRatio; // cached estimates float _lastComputedStability; float _lastComputedRelativeQuality; + float _lastComputedThroughputDistCoeff; float _lastAllocation; // cached human-readable strings for tracing purposes @@ -699,6 +713,7 @@ private: RingBuffer *_throughputSamples; RingBuffer *_latencySamples; RingBuffer *_packetValiditySamples; + RingBuffer *_throughputDisturbanceSamples; }; } // namespace ZeroTier diff --git a/node/Peer.cpp b/node/Peer.cpp index 55132bba..0f471b07 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -116,7 +116,7 @@ void Peer::received( } for(unsigned int i=0;iprocessBackgroundPathMeasurements(now, _id.address().toInt()); + _paths[i].p->processBackgroundPathMeasurements(now); } } } @@ -415,7 +415,7 @@ SharedPtr Peer::getAppropriatePath(int64_t now, bool includeExpired) for(unsigned int i=0;iprocessBackgroundPathMeasurements(now, _id.address().toInt()); + _paths[i].p->processBackgroundPathMeasurements(now); } } diff --git a/node/Peer.hpp b/node/Peer.hpp index 9361f665..ddbe6f77 100644 --- a/node/Peer.hpp +++ b/node/Peer.hpp @@ -353,14 +353,18 @@ public: inline int64_t isActive(int64_t now) const { return ((now - _lastNontrivialReceive) < ZT_PEER_ACTIVITY_TIMEOUT); } /** - * @return Latency in milliseconds of best path or 0xffff if unknown / no paths + * @return Latency in milliseconds of best/aggregate path or 0xffff if unknown / no paths */ inline unsigned int latency(const int64_t now) { - SharedPtr bp(getAppropriatePath(now,false)); - if (bp) - return bp->latency(); - return 0xffff; + if (RR->node->getMultipathMode()) { + return (int)computeAggregateLinkMeanLatency(); + } else { + SharedPtr bp(getAppropriatePath(now,false)); + if (bp) + return bp->latency(); + return 0xffff; + } } /** diff --git a/service/OneService.cpp b/service/OneService.cpp index e56a4827..a1a9d981 100644 --- a/service/OneService.cpp +++ b/service/OneService.cpp @@ -298,6 +298,39 @@ static void _peerToJson(nlohmann::json &pj,const ZT_Peer *peer) pj["paths"] = pa; } +static void _peerAggregateLinkToJson(nlohmann::json &pj,const ZT_Peer *peer) +{ + char tmp[256]; + OSUtils::ztsnprintf(tmp,sizeof(tmp),"%.10llx",peer->address); + pj["aggregateLinkLatency"] = peer->latency; + + nlohmann::json pa = nlohmann::json::array(); + for(unsigned int i=0;ipathCount;++i) { + //int64_t lastSend = peer->paths[i].lastSend; + //int64_t lastReceive = peer->paths[i].lastReceive; + nlohmann::json j; + j["address"] = reinterpret_cast(&(peer->paths[i].address))->toString(tmp); + //j["lastSend"] = (lastSend < 0) ? 0 : lastSend; + //j["lastReceive"] = (lastReceive < 0) ? 0 : lastReceive; + //j["trustedPathId"] = peer->paths[i].trustedPathId; + //j["active"] = (bool)(peer->paths[i].expired == 0); + //j["expired"] = (bool)(peer->paths[i].expired != 0); + //j["preferred"] = (bool)(peer->paths[i].preferred != 0); + j["latency"] = peer->paths[i].latency; + //j["packetDelayVariance"] = peer->paths[i].packetDelayVariance; + //j["throughputDisturbCoeff"] = peer->paths[i].throughputDisturbCoeff; + //j["packetErrorRatio"] = peer->paths[i].packetErrorRatio; + //j["packetLossRatio"] = peer->paths[i].packetLossRatio; + j["stability"] = peer->paths[i].stability; + j["throughput"] = peer->paths[i].throughput; + //j["maxThroughput"] = peer->paths[i].maxThroughput; + j["allocation"] = peer->paths[i].allocation; + j["ifname"] = peer->paths[i].ifname; + pa.push_back(j); + } + pj["paths"] = pa; +} + static void _moonToJson(nlohmann::json &mj,const World &world) { char tmp[4096]; @@ -1189,7 +1222,23 @@ public: json &settings = res["config"]["settings"]; settings["primaryPort"] = OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff; settings["allowTcpFallbackRelay"] = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],_allowTcpFallbackRelay); - settings["multipathMode"] = OSUtils::jsonInt(settings["multipathMode"],_multipathMode); + + if (_multipathMode) { + json &multipathConfig = res["multipath"]; + ZT_PeerList *pl = _node->peers(); + char peerAddrStr[256]; + if (pl) { + for(unsigned long i=0;ipeerCount;++i) { + if (pl->peers[i].role == ZT_PEER_ROLE_LEAF) { + nlohmann::json pj; + _peerAggregateLinkToJson(pj,&(pl->peers[i])); + OSUtils::ztsnprintf(peerAddrStr,sizeof(peerAddrStr),"%.10llx",pl->peers[i].address); + multipathConfig[peerAddrStr] = (pj); + } + } + } + } + #ifdef ZT_USE_MINIUPNPC settings["portMappingEnabled"] = OSUtils::jsonBool(settings["portMappingEnabled"],true); #else -- cgit v1.2.3 From bdcdccfcc3157e62a4bc8078e583876cbef41223 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Fri, 22 Jun 2018 16:30:20 -0700 Subject: Improved path selection, more efficient traffic allocation, lower QoS/ACK overhead --- include/ZeroTierOne.h | 5 ++ node/Constants.hpp | 15 ++++- node/Node.cpp | 6 +- node/Path.hpp | 20 +++---- node/Peer.cpp | 150 ++++++++++++++++++++++++++----------------------- node/Peer.hpp | 36 ++++++++++-- service/OneService.cpp | 2 +- 7 files changed, 143 insertions(+), 91 deletions(-) (limited to 'node/Node.cpp') diff --git a/include/ZeroTierOne.h b/include/ZeroTierOne.h index a100afd9..5b228e17 100644 --- a/include/ZeroTierOne.h +++ b/include/ZeroTierOne.h @@ -1315,6 +1315,11 @@ typedef struct */ unsigned int pathCount; + /** + * Whether this peer was ever reachable via an aggregate link + */ + bool hadAggregateLink; + /** * Known network paths to peer */ diff --git a/node/Constants.hpp b/node/Constants.hpp index 0d3692f1..420343ad 100644 --- a/node/Constants.hpp +++ b/node/Constants.hpp @@ -274,6 +274,19 @@ */ #define ZT_MULTIPATH_BINDER_REFRESH_PERIOD 5000 +/** + * Packets are only used for QoS/ACK statistical sampling if their packet ID is divisible by + * this integer. This is to provide a mechanism for both peers to agree on which packets need + * special treatment without having to exchange information. Changing this value would be + * a breaking change and would necessitate a protocol version upgrade. Since each incoming and + * outgoing packet ID is checked against this value its evaluation is of the form: + * (id & (divisor - 1)) == 0, thus the divisor must be a power of 2. + * + * This value is set at (16) so that given a normally-distributed RNG output we will sample + * 1/16th (or ~6.25%) of packets. + */ +#define ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR 0x10 + /** * Time horizon for VERB_QOS_MEASUREMENT and VERB_ACK packet processing cutoff */ @@ -384,7 +397,7 @@ /** * Minimum amount of time between each ACK packet */ -#define ZT_PATH_ACK_INTERVAL 250 +#define ZT_PATH_ACK_INTERVAL 1000 /** * How often an aggregate link statistics report is emitted into this tracing system diff --git a/node/Node.cpp b/node/Node.cpp index 24deeae2..9b10dfdd 100644 --- a/node/Node.cpp +++ b/node/Node.cpp @@ -450,6 +450,7 @@ ZT_PeerList *Node::peers() const for(std::vector< std::pair< Address,SharedPtr > >::iterator pi(peers.begin());pi!=peers.end();++pi) { ZT_Peer *p = &(pl->peers[pl->peerCount++]); p->address = pi->second->address().toInt(); + p->hadAggregateLink = 0; if (pi->second->remoteVersionKnown()) { p->versionMajor = pi->second->remoteVersionMajor(); p->versionMinor = pi->second->remoteVersionMinor(); @@ -466,6 +467,7 @@ ZT_PeerList *Node::peers() const std::vector< SharedPtr > paths(pi->second->paths(_now)); SharedPtr bestp(pi->second->getAppropriatePath(_now,false)); + p->hadAggregateLink |= pi->second->hasAggregateLink(); p->pathCount = 0; for(std::vector< SharedPtr >::iterator path(paths.begin());path!=paths.end();++path) { ZT_FAST_MEMCPY(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage)); @@ -475,14 +477,14 @@ ZT_PeerList *Node::peers() const p->paths[p->pathCount].expired = 0; p->paths[p->pathCount].preferred = ((*path) == bestp) ? 1 : 0; p->paths[p->pathCount].latency = (*path)->latency(); - p->paths[p->pathCount].packetDelayVariance = (*path)->packetDelayVariance(); + p->paths[p->pathCount].packetDelayVariance = (*path)->packetDelayVariance(); p->paths[p->pathCount].throughputDisturbCoeff = (*path)->throughputDisturbanceCoefficient(); p->paths[p->pathCount].packetErrorRatio = (*path)->packetErrorRatio(); p->paths[p->pathCount].packetLossRatio = (*path)->packetLossRatio(); p->paths[p->pathCount].stability = (*path)->lastComputedStability(); p->paths[p->pathCount].throughput = (*path)->meanThroughput(); p->paths[p->pathCount].maxThroughput = (*path)->maxLifetimeThroughput(); - p->paths[p->pathCount].allocation = (*path)->allocation(); + p->paths[p->pathCount].allocation = (float)(*path)->allocation() / (float)255; p->paths[p->pathCount].ifname = (*path)->getName(); ++p->pathCount; diff --git a/node/Path.hpp b/node/Path.hpp index fb202306..cafff8cf 100644 --- a/node/Path.hpp +++ b/node/Path.hpp @@ -121,7 +121,7 @@ public: _lastComputedStability(0.0), _lastComputedRelativeQuality(0), _lastComputedThroughputDistCoeff(0.0), - _lastAllocation(0.0) + _lastAllocation(0) { prepareBuffers(); } @@ -153,7 +153,7 @@ public: _lastComputedStability(0.0), _lastComputedRelativeQuality(0), _lastComputedThroughputDistCoeff(0.0), - _lastAllocation(0.0) + _lastAllocation(0) { prepareBuffers(); _phy->getIfName((PhySocket *)((uintptr_t)_localSocket), _ifname, 16); @@ -316,12 +316,10 @@ public: { Mutex::Lock _l(_statistics_m); if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) { - if (packetId % 2 == 0) { // even -> use for ACK + if ((packetId & (ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR - 1)) == 0) { _unackedBytes += payloadLength; // Take note that we're expecting a VERB_ACK on this path as of a specific time _expectingAckAsOf = ackAge(now) > ZT_PATH_ACK_INTERVAL ? _expectingAckAsOf : now; - } - else { // odd -> use for QoS if (_outQoSRecords.size() < ZT_PATH_MAX_OUTSTANDING_QOS_RECORDS) { _outQoSRecords[packetId] = now; } @@ -341,11 +339,9 @@ public: { Mutex::Lock _l(_statistics_m); if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) { - if (packetId % 2 == 0) { // even -> use for ACK + if ((packetId & (ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR - 1)) == 0) { _inACKRecords[packetId] = payloadLength; _packetsReceivedSinceLastAck++; - } - else { // odd -> use for QoS _inQoSRecords[packetId] = now; _packetsReceivedSinceLastQoS++; } @@ -527,12 +523,12 @@ public: * * @param allocation Percentage of traffic to be sent over this path to a peer */ - inline void updateComponentAllocationOfAggregateLink(float allocation) { _lastAllocation = allocation; } + inline void updateComponentAllocationOfAggregateLink(unsigned char allocation) { _lastAllocation = allocation; } /** * @return Percentage of traffic allocated to this path in the aggregate link */ - inline float allocation() { return _lastAllocation; } + inline unsigned char allocation() { return _lastAllocation; } /** * @return Stability estimates can become expensive to compute, we cache the most recent result. @@ -704,7 +700,9 @@ private: float _lastComputedStability; float _lastComputedRelativeQuality; float _lastComputedThroughputDistCoeff; - float _lastAllocation; + unsigned char _lastAllocation; + + // cached human-readable strings for tracing purposes char _ifname[16]; diff --git a/node/Peer.cpp b/node/Peer.cpp index 1d581ab8..21bbfabe 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -56,6 +56,12 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident _lastSentFullHello(0), _lastACKWindowReset(0), _lastQoSWindowReset(0), + _lastMultipathCompatibilityCheck(0), + _freeRandomByte(0), + _uniqueAlivePathCount(0), + _localMultipathSupported(false), + _remoteMultipathSupported(false), + _canUseMultipath(false), _vProto(0), _vMajor(0), _vMinor(0), @@ -69,6 +75,7 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident _lastAggregateStatsReport(0), _lastAggregateAllocation(0) { + Utils::getSecureRandom(&_freeRandomByte, 1); if (!myIdentity.agree(peerIdentity,_key,ZT_PEER_SECRET_KEY_LENGTH)) throw ZT_EXCEPTION_INVALID_ARGUMENT; _pathChoiceHist = new RingBuffer(ZT_MULTIPATH_PROPORTION_WIN_SZ); @@ -110,7 +117,7 @@ void Peer::received( recordIncomingPacket(tPtr, path, packetId, payloadLength, verb, now); - if (canUseMultipath()) { + if (_canUseMultipath) { if (path->needsToSendQoS(now)) { sendQOS_MEASUREMENT(tPtr, path, path->localSocket(), path->address(), now); } @@ -145,17 +152,23 @@ void Peer::received( // Paths are redundant if they duplicate an alive path to the same IP or // with the same local socket and address family. bool redundant = false; + unsigned int replacePath = ZT_MAX_PEER_NETWORK_PATHS; for(unsigned int i=0;ialive(now)) && ( ((_paths[i].p->localSocket() == path->localSocket())&&(_paths[i].p->address().ss_family == path->address().ss_family)) || (_paths[i].p->address().ipsEqual2(path->address())) ) ) { + if ( (_paths[i].p->alive(now)) && ( ((_paths[i].p->localSocket() == path->localSocket())&&(_paths[i].p->address().ss_family == path->address().ss_family)) || (_paths[i].p->address().ipsEqual2(path->address())) ) ) { redundant = true; break; } + // If the path is the same address and port, simply assume this is a replacement + if ( (_paths[i].p->address().ipsEqual2(path->address()) && (_paths[i].p->address().port() == path->address().port()))) { + replacePath = i; + break; + } } else break; } - - if (!redundant) { - unsigned int replacePath = ZT_MAX_PEER_NETWORK_PATHS; + // If the path isn't a duplicate of the same localSocket AND we haven't already determined a replacePath, + // then find the worst path and replace it. + if (!redundant && replacePath == ZT_MAX_PEER_NETWORK_PATHS) { int replacePathQuality = 0; for(unsigned int i=0;iaddress().ss_family == path->address().ss_family && _paths[i].p->address().ipsEqual2(path->address())) { - replacePath = i; - break; - } - } - } - } - - if (replacePath != ZT_MAX_PEER_NETWORK_PATHS) { - if (verb == Packet::VERB_OK) { - RR->t->peerLearnedNewPath(tPtr,networkId,*this,path,packetId); - _paths[replacePath].lr = now; - _paths[replacePath].p = path; - _paths[replacePath].priority = 1; - } else { - attemptToContact = true; - } + } + if (replacePath != ZT_MAX_PEER_NETWORK_PATHS) { + if (verb == Packet::VERB_OK) { + RR->t->peerLearnedNewPath(tPtr,networkId,*this,path,packetId); + _paths[replacePath].lr = now; + _paths[replacePath].p = path; + _paths[replacePath].priority = 1; + } else { + attemptToContact = true; } } } @@ -274,7 +273,9 @@ void Peer::received( void Peer::recordOutgoingPacket(const SharedPtr &path, const uint64_t packetId, uint16_t payloadLength, const Packet::Verb verb, int64_t now) { - if (localMultipathSupport()) { + // Grab second byte from packetId to use as a source of entropy in the next path selection + _freeRandomByte = (packetId & 0xFF00) >> 8; + if (_canUseMultipath) { path->recordOutgoingPacket(now, packetId, payloadLength, verb); } } @@ -282,7 +283,7 @@ void Peer::recordOutgoingPacket(const SharedPtr &path, const uint64_t pack void Peer::recordIncomingPacket(void *tPtr, const SharedPtr &path, const uint64_t packetId, uint16_t payloadLength, const Packet::Verb verb, int64_t now) { - if (localMultipathSupport()) { + if (_canUseMultipath) { if (path->needsToSendAck(now)) { sendACK(tPtr, path, path->localSocket(), path->address(), now); } @@ -323,6 +324,9 @@ void Peer::computeAggregateProportionalAllocation(int64_t now) + (fmax(1, relThroughput[i]) * ZT_PATH_CONTRIB_THROUGHPUT) + relScope * ZT_PATH_CONTRIB_SCOPE; relQuality *= age_contrib; + // Arbitrary cutoffs + relQuality = relQuality > (1.00 / 100.0) ? relQuality : 0.0; + relQuality = relQuality < (99.0 / 100.0) ? relQuality : 1.0; totalRelativeQuality += relQuality; _paths[i].p->updateRelativeQuality(relQuality); } @@ -330,12 +334,12 @@ void Peer::computeAggregateProportionalAllocation(int64_t now) // Convert set of relative performances into an allocation set for(uint16_t i=0;iupdateComponentAllocationOfAggregateLink(_paths[i].p->relativeQuality() / totalRelativeQuality); + _paths[i].p->updateComponentAllocationOfAggregateLink((_paths[i].p->relativeQuality() / totalRelativeQuality) * 255); } } } -float Peer::computeAggregateLinkPacketDelayVariance() +int Peer::computeAggregateLinkPacketDelayVariance() { float pdv = 0.0; for(unsigned int i=0;i Peer::getAppropriatePath(int64_t now, bool includeExpired) * Send traffic across the highest quality path only. This algorithm will still * use the old path quality metric from protocol version 9. */ - if (!canUseMultipath()) { + if (!_canUseMultipath) { long bestPathQuality = 2147483647; for(unsigned int i=0;i Peer::getAppropriatePath(int64_t now, bool includeExpired) } } } - unsigned int r; - Utils::getSecureRandom(&r, 1); + unsigned int r = _freeRandomByte; if (numAlivePaths > 0) { - // pick a random out of the set deemed "alive" int rf = r % numAlivePaths; return _paths[alivePaths[rf]].p; } else if(numStalePaths > 0) { - // resort to trying any non-expired path + // Resort to trying any non-expired path int rf = r % numStalePaths; return _paths[stalePaths[rf]].p; } @@ -461,40 +463,12 @@ SharedPtr Peer::getAppropriatePath(int64_t now, bool includeExpired) * Proportionally allocate traffic according to dynamic path quality measurements */ if (RR->node->getMultipathMode() == ZT_MULTIPATH_PROPORTIONALLY_BALANCED) { - int numAlivePaths = 0; - int numStalePaths = 0; - int alivePaths[ZT_MAX_PEER_NETWORK_PATHS]; - int stalePaths[ZT_MAX_PEER_NETWORK_PATHS]; - memset(&alivePaths, -1, sizeof(alivePaths)); - memset(&stalePaths, -1, sizeof(stalePaths)); - // Attempt to find an excuse not to use the rest of this algorithm - // Alive or Stale? - for(unsigned int i=0;ialive(now)) { - alivePaths[numAlivePaths] = i; - numAlivePaths++; - } else { - stalePaths[numStalePaths] = i; - numStalePaths++; - } - // Record a default path to use as a short-circuit for the rest of the algorithm (if needed) - bestPath = i; - } - } if ((now - _lastAggregateAllocation) >= ZT_PATH_QUALITY_COMPUTE_INTERVAL) { _lastAggregateAllocation = now; computeAggregateProportionalAllocation(now); } - if (numAlivePaths == 0 && numStalePaths == 0) { - return SharedPtr(); - } if (numAlivePaths == 1 || numStalePaths == 1) { - return _paths[bestPath].p; - } // Randomly choose path according to their allocations - unsigned int r; - Utils::getSecureRandom(&r, 1); - float rf = (float)(r %= 100) / 100; + float rf = _freeRandomByte; for(int i=0;iallocation()) { @@ -676,6 +650,41 @@ void Peer::introduce(void *const tPtr,const int64_t now,const SharedPtr &o } } +inline void Peer::processBackgroundPeerTasks(int64_t now) +{ + // Determine current multipath compatibility with other peer + if ((now - _lastMultipathCompatibilityCheck) >= ZT_PATH_QUALITY_COMPUTE_INTERVAL) { + // Cache number of available paths so that we can short-circuit multipath logic elsewhere + // + // We also take notice of duplicate paths (same IP only) because we may have + // recently received a direct path push from a peer and our list might contain + // a dead path which hasn't been fully recognized as such. In this case we + // don't want the duplicate to trigger execution of multipath code prematurely. + // + // This is done to support the behavior of auto multipath enable/disable + // without user intervention. + int currAlivePathCount = 0; + int duplicatePathsFound = 0; + for (unsigned int i=0;iaddress().ipsEqual2(_paths[j].p->address()) && i != j) { + duplicatePathsFound+=1; + break; + } + } + } + } + _uniqueAlivePathCount = (currAlivePathCount - (duplicatePathsFound / 2)); + _lastMultipathCompatibilityCheck = now; + _localMultipathSupported = ((RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) && (ZT_PROTO_VERSION > 9)); + _remoteMultipathSupported = _vProto > 9; + // If both peers support multipath and more than one path exist, we can use multipath logic + _canUseMultipath = _localMultipathSupported && _remoteMultipathSupported && (_uniqueAlivePathCount > 1); + } +} + void Peer::sendACK(void *tPtr,const SharedPtr &path,const int64_t localSocket,const InetAddress &atAddress,int64_t now) { Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ACK); @@ -774,14 +783,15 @@ void Peer::tryMemorizedPath(void *tPtr,int64_t now) unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now) { unsigned int sent = 0; - Mutex::Lock _l(_paths_m); const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD); _lastSentFullHello = now; + processBackgroundPeerTasks(now); + // Emit traces regarding aggregate link status - if (canUseMultipath()) { + if (_canUseMultipath) { int alivePathCount = aggregateLinkPhysicalPathCount(); if ((now - _lastAggregateStatsReport) > ZT_PATH_AGGREGATE_STATS_REPORT_INTERVAL) { _lastAggregateStatsReport = now; diff --git a/node/Peer.hpp b/node/Peer.hpp index ddbe6f77..a32eaad0 100644 --- a/node/Peer.hpp +++ b/node/Peer.hpp @@ -203,12 +203,12 @@ public: /** * @return The aggregate link Packet Delay Variance (PDV) */ - float computeAggregateLinkPacketDelayVariance(); + int computeAggregateLinkPacketDelayVariance(); /** * @return The aggregate link mean latency */ - float computeAggregateLinkMeanLatency(); + int computeAggregateLinkMeanLatency(); /** * @return The number of currently alive "physical" paths in the aggregate link @@ -357,7 +357,7 @@ public: */ inline unsigned int latency(const int64_t now) { - if (RR->node->getMultipathMode()) { + if (_canUseMultipath) { return (int)computeAggregateLinkMeanLatency(); } else { SharedPtr bp(getAppropriatePath(now,false)); @@ -417,6 +417,14 @@ public: inline bool remoteVersionKnown() const { return ((_vMajor > 0)||(_vMinor > 0)||(_vRevision > 0)); } + /** + * Periodically update known multipath activation constraints. This is done so that we know when and when + * not to use multipath logic. Doing this once every few seconds is sufficient. + * + * @param now Current time + */ + inline void processBackgroundPeerTasks(int64_t now); + /** * Record that the remote peer does have multipath enabled. As is evident by the receipt of a VERB_ACK * or a VERB_QOS_MEASUREMENT packet at some point in the past. Until this flag is set, the local client @@ -427,18 +435,18 @@ public: /** * @return Whether the local client supports and is configured to use multipath */ - inline bool localMultipathSupport() { return ((RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) && (ZT_PROTO_VERSION > 9)); } + inline bool localMultipathSupport() { return _localMultipathSupported; } /** * @return Whether the remote peer supports and is configured to use multipath */ - inline bool remoteMultipathSupport() { return (_remotePeerMultipathEnabled && (_vProto > 9)); } + inline bool remoteMultipathSupport() { return _remoteMultipathSupported; } /** * @return Whether this client can use multipath to communicate with this peer. True if both peers are using * the correct protocol and if both peers have multipath enabled. False if otherwise. */ - inline bool canUseMultipath() { return (localMultipathSupport() && remoteMultipathSupport()); } + inline bool canUseMultipath() { return _canUseMultipath; } /** * @return True if peer has received a trust established packet (e.g. common network membership) in the past ZT_TRUST_EXPIRATION ms @@ -557,6 +565,13 @@ public: return (_QoSCutoffCount < ZT_PATH_QOS_ACK_CUTOFF_LIMIT); } + /** + * @return Whether this peer is reachable via an aggregate link + */ + inline bool hasAggregateLink() { + return _localMultipathSupported && _remoteMultipathSupported && _remotePeerMultipathEnabled; + } + /** * Serialize a peer for storage in local cache * @@ -658,6 +673,15 @@ private: int64_t _lastPathPrune; int64_t _lastACKWindowReset; int64_t _lastQoSWindowReset; + int64_t _lastMultipathCompatibilityCheck; + + unsigned char _freeRandomByte; + + int _uniqueAlivePathCount; + + bool _localMultipathSupported; + bool _remoteMultipathSupported; + bool _canUseMultipath; uint16_t _vProto; uint16_t _vMajor; diff --git a/service/OneService.cpp b/service/OneService.cpp index a1a9d981..9b12f17b 100644 --- a/service/OneService.cpp +++ b/service/OneService.cpp @@ -1229,7 +1229,7 @@ public: char peerAddrStr[256]; if (pl) { for(unsigned long i=0;ipeerCount;++i) { - if (pl->peers[i].role == ZT_PEER_ROLE_LEAF) { + if (pl->peers[i].hadAggregateLink) { nlohmann::json pj; _peerAggregateLinkToJson(pj,&(pl->peers[i])); OSUtils::ztsnprintf(peerAddrStr,sizeof(peerAddrStr),"%.10llx",pl->peers[i].address); -- cgit v1.2.3 From 28cb40529d04e61a188e9d4a2a316690703ea605 Mon Sep 17 00:00:00 2001 From: Joseph Henry Date: Tue, 10 Jul 2018 16:50:12 -0700 Subject: Rough draft of fq-codel implementation --- node/Constants.hpp | 35 ++++++ node/Network.cpp | 16 ++- node/Network.hpp | 10 +- node/Node.cpp | 1 + node/OutboundMulticast.cpp | 3 +- node/Switch.cpp | 275 +++++++++++++++++++++++++++++++++++++++++++-- node/Switch.hpp | 94 ++++++++++++++++ 7 files changed, 419 insertions(+), 15 deletions(-) (limited to 'node/Node.cpp') diff --git a/node/Constants.hpp b/node/Constants.hpp index 420343ad..5f21201e 100644 --- a/node/Constants.hpp +++ b/node/Constants.hpp @@ -410,6 +410,41 @@ */ #define ZT_PATH_IMBALANCE_THRESHOLD 0.20 +/** + * Max allowable time spent in any queue + */ +#define ZT_QOS_TARGET 5 // ms + +/** + * Time period where the time spent in the queue by a packet should fall below + * target at least once + */ +#define ZT_QOS_INTERVAL 100 // ms + +/** + * The number of bytes that each queue is allowed to send during each DRR cycle. + * This approximates a single-byte-based fairness queuing scheme + */ +#define ZT_QOS_QUANTUM ZT_DEFAULT_MTU + +/** + * The maximum total number of packets that can be queued among all + * active/inactive, old/new queues + */ +#define ZT_QOS_MAX_ENQUEUED_PACKETS 1024 + +/** + * Number of QoS queues (buckets) + */ +#define ZT_QOS_NUM_BUCKETS 9 + +/** + * All unspecified traffic is put in this bucket. Anything in a bucket with a smaller + * value is de-prioritized. Anything in a bucket with a higher value is prioritized over + * other traffic. + */ +#define ZT_QOS_DEFAULT_BUCKET 0 + /** * How frequently to send heartbeats over in-use paths */ diff --git a/node/Network.cpp b/node/Network.cpp index a75d9fd1..a5c2fc3e 100644 --- a/node/Network.cpp +++ b/node/Network.cpp @@ -106,7 +106,8 @@ static _doZtFilterResult _doZtFilter( const unsigned int ruleCount, Address &cc, // MUTABLE -- set to TEE destination if TEE action is taken or left alone otherwise unsigned int &ccLength, // MUTABLE -- set to length of packet payload to TEE - bool &ccWatch) // MUTABLE -- set to true for WATCH target as opposed to normal TEE + bool &ccWatch, // MUTABLE -- set to true for WATCH target as opposed to normal TEE + uint8_t &qosBucket) // MUTABLE -- set to the value of the argument provided to the matching action { // Set to true if we are a TEE/REDIRECT/WATCH target bool superAccept = false; @@ -621,7 +622,8 @@ bool Network::filterOutgoingPacket( const uint8_t *frameData, const unsigned int frameLen, const unsigned int etherType, - const unsigned int vlanId) + const unsigned int vlanId, + uint8_t &qosBucket) { const int64_t now = RR->node->now(); Address ztFinalDest(ztDest); @@ -636,7 +638,7 @@ bool Network::filterOutgoingPacket( Membership *const membership = (ztDest) ? _memberships.get(ztDest) : (Membership *)0; - switch(_doZtFilter(RR,rrl,_config,membership,false,ztSource,ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.rules,_config.ruleCount,cc,ccLength,ccWatch)) { + switch(_doZtFilter(RR,rrl,_config,membership,false,ztSource,ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.rules,_config.ruleCount,cc,ccLength,ccWatch,qosBucket)) { case DOZTFILTER_NO_MATCH: { for(unsigned int c=0;c<_config.capabilityCount;++c) { @@ -644,7 +646,7 @@ bool Network::filterOutgoingPacket( Address cc2; unsigned int ccLength2 = 0; bool ccWatch2 = false; - switch (_doZtFilter(RR,crrl,_config,membership,false,ztSource,ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.capabilities[c].rules(),_config.capabilities[c].ruleCount(),cc2,ccLength2,ccWatch2)) { + switch (_doZtFilter(RR,crrl,_config,membership,false,ztSource,ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.capabilities[c].rules(),_config.capabilities[c].ruleCount(),cc2,ccLength2,ccWatch2,qosBucket)) { case DOZTFILTER_NO_MATCH: case DOZTFILTER_DROP: // explicit DROP in a capability just terminates its evaluation and is an anti-pattern break; @@ -759,11 +761,13 @@ int Network::filterIncomingPacket( bool ccWatch = false; const Capability *c = (Capability *)0; + uint8_t qosBucket = 255; // For incoming packets this is a dummy value + Mutex::Lock _l(_lock); Membership &membership = _membership(sourcePeer->address()); - switch (_doZtFilter(RR,rrl,_config,&membership,true,sourcePeer->address(),ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.rules,_config.ruleCount,cc,ccLength,ccWatch)) { + switch (_doZtFilter(RR,rrl,_config,&membership,true,sourcePeer->address(),ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.rules,_config.ruleCount,cc,ccLength,ccWatch,qosBucket)) { case DOZTFILTER_NO_MATCH: { Membership::CapabilityIterator mci(membership,_config); @@ -772,7 +776,7 @@ int Network::filterIncomingPacket( Address cc2; unsigned int ccLength2 = 0; bool ccWatch2 = false; - switch(_doZtFilter(RR,crrl,_config,&membership,true,sourcePeer->address(),ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,c->rules(),c->ruleCount(),cc2,ccLength2,ccWatch2)) { + switch(_doZtFilter(RR,crrl,_config,&membership,true,sourcePeer->address(),ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,c->rules(),c->ruleCount(),cc2,ccLength2,ccWatch2,qosBucket)) { case DOZTFILTER_NO_MATCH: case DOZTFILTER_DROP: // explicit DROP in a capability just terminates its evaluation and is an anti-pattern break; diff --git a/node/Network.hpp b/node/Network.hpp index 38f03eb3..2baab511 100644 --- a/node/Network.hpp +++ b/node/Network.hpp @@ -132,7 +132,8 @@ public: const uint8_t *frameData, const unsigned int frameLen, const unsigned int etherType, - const unsigned int vlanId); + const unsigned int vlanId, + uint8_t &qosBucket); /** * Apply filters to an incoming packet @@ -297,6 +298,13 @@ public: */ void learnBridgeRoute(const MAC &mac,const Address &addr); + /** + * Whether QoS is in effect for this network + */ + bool QoSEnabled() { + return false; + } + /** * Learn a multicast group that is bridged to our tap device * diff --git a/node/Node.cpp b/node/Node.cpp index 9b10dfdd..576b2e4a 100644 --- a/node/Node.cpp +++ b/node/Node.cpp @@ -368,6 +368,7 @@ ZT_ResultCode Node::leave(uint64_t nwid,void **uptr,void *tptr) { Mutex::Lock _l(_networks_m); SharedPtr *nw = _networks.get(nwid); + RR->sw->removeNetworkQoSControlBlock(nwid); if (!nw) return ZT_RESULT_OK; if (uptr) diff --git a/node/OutboundMulticast.cpp b/node/OutboundMulticast.cpp index d7a7b4d8..2391771f 100644 --- a/node/OutboundMulticast.cpp +++ b/node/OutboundMulticast.cpp @@ -85,7 +85,8 @@ void OutboundMulticast::sendOnly(const RuntimeEnvironment *RR,void *tPtr,const A { const SharedPtr nw(RR->node->network(_nwid)); const Address toAddr2(toAddr); - if ((nw)&&(nw->filterOutgoingPacket(tPtr,true,RR->identity.address(),toAddr2,_macSrc,_macDest,_frameData,_frameLen,_etherType,0))) { + uint8_t QoSBucket = 255; // Dummy value + if ((nw)&&(nw->filterOutgoingPacket(tPtr,true,RR->identity.address(),toAddr2,_macSrc,_macDest,_frameData,_frameLen,_etherType,0,QoSBucket))) { _packet.newInitializationVector(); _packet.setDestination(toAddr2); RR->node->expectReplyTo(_packet.packetId()); diff --git a/node/Switch.cpp b/node/Switch.cpp index d53bf53e..fddbd581 100644 --- a/node/Switch.cpp +++ b/node/Switch.cpp @@ -266,6 +266,8 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const } } + uint8_t qosBucket = ZT_QOS_DEFAULT_BUCKET; + if (to.isMulticast()) { MulticastGroup multicastGroup(to,0); @@ -383,7 +385,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const network->learnBridgedMulticastGroup(tPtr,multicastGroup,RR->node->now()); // First pass sets noTee to false, but noTee is set to true in OutboundMulticast to prevent duplicates. - if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),Address(),from,to,(const uint8_t *)data,len,etherType,vlanId)) { + if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),Address(),from,to,(const uint8_t *)data,len,etherType,vlanId,qosBucket)) { RR->t->outgoingNetworkFrameDropped(tPtr,network,from,to,etherType,vlanId,len,"filter blocked"); return; } @@ -407,7 +409,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const Address toZT(to.toAddress(network->id())); // since in-network MACs are derived from addresses and network IDs, we can reverse this SharedPtr toPeer(RR->topology->getPeer(tPtr,toZT)); - if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),toZT,from,to,(const uint8_t *)data,len,etherType,vlanId)) { + if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),toZT,from,to,(const uint8_t *)data,len,etherType,vlanId,qosBucket)) { RR->t->outgoingNetworkFrameDropped(tPtr,network,from,to,etherType,vlanId,len,"filter blocked"); return; } @@ -422,7 +424,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const outp.append(data,len); if (!network->config().disableCompression()) outp.compress(); - send(tPtr,outp,true); + aqm_enqueue(tPtr,network,outp,true,qosBucket); } else { Packet outp(toZT,RR->identity.address(),Packet::VERB_FRAME); outp.append(network->id()); @@ -430,7 +432,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const outp.append(data,len); if (!network->config().disableCompression()) outp.compress(); - send(tPtr,outp,true); + aqm_enqueue(tPtr,network,outp,true,qosBucket); } } else { @@ -439,7 +441,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const // We filter with a NULL destination ZeroTier address first. Filtrations // for each ZT destination are also done below. This is the same rationale // and design as for multicast. - if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),Address(),from,to,(const uint8_t *)data,len,etherType,vlanId)) { + if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),Address(),from,to,(const uint8_t *)data,len,etherType,vlanId,qosBucket)) { RR->t->outgoingNetworkFrameDropped(tPtr,network,from,to,etherType,vlanId,len,"filter blocked"); return; } @@ -477,7 +479,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const } for(unsigned int b=0;bfilterOutgoingPacket(tPtr,true,RR->identity.address(),bridges[b],from,to,(const uint8_t *)data,len,etherType,vlanId)) { + if (network->filterOutgoingPacket(tPtr,true,RR->identity.address(),bridges[b],from,to,(const uint8_t *)data,len,etherType,vlanId,qosBucket)) { Packet outp(bridges[b],RR->identity.address(),Packet::VERB_EXT_FRAME); outp.append(network->id()); outp.append((uint8_t)0x00); @@ -487,7 +489,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const outp.append(data,len); if (!network->config().disableCompression()) outp.compress(); - send(tPtr,outp,true); + aqm_enqueue(tPtr,network,outp,true,qosBucket); } else { RR->t->outgoingNetworkFrameDropped(tPtr,network,from,to,etherType,vlanId,len,"filter blocked (bridge replication)"); } @@ -495,6 +497,263 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr &network,const } } +void Switch::aqm_enqueue(void *tPtr, const SharedPtr &network, Packet &packet,bool encrypt,int qosBucket) +{ + if(!network->QoSEnabled()) { + send(tPtr, packet, encrypt); + return; + } + NetworkQoSControlBlock *nqcb = _netQueueControlBlock[network->id()]; + if (!nqcb) { + // DEBUG_INFO("creating network QoS control block (NQCB) for network %llx", network->id()); + nqcb = new NetworkQoSControlBlock(); + _netQueueControlBlock[network->id()] = nqcb; + // Initialize ZT_QOS_NUM_BUCKETS queues and place them in the INACTIVE list + // These queues will be shuffled between the new/old/inactive lists by the enqueue/dequeue algorithm + for (int i=0; iinactiveQueues.push_back(new ManagedQueue(i)); + } + } + + if (packet.verb() != Packet::VERB_FRAME && packet.verb() != Packet::VERB_EXT_FRAME) { + // DEBUG_INFO("skipping, no QoS for this packet, verb=%x", packet.verb()); + // just send packet normally, no QoS for ZT protocol traffic + send(tPtr, packet, encrypt); + } + + _aqm_m.lock(); + + // Enqueue packet and move queue to appropriate list + + const Address dest(packet.destination()); + TXQueueEntry *txEntry = new TXQueueEntry(dest,RR->node->now(),packet,encrypt); + + ManagedQueue *selectedQueue = nullptr; + for (int i=0; ioldQueues.size()) { // search old queues first (I think this is best since old would imply most recent usage of the queue) + if (nqcb->oldQueues[i]->id == qosBucket) { + selectedQueue = nqcb->oldQueues[i]; + } + } if (i < nqcb->newQueues.size()) { // search new queues (this would imply not often-used queues) + if (nqcb->newQueues[i]->id == qosBucket) { + selectedQueue = nqcb->newQueues[i]; + } + } if (i < nqcb->inactiveQueues.size()) { // search inactive queues + if (nqcb->inactiveQueues[i]->id == qosBucket) { + selectedQueue = nqcb->inactiveQueues[i]; + // move queue to end of NEW queue list + selectedQueue->byteCredit = ZT_QOS_QUANTUM; + // DEBUG_INFO("moving q=%p from INACTIVE to NEW list", selectedQueue); + nqcb->newQueues.push_back(selectedQueue); + nqcb->inactiveQueues.erase(nqcb->inactiveQueues.begin() + i); + } + } + } + if (!selectedQueue) { + return; + } + + selectedQueue->q.push_back(txEntry); + selectedQueue->byteLength+=txEntry->packet.payloadLength(); + nqcb->_currEnqueuedPackets++; + + // DEBUG_INFO("nq=%2lu, oq=%2lu, iq=%2lu, nqcb.size()=%3d, bucket=%2d, q=%p", nqcb->newQueues.size(), nqcb->oldQueues.size(), nqcb->inactiveQueues.size(), nqcb->_currEnqueuedPackets, qosBucket, selectedQueue); + + // Drop a packet if necessary + ManagedQueue *selectedQueueToDropFrom = nullptr; + if (nqcb->_currEnqueuedPackets > ZT_QOS_MAX_ENQUEUED_PACKETS) + { + // DEBUG_INFO("too many enqueued packets (%d), finding packet to drop", nqcb->_currEnqueuedPackets); + int maxQueueLength = 0; + for (int i=0; ioldQueues.size()) { + if (nqcb->oldQueues[i]->byteLength > maxQueueLength) { + maxQueueLength = nqcb->oldQueues[i]->byteLength; + selectedQueueToDropFrom = nqcb->oldQueues[i]; + } + } if (i < nqcb->newQueues.size()) { + if (nqcb->newQueues[i]->byteLength > maxQueueLength) { + maxQueueLength = nqcb->newQueues[i]->byteLength; + selectedQueueToDropFrom = nqcb->newQueues[i]; + } + } if (i < nqcb->inactiveQueues.size()) { + if (nqcb->inactiveQueues[i]->byteLength > maxQueueLength) { + maxQueueLength = nqcb->inactiveQueues[i]->byteLength; + selectedQueueToDropFrom = nqcb->inactiveQueues[i]; + } + } + } + if (selectedQueueToDropFrom) { + // DEBUG_INFO("dropping packet from head of largest queue (%d payload bytes)", maxQueueLength); + int sizeOfDroppedPacket = selectedQueueToDropFrom->q.front()->packet.payloadLength(); + delete selectedQueueToDropFrom->q.front(); + selectedQueueToDropFrom->q.pop_front(); + selectedQueueToDropFrom->byteLength-=sizeOfDroppedPacket; + nqcb->_currEnqueuedPackets--; + } + } + _aqm_m.unlock(); + aqm_dequeue(tPtr); +} + +uint64_t Switch::control_law(uint64_t t, int count) +{ + return t + ZT_QOS_INTERVAL / sqrt(count); +} + +Switch::dqr Switch::dodequeue(ManagedQueue *q, uint64_t now) +{ + dqr r; + r.ok_to_drop = false; + r.p = q->q.front(); + + if (r.p == NULL) { + q->first_above_time = 0; + return r; + } + uint64_t sojourn_time = now - r.p->creationTime; + if (sojourn_time < ZT_QOS_TARGET || q->byteLength <= ZT_DEFAULT_MTU) { + // went below - stay below for at least interval + q->first_above_time = 0; + } else { + if (q->first_above_time == 0) { + // just went above from below. if still above at + // first_above_time, will say it's ok to drop. + q->first_above_time = now + ZT_QOS_INTERVAL; + } else if (now >= q->first_above_time) { + r.ok_to_drop = true; + } + } + return r; +} + +Switch::TXQueueEntry * Switch::CoDelDequeue(ManagedQueue *q, bool isNew, uint64_t now) +{ + dqr r = dodequeue(q, now); + + if (q->dropping) { + if (!r.ok_to_drop) { + q->dropping = false; + } + while (now >= q->drop_next && q->dropping) { + q->q.pop_front(); // drop + r = dodequeue(q, now); + if (!r.ok_to_drop) { + // leave dropping state + q->dropping = false; + } else { + ++(q->count); + // schedule the next drop. + q->drop_next = control_law(q->drop_next, q->count); + } + } + } else if (r.ok_to_drop) { + q->q.pop_front(); // drop + r = dodequeue(q, now); + q->dropping = true; + q->count = (q->count > 2 && now - q->drop_next < 8*ZT_QOS_INTERVAL)? + q->count - 2 : 1; + q->drop_next = control_law(now, q->count); + } + return r.p; +} + +void Switch::aqm_dequeue(void *tPtr) +{ + // Cycle through network-specific QoS control blocks + for(std::map::iterator nqcb(_netQueueControlBlock.begin());nqcb!=_netQueueControlBlock.end();) { + if (!(*nqcb).second->_currEnqueuedPackets) { + return; + } + + uint64_t now = RR->node->now(); + TXQueueEntry *entryToEmit = nullptr; + std::vector *currQueues = &((*nqcb).second->newQueues); + std::vector *oldQueues = &((*nqcb).second->oldQueues); + std::vector *inactiveQueues = &((*nqcb).second->inactiveQueues); + + _aqm_m.lock(); + + // Attempt dequeue from queues in NEW list + bool examiningNewQueues = true; + while (currQueues->size()) { + ManagedQueue *queueAtFrontOfList = currQueues->front(); + if (queueAtFrontOfList->byteCredit < 0) { + queueAtFrontOfList->byteCredit += ZT_QOS_QUANTUM; + // Move to list of OLD queues + // DEBUG_INFO("moving q=%p from NEW to OLD list", queueAtFrontOfList); + oldQueues->push_back(queueAtFrontOfList); + currQueues->erase(currQueues->begin()); + } else { + entryToEmit = CoDelDequeue(queueAtFrontOfList, examiningNewQueues, now); + if (!entryToEmit) { + // Move to end of list of OLD queues + // DEBUG_INFO("moving q=%p from NEW to OLD list", queueAtFrontOfList); + oldQueues->push_back(queueAtFrontOfList); + currQueues->erase(currQueues->begin()); + } + else { + int len = entryToEmit->packet.payloadLength(); + queueAtFrontOfList->byteLength -= len; + queueAtFrontOfList->byteCredit -= len; + // Send the packet! + queueAtFrontOfList->q.pop_front(); + send(tPtr, entryToEmit->packet, entryToEmit->encrypt); + (*nqcb).second->_currEnqueuedPackets--; + } + if (queueAtFrontOfList) { + //DEBUG_INFO("dequeuing from q=%p, len=%lu in NEW list (byteCredit=%d)", queueAtFrontOfList, queueAtFrontOfList->q.size(), queueAtFrontOfList->byteCredit); + } + break; + } + } + + // Attempt dequeue from queues in OLD list + examiningNewQueues = false; + currQueues = &((*nqcb).second->oldQueues); + while (currQueues->size()) { + ManagedQueue *queueAtFrontOfList = currQueues->front(); + if (queueAtFrontOfList->byteCredit < 0) { + queueAtFrontOfList->byteCredit += ZT_QOS_QUANTUM; + oldQueues->push_back(queueAtFrontOfList); + currQueues->erase(currQueues->begin()); + } else { + entryToEmit = CoDelDequeue(queueAtFrontOfList, examiningNewQueues, now); + if (!entryToEmit) { + //DEBUG_INFO("moving q=%p from OLD to INACTIVE list", queueAtFrontOfList); + // Move to inactive list of queues + inactiveQueues->push_back(queueAtFrontOfList); + currQueues->erase(currQueues->begin()); + } + else { + int len = entryToEmit->packet.payloadLength(); + queueAtFrontOfList->byteLength -= len; + queueAtFrontOfList->byteCredit -= len; + queueAtFrontOfList->q.pop_front(); + send(tPtr, entryToEmit->packet, entryToEmit->encrypt); + (*nqcb).second->_currEnqueuedPackets--; + } + if (queueAtFrontOfList) { + //DEBUG_INFO("dequeuing from q=%p, len=%lu in OLD list (byteCredit=%d)", queueAtFrontOfList, queueAtFrontOfList->q.size(), queueAtFrontOfList->byteCredit); + } + break; + } + } + nqcb++; + _aqm_m.unlock(); + } +} + +void Switch::removeNetworkQoSControlBlock(uint64_t nwid) +{ + NetworkQoSControlBlock *nq = _netQueueControlBlock[nwid]; + if (nq) { + _netQueueControlBlock.erase(nwid); + delete nq; + nq = NULL; + } +} + void Switch::send(void *tPtr,Packet &packet,bool encrypt) { const Address dest(packet.destination()); @@ -550,6 +809,7 @@ void Switch::doAnythingWaitingForPeer(void *tPtr,const SharedPtr &peer) { Mutex::Lock _l(_txQueue_m); + for(std::list< TXQueueEntry >::iterator txi(_txQueue.begin());txi!=_txQueue.end();) { if (txi->dest == peer->address()) { if (_trySend(tPtr,txi->packet,txi->encrypt)) { @@ -574,6 +834,7 @@ unsigned long Switch::doTimerTasks(void *tPtr,int64_t now) std::vector
needWhois; { Mutex::Lock _l(_txQueue_m); + for(std::list< TXQueueEntry >::iterator txi(_txQueue.begin());txi!=_txQueue.end();) { if (_trySend(tPtr,txi->packet,txi->encrypt)) { _txQueue.erase(txi++); diff --git a/node/Switch.hpp b/node/Switch.hpp index 906f418e..5f60fc46 100644 --- a/node/Switch.hpp +++ b/node/Switch.hpp @@ -59,6 +59,14 @@ class Peer; */ class Switch { + struct ManagedQueue; + struct TXQueueEntry; + + typedef struct { + TXQueueEntry *p; + bool ok_to_drop; + } dqr; + public: Switch(const RuntimeEnvironment *renv); @@ -87,6 +95,62 @@ public: */ void onLocalEthernet(void *tPtr,const SharedPtr &network,const MAC &from,const MAC &to,unsigned int etherType,unsigned int vlanId,const void *data,unsigned int len); + /** + * Determines the next drop schedule for packets in the TX queue + * + * @param t Current time + * @param count Number of packets dropped this round + */ + uint64_t control_law(uint64_t t, int count); + + /** + * Selects a packet eligible for transmission from a TX queue. According to the control law, multiple packets + * may be intentionally dropped before a packet is returned to the AQM scheduler. + * + * @param q The TX queue that is being dequeued from + * @param now Current time + */ + dqr dodequeue(ManagedQueue *q, uint64_t now); + + /** + * Presents a packet to the AQM scheduler. + * + * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call + * @param network Network that the packet shall be sent over + * @param packet Packet to be sent + * @param encrypt Encrypt packet payload? (always true except for HELLO) + * @param qosBucket Which bucket the rule-system determined this packet should fall into + */ + void aqm_enqueue(void *tPtr, const SharedPtr &network, Packet &packet,bool encrypt,int qosBucket); + + /** + * Performs a single AQM cycle and dequeues and transmits all eligible packets on all networks + * + * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call + */ + void aqm_dequeue(void *tPtr); + + /** + * Calls the dequeue mechanism and adjust queue state variables + * + * @param q The TX queue that is being dequeued from + * @param isNew Whether or not this queue is in the NEW list + * @param now Current time + */ + Switch::TXQueueEntry * CoDelDequeue(ManagedQueue *q, bool isNew, uint64_t now); + + /** + * Removes QoS Queues and flow state variables for a specific network. These queues are created + * automatically upon the transmission of the first packet from this peer to another peer on the + * given network. + * + * The reason for existence of queues and flow state variables specific to each network is so that + * each network's QoS rules function independently. + * + * @param nwid Network ID + */ + void removeNetworkQoSControlBlock(uint64_t nwid); + /** * Send a packet to a ZeroTier address (destination in packet) * @@ -199,6 +263,7 @@ private: }; std::list< TXQueueEntry > _txQueue; Mutex _txQueue_m; + Mutex _aqm_m; // Tracks sending of VERB_RENDEZVOUS to relaying peers struct _LastUniteKey @@ -220,6 +285,35 @@ private: }; Hashtable< _LastUniteKey,uint64_t > _lastUniteAttempt; // key is always sorted in ascending order, for set-like behavior Mutex _lastUniteAttempt_m; + + // Queue with additional flow state variables + struct ManagedQueue + { + ManagedQueue(int id) : + id(id), + byteCredit(ZT_QOS_QUANTUM), + byteLength(0), + dropping(false) + {} + int id; + int byteCredit; + int byteLength; + uint64_t first_above_time; + uint32_t count; + uint64_t drop_next; + bool dropping; + uint64_t drop_next_time; + std::list< TXQueueEntry *> q; + }; + // To implement fq_codel we need to maintain a queue of queues + struct NetworkQoSControlBlock + { + int _currEnqueuedPackets; + std::vector newQueues; + std::vector oldQueues; + std::vector inactiveQueues; + }; + std::map _netQueueControlBlock; }; } // namespace ZeroTier -- cgit v1.2.3