From 6a2ba4baca326272c45930208b70cfedf8cb1638 Mon Sep 17 00:00:00 2001
From: Joseph Henry <josephjah@gmail.com>
Date: Tue, 1 May 2018 16:32:15 -0700
Subject: Introduced basic multipath support

---
 node/Node.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'node/Node.cpp')
diff --git a/node/Node.cpp b/node/Node.cpp
index db511430..71e5b6a7 100644
--- a/node/Node.cpp
+++ b/node/Node.cpp
@@ -234,7 +234,7 @@ public:
 			}
 
 			if ((!contacted)&&(_bestCurrentUpstream)) {
-				const SharedPtr<Path> up(_bestCurrentUpstream->getBestPath(_now,true));
+				const SharedPtr<Path> up(_bestCurrentUpstream->getAppropriatePath(_now,true));
 				if (up)
 					p->sendHELLO(_tPtr,up->localSocket(),up->address(),_now);
 			}
@@ -465,7 +465,7 @@ ZT_PeerList *Node::peers() const
 		p->role = RR->topology->role(pi->second->identity().address());
 
 		std::vector< SharedPtr<Path> > paths(pi->second->paths(_now));
-		SharedPtr<Path> bestp(pi->second->getBestPath(_now,false));
+		SharedPtr<Path> bestp(pi->second->getAppropriatePath(_now,false));
 		p->pathCount = 0;
 		for(std::vector< SharedPtr<Path> >::iterator path(paths.begin());path!=paths.end();++path) {
 			ZT_FAST_MEMCPY(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage));
-- 
cgit v1.2.3


From 17fbb020e733fab8d8be933bb4981927015a10f5 Mon Sep 17 00:00:00 2001
From: Joseph Henry <josephjah@gmail.com>
Date: Thu, 14 Jun 2018 16:34:45 -0700
Subject: Added multipath field to zerotier-cli status output. Adjusted how
 path estimates are computed and cached

---
 include/ZeroTierOne.h  | 57 +++++++++++++++++++++++++++++++-----
 node/Node.cpp          | 11 +++++++
 node/Path.hpp          | 79 ++++++++++++++++++++++++++++++--------------------
 node/Peer.cpp          |  4 +--
 node/Peer.hpp          | 14 +++++----
 service/OneService.cpp | 51 +++++++++++++++++++++++++++++++-
 6 files changed, 169 insertions(+), 47 deletions(-)

(limited to 'node/Node.cpp')

diff --git a/include/ZeroTierOne.h b/include/ZeroTierOne.h
index b3927c2e..a100afd9 100644
--- a/include/ZeroTierOne.h
+++ b/include/ZeroTierOne.h
@@ -449,13 +449,6 @@ enum ZT_MultipathMode
 	 * Will cease sending traffic over links that appear to be stale.
 	 */
 	ZT_MULTIPATH_PROPORTIONALLY_BALANCED = 2,
-
-	/**
-	 * Traffic is allocated across a user-defined interface/allocation
-	 *
-	 * Will cease sending traffic over links that appear to be stale.
-	 */
-	ZT_MULTIPATH_MANUALLY_BALANCED = 3
 };
 
 /**
@@ -1221,6 +1214,56 @@ typedef struct
 	 */
 	uint64_t trustedPathId;
 
+	/**
+	 * One-way latency
+	 */
+	float latency;
+
+	/**
+	 * How much latency varies over time
+	 */
+	float packetDelayVariance;
+
+	/**
+	 * How much observed throughput varies over time
+	 */
+	float throughputDisturbCoeff;
+
+	/**
+	 * Packet Error Ratio (PER)
+	 */
+	float packetErrorRatio;
+
+	/**
+	 * Packet Loss Ratio (PLR)
+	 */
+	float packetLossRatio;
+
+	/**
+	 * Stability of the path
+	 */
+	float stability;
+
+	/**
+	 * Current throughput (moving average)
+	 */
+	uint64_t throughput;
+
+	/**
+	 * Maximum observed throughput for this path
+	 */
+	uint64_t maxThroughput;
+
+	/**
+	 * Percentage of traffic allocated to this path
+	 */
+	float allocation;
+
+	/**
+	 * Name of physical interface (for monitoring)
+	 */
+	char *ifname;
+
 	/**
 	 * Is path expired?
 	 */
diff --git a/node/Node.cpp b/node/Node.cpp
index 71e5b6a7..24deeae2 100644
--- a/node/Node.cpp
+++ b/node/Node.cpp
@@ -474,6 +474,17 @@ ZT_PeerList *Node::peers() const
 			p->paths[p->pathCount].trustedPathId = RR->topology->getOutboundPathTrust((*path)->address());
 			p->paths[p->pathCount].expired = 0;
 			p->paths[p->pathCount].preferred = ((*path) == bestp) ? 1 : 0;
+			p->paths[p->pathCount].latency = (*path)->latency();
+			p->paths[p->pathCount].packetDelayVariance = (*path)->packetDelayVariance();
+			p->paths[p->pathCount].throughputDisturbCoeff = (*path)->throughputDisturbanceCoefficient();
+			p->paths[p->pathCount].packetErrorRatio = (*path)->packetErrorRatio();
+			p->paths[p->pathCount].packetLossRatio = (*path)->packetLossRatio();
+			p->paths[p->pathCount].stability = (*path)->lastComputedStability();
+			p->paths[p->pathCount].throughput = (*path)->meanThroughput();
+			p->paths[p->pathCount].maxThroughput = (*path)->maxLifetimeThroughput();
+			p->paths[p->pathCount].allocation = (*path)->allocation();
+			p->paths[p->pathCount].ifname = (*path)->getName();
+
 			++p->pathCount;
 		}
 	}
diff --git a/node/Path.hpp b/node/Path.hpp
index 30655877..6162be20 100644
--- a/node/Path.hpp
+++ b/node/Path.hpp
@@ -111,15 +111,16 @@ public:
 		_expectingAckAsOf(0),
 		_packetsReceivedSinceLastAck(0),
 		_packetsReceivedSinceLastQoS(0),
-		_meanThroughput(0.0),
 		_maxLifetimeThroughput(0),
+		_lastComputedMeanThroughput(0),
 		_bytesAckedSinceLastThroughputEstimation(0),
-		_meanLatency(0.0),
-		_packetDelayVariance(0.0),
-		_packetErrorRatio(0.0),
-		_packetLossRatio(0),
+		_lastComputedMeanLatency(0.0),
+		_lastComputedPacketDelayVariance(0.0),
+		_lastComputedPacketErrorRatio(0.0),
+		_lastComputedPacketLossRatio(0),
 		_lastComputedStability(0.0),
 		_lastComputedRelativeQuality(0),
+		_lastComputedThroughputDistCoeff(0.0),
 		_lastAllocation(0.0)
 	{
 		prepareBuffers();
@@ -142,15 +143,16 @@ public:
 		_expectingAckAsOf(0),
 		_packetsReceivedSinceLastAck(0),
 		_packetsReceivedSinceLastQoS(0),
-		_meanThroughput(0.0),
 		_maxLifetimeThroughput(0),
+		_lastComputedMeanThroughput(0),
 		_bytesAckedSinceLastThroughputEstimation(0),
-		_meanLatency(0.0),
-		_packetDelayVariance(0.0),
-		_packetErrorRatio(0.0),
-		_packetLossRatio(0),
+		_lastComputedMeanLatency(0.0),
+		_lastComputedPacketDelayVariance(0.0),
+		_lastComputedPacketErrorRatio(0.0),
+		_lastComputedPacketLossRatio(0),
 		_lastComputedStability(0.0),
 		_lastComputedRelativeQuality(0),
+		_lastComputedThroughputDistCoeff(0.0),
 		_lastAllocation(0.0)
 	{
 		prepareBuffers();
@@ -162,9 +164,11 @@ public:
 		delete _throughputSamples;
 		delete _latencySamples;
 		delete _packetValiditySamples;
+		delete _throughputDisturbanceSamples;
 		_throughputSamples = NULL;
 		_latencySamples = NULL;
 		_packetValiditySamples = NULL;
+		_throughputDisturbanceSamples = NULL;
 	}
 
 	/**
@@ -311,7 +315,7 @@ public:
 	inline void recordOutgoingPacket(int64_t now, int64_t packetId, uint16_t payloadLength, Packet::Verb verb)
 	{
 		Mutex::Lock _l(_statistics_m);
-		if (verb == Packet::VERB_FRAME || verb == Packet::VERB_EXT_FRAME) {
+		if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) {
 			if (packetId % 2 == 0) { // even -> use for ACK
 				_unackedBytes += payloadLength;
 				// Take note that we're expecting a VERB_ACK on this path as of a specific time
@@ -336,7 +340,7 @@ public:
 	inline void recordIncomingPacket(int64_t now, int64_t packetId, uint16_t payloadLength, Packet::Verb verb)
 	{
 		Mutex::Lock _l(_statistics_m);
-		if (verb == Packet::VERB_FRAME || verb == Packet::VERB_EXT_FRAME) {
+		if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) {
 			if (packetId % 2 == 0) { // even -> use for ACK
 				_inACKRecords[packetId] = payloadLength;
 				_packetsReceivedSinceLastAck++;
@@ -497,14 +501,14 @@ public:
 	inline int64_t ackAge(int64_t now) { return _expectingAckAsOf ? now - _expectingAckAsOf : 0; }
 
 	/**
-	 * The maximum observed throughput for this path
+	 * The maximum observed throughput (in bits/s) for this path
 	 */
 	inline uint64_t maxLifetimeThroughput() { return _maxLifetimeThroughput; }
 
 	/**
 	 * @return The mean throughput (in bits/s) of this link
 	 */
-	inline float meanThroughput() { return _meanThroughput; }
+	inline uint64_t meanThroughput() { return _lastComputedMeanThroughput; }
 
 	/**
 	 * Assign a new relative quality value for this path in the aggregate link
@@ -543,22 +547,22 @@ public:
 	/**
 	 * @return Packet delay variance
 	 */
-	inline float packetDelayVariance() { return _packetDelayVariance; }
+	inline float packetDelayVariance() { return _lastComputedPacketDelayVariance; }
 
 	/**
 	 * @return Previously-computed mean latency
 	 */
-	inline float meanLatency() { return _meanLatency; }
+	inline float meanLatency() { return _lastComputedMeanLatency; }
 
 	/**
 	 * @return Packet loss rate (PLR)
 	 */
-	inline float packetLossRatio() { return _packetLossRatio; }
+	inline float packetLossRatio() { return _lastComputedPacketLossRatio; }
 
 	/**
 	 * @return Packet error ratio (PER)
 	 */
-	inline float packetErrorRatio() { return _packetErrorRatio; }
+	inline float packetErrorRatio() { return _lastComputedPacketErrorRatio; }
 
 	/**
 	 * Record an invalid incoming packet. This packet failed MAC/compression/cipher checks and will now
@@ -571,38 +575,46 @@ public:
 	 */
 	inline char *getAddressString() { return _addrString; }
 
+	/**
+	 * @return The current throughput disturbance coefficient
+	 */
+	inline float throughputDisturbanceCoefficient() { return _lastComputedThroughputDistCoeff; }
+
 	/**
 	 * Compute and cache stability and performance metrics. The resultant stability coefficient is a measure of how "well behaved"
 	 * this path is. This figure is substantially different from (but required for the estimation of the path's overall "quality".
 	 *
 	 * @param now Current time
 	 */
-	inline void processBackgroundPathMeasurements(int64_t now, const int64_t peerId) {
+	inline void processBackgroundPathMeasurements(int64_t now) {
 		if (now - _lastPathQualityComputeTime > ZT_PATH_QUALITY_COMPUTE_INTERVAL) {
 			Mutex::Lock _l(_statistics_m);
 			_lastPathQualityComputeTime = now;
 			address().toString(_addrString);
-			_meanThroughput = _throughputSamples->mean();
-			_meanLatency = _latencySamples->mean();
-			_packetDelayVariance = _latencySamples->stddev(); // Similar to "jitter" (SEE: RFC 3393, RFC 4689)
+			_lastComputedMeanLatency = _latencySamples->mean();
+			_lastComputedPacketDelayVariance = _latencySamples->stddev(); // Similar to "jitter" (SEE: RFC 3393, RFC 4689)
+			_lastComputedMeanThroughput = (uint64_t)_throughputSamples->mean();
 			// If no packet validity samples, assume PER==0
-			_packetErrorRatio = 1 - (_packetValiditySamples->count() ? _packetValiditySamples->mean() : 1);
+			_lastComputedPacketErrorRatio = 1 - (_packetValiditySamples->count() ? _packetValiditySamples->mean() : 1);
 			// Compute path stability
 			// Normalize measurements with wildly different ranges into a reasonable range
-			float normalized_pdv = Utils::normalize(_packetDelayVariance, 0, ZT_PATH_MAX_PDV, 0, 10);
-			float normalized_la = Utils::normalize(_meanLatency, 0, ZT_PATH_MAX_MEAN_LATENCY, 0, 10);
+			float normalized_pdv = Utils::normalize(_lastComputedPacketDelayVariance, 0, ZT_PATH_MAX_PDV, 0, 10);
+			float normalized_la = Utils::normalize(_lastComputedMeanLatency, 0, ZT_PATH_MAX_MEAN_LATENCY, 0, 10);
 			float throughput_cv = _throughputSamples->mean() > 0 ? _throughputSamples->stddev() / _throughputSamples->mean() : 1;
 			// Form an exponential cutoff and apply contribution weights
 			float pdv_contrib = exp((-1)*normalized_pdv) * ZT_PATH_CONTRIB_PDV;
 			float latency_contrib = exp((-1)*normalized_la) * ZT_PATH_CONTRIB_LATENCY;
+			// Throughput Disturbance Coefficient
 			float throughput_disturbance_contrib = exp((-1)*throughput_cv) * ZT_PATH_CONTRIB_THROUGHPUT_DISTURBANCE;
+			_throughputDisturbanceSamples->push(throughput_cv);
+			_lastComputedThroughputDistCoeff = _throughputDisturbanceSamples->mean();
 			// Obey user-defined ignored contributions
 			pdv_contrib = ZT_PATH_CONTRIB_PDV > 0.0 ? pdv_contrib : 1;
 			latency_contrib = ZT_PATH_CONTRIB_LATENCY > 0.0 ? latency_contrib : 1;
 			throughput_disturbance_contrib = ZT_PATH_CONTRIB_THROUGHPUT_DISTURBANCE > 0.0 ? throughput_disturbance_contrib : 1;
-			// Compute the quality product
+			// Stability
 			_lastComputedStability = pdv_contrib + latency_contrib + throughput_disturbance_contrib;
-			_lastComputedStability *= 1 - _packetErrorRatio;
+			_lastComputedStability *= 1 - _lastComputedPacketErrorRatio;
 			// Prevent QoS records from sticking around for too long
 			std::map<uint64_t,uint64_t>::iterator it = _outQoSRecords.begin();
 			while (it != _outQoSRecords.end()) {
@@ -646,6 +658,7 @@ public:
 		_throughputSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
 		_latencySamples = new RingBuffer<uint32_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
 		_packetValiditySamples = new RingBuffer<bool>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
+		_throughputDisturbanceSamples = new RingBuffer<float>(ZT_PATH_QUALITY_METRIC_WIN_SZ);
 		memset(_ifname, 0, 16);
 		memset(_addrString, 0, sizeof(_addrString));
 	}
@@ -677,19 +690,20 @@ private:
 	int16_t _packetsReceivedSinceLastAck;
 	int16_t _packetsReceivedSinceLastQoS;
 
-	float _meanThroughput;
 	uint64_t _maxLifetimeThroughput;
+	uint64_t _lastComputedMeanThroughput;
 	uint64_t _bytesAckedSinceLastThroughputEstimation;
 
-	volatile float _meanLatency;
-	float _packetDelayVariance;
+	float _lastComputedMeanLatency;
+	float _lastComputedPacketDelayVariance;
 
-	float _packetErrorRatio;
-	float _packetLossRatio;
+	float _lastComputedPacketErrorRatio;
+	float _lastComputedPacketLossRatio;
 
 	// cached estimates
 	float _lastComputedStability;
 	float _lastComputedRelativeQuality;
+	float _lastComputedThroughputDistCoeff;
 	float _lastAllocation;
 
 	// cached human-readable strings for tracing purposes
@@ -699,6 +713,7 @@ private:
 	RingBuffer<uint64_t> *_throughputSamples;
 	RingBuffer<uint32_t> *_latencySamples;
 	RingBuffer<bool> *_packetValiditySamples;
+	RingBuffer<float> *_throughputDisturbanceSamples;
 };
 
 } // namespace ZeroTier
diff --git a/node/Peer.cpp b/node/Peer.cpp
index 55132bba..0f471b07 100644
--- a/node/Peer.cpp
+++ b/node/Peer.cpp
@@ -116,7 +116,7 @@ void Peer::received(
 			}
 			for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 				if (_paths[i].p) {
-					_paths[i].p->processBackgroundPathMeasurements(now, _id.address().toInt());
+					_paths[i].p->processBackgroundPathMeasurements(now);
 				}
 			}
 		}
@@ -415,7 +415,7 @@ SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired)
 
 	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 		if (_paths[i].p) {
-			_paths[i].p->processBackgroundPathMeasurements(now, _id.address().toInt());
+			_paths[i].p->processBackgroundPathMeasurements(now);
 		}
 	}
 
diff --git a/node/Peer.hpp b/node/Peer.hpp
index 9361f665..ddbe6f77 100644
--- a/node/Peer.hpp
+++ b/node/Peer.hpp
@@ -353,14 +353,18 @@ public:
 	inline int64_t isActive(int64_t now) const { return ((now - _lastNontrivialReceive) < ZT_PEER_ACTIVITY_TIMEOUT); }
 
 	/**
-	 * @return Latency in milliseconds of best path or 0xffff if unknown / no paths
+	 * @return Latency in milliseconds of best/aggregate path or 0xffff if unknown / no paths
 	 */
 	inline unsigned int latency(const int64_t now)
 	{
-		SharedPtr<Path> bp(getAppropriatePath(now,false));
-		if (bp)
-			return bp->latency();
-		return 0xffff;
+		if (RR->node->getMultipathMode()) {
+			return (int)computeAggregateLinkMeanLatency();
+		} else {
+			SharedPtr<Path> bp(getAppropriatePath(now,false));
+			if (bp)
+				return bp->latency();
+			return 0xffff;
+		}
 	}
 
 	/**
diff --git a/service/OneService.cpp b/service/OneService.cpp
index e56a4827..a1a9d981 100644
--- a/service/OneService.cpp
+++ b/service/OneService.cpp
@@ -298,6 +298,39 @@ static void _peerToJson(nlohmann::json &pj,const ZT_Peer *peer)
 	pj["paths"] = pa;
 }
 
+static void _peerAggregateLinkToJson(nlohmann::json &pj,const ZT_Peer *peer)
+{
+	char tmp[256];
+	OSUtils::ztsnprintf(tmp,sizeof(tmp),"%.10llx",peer->address);
+	pj["aggregateLinkLatency"] = peer->latency;
+
+	nlohmann::json pa = nlohmann::json::array();
+	for(unsigned int i=0;i<peer->pathCount;++i) {
+		//int64_t lastSend = peer->paths[i].lastSend;
+		//int64_t lastReceive = peer->paths[i].lastReceive;
+		nlohmann::json j;
+		j["address"] = reinterpret_cast<const InetAddress *>(&(peer->paths[i].address))->toString(tmp);
+		//j["lastSend"] = (lastSend < 0) ? 0 : lastSend;
+		//j["lastReceive"] = (lastReceive < 0) ? 0 : lastReceive;
+		//j["trustedPathId"] = peer->paths[i].trustedPathId;
+		//j["active"] = (bool)(peer->paths[i].expired == 0);
+		//j["expired"] = (bool)(peer->paths[i].expired != 0);
+		//j["preferred"] = (bool)(peer->paths[i].preferred != 0);
+		j["latency"] = peer->paths[i].latency;
+		//j["packetDelayVariance"] = peer->paths[i].packetDelayVariance;
+		//j["throughputDisturbCoeff"] = peer->paths[i].throughputDisturbCoeff;
+		//j["packetErrorRatio"] = peer->paths[i].packetErrorRatio;
+		//j["packetLossRatio"] = peer->paths[i].packetLossRatio;
+		j["stability"] = peer->paths[i].stability;
+		j["throughput"] = peer->paths[i].throughput;
+		//j["maxThroughput"] = peer->paths[i].maxThroughput;
+		j["allocation"] = peer->paths[i].allocation;
+		j["ifname"] = peer->paths[i].ifname;
+		pa.push_back(j);
+	}
+	pj["paths"] = pa;
+}
+
 static void _moonToJson(nlohmann::json &mj,const World &world)
 {
 	char tmp[4096];
@@ -1189,7 +1222,23 @@ public:
 					json &settings = res["config"]["settings"];
 					settings["primaryPort"] = OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff;
 					settings["allowTcpFallbackRelay"] = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],_allowTcpFallbackRelay);
-					settings["multipathMode"] = OSUtils::jsonInt(settings["multipathMode"],_multipathMode);
+
+					if (_multipathMode) {
+						json &multipathConfig = res["multipath"];
+						ZT_PeerList *pl = _node->peers();
+						char peerAddrStr[256];
+						if (pl) {
+							for(unsigned long i=0;i<pl->peerCount;++i) {
+								if (pl->peers[i].role == ZT_PEER_ROLE_LEAF) {
+									nlohmann::json pj;
+									_peerAggregateLinkToJson(pj,&(pl->peers[i]));
+									OSUtils::ztsnprintf(peerAddrStr,sizeof(peerAddrStr),"%.10llx",pl->peers[i].address);
+									multipathConfig[peerAddrStr] = (pj);
+								}
+							}
+						}
+					}
+
 #ifdef ZT_USE_MINIUPNPC
 					settings["portMappingEnabled"] = OSUtils::jsonBool(settings["portMappingEnabled"],true);
 #else
-- 
cgit v1.2.3


From bdcdccfcc3157e62a4bc8078e583876cbef41223 Mon Sep 17 00:00:00 2001
From: Joseph Henry <josephjah@gmail.com>
Date: Fri, 22 Jun 2018 16:30:20 -0700
Subject: Improved path selection, more efficient traffic allocation, lower
 QoS/ACK overhead

---
 include/ZeroTierOne.h  |   5 ++
 node/Constants.hpp     |  15 ++++-
 node/Node.cpp          |   6 +-
 node/Path.hpp          |  20 +++----
 node/Peer.cpp          | 150 ++++++++++++++++++++++++++-----------------------
 node/Peer.hpp          |  36 ++++++++++--
 service/OneService.cpp |   2 +-
 7 files changed, 143 insertions(+), 91 deletions(-)

(limited to 'node/Node.cpp')

diff --git a/include/ZeroTierOne.h b/include/ZeroTierOne.h
index a100afd9..5b228e17 100644
--- a/include/ZeroTierOne.h
+++ b/include/ZeroTierOne.h
@@ -1315,6 +1315,11 @@ typedef struct
 	 */
 	unsigned int pathCount;
 
+	/**
+	 * Whether this peer was ever reachable via an aggregate link
+	 */
+	bool hadAggregateLink;
+
 	/**
 	 * Known network paths to peer
 	 */
diff --git a/node/Constants.hpp b/node/Constants.hpp
index 0d3692f1..420343ad 100644
--- a/node/Constants.hpp
+++ b/node/Constants.hpp
@@ -274,6 +274,19 @@
  */
 #define ZT_MULTIPATH_BINDER_REFRESH_PERIOD 5000
 
+/**
+ * Packets are only used for QoS/ACK statistical sampling if their packet ID is divisible by
+ * this integer. This is to provide a mechanism for both peers to agree on which packets need
+ * special treatment without having to exchange information. Changing this value would be
+ * a breaking change and would necessitate a protocol version upgrade. Since each incoming and
+ * outgoing packet ID is checked against this value its evaluation is of the form:
+ * (id & (divisor - 1)) == 0, thus the divisor must be a power of 2.
+ *
+ * This value is set at (16) so that given a normally-distributed RNG output we will sample
+ * 1/16th (or ~6.25%) of packets.
+ */
+#define ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR 0x10
+
 /**
  * Time horizon for VERB_QOS_MEASUREMENT and VERB_ACK packet processing cutoff
  */
@@ -384,7 +397,7 @@
 /**
  * Minimum amount of time between each ACK packet
  */
-#define ZT_PATH_ACK_INTERVAL 250
+#define ZT_PATH_ACK_INTERVAL 1000
 
 /**
  * How often an aggregate link statistics report is emitted into this tracing system
diff --git a/node/Node.cpp b/node/Node.cpp
index 24deeae2..9b10dfdd 100644
--- a/node/Node.cpp
+++ b/node/Node.cpp
@@ -450,6 +450,7 @@ ZT_PeerList *Node::peers() const
 	for(std::vector< std::pair< Address,SharedPtr<Peer> > >::iterator pi(peers.begin());pi!=peers.end();++pi) {
 		ZT_Peer *p = &(pl->peers[pl->peerCount++]);
 		p->address = pi->second->address().toInt();
+		p->hadAggregateLink = 0;
 		if (pi->second->remoteVersionKnown()) {
 			p->versionMajor = pi->second->remoteVersionMajor();
 			p->versionMinor = pi->second->remoteVersionMinor();
@@ -466,6 +467,7 @@ ZT_PeerList *Node::peers() const
 
 		std::vector< SharedPtr<Path> > paths(pi->second->paths(_now));
 		SharedPtr<Path> bestp(pi->second->getAppropriatePath(_now,false));
+		p->hadAggregateLink |= pi->second->hasAggregateLink();
 		p->pathCount = 0;
 		for(std::vector< SharedPtr<Path> >::iterator path(paths.begin());path!=paths.end();++path) {
 			ZT_FAST_MEMCPY(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage));
@@ -475,14 +477,14 @@ ZT_PeerList *Node::peers() const
 			p->paths[p->pathCount].expired = 0;
 			p->paths[p->pathCount].preferred = ((*path) == bestp) ? 1 : 0;
 			p->paths[p->pathCount].latency = (*path)->latency();
-			p->paths[p->pathCount].packetDelayVariance = (*path)->packetDelayVariance();
+			p->paths[p->pathCount].packetDelayVariance = (*path)->packetDelayVariance(); 
 			p->paths[p->pathCount].throughputDisturbCoeff = (*path)->throughputDisturbanceCoefficient();
 			p->paths[p->pathCount].packetErrorRatio = (*path)->packetErrorRatio();
 			p->paths[p->pathCount].packetLossRatio = (*path)->packetLossRatio();
 			p->paths[p->pathCount].stability = (*path)->lastComputedStability();
 			p->paths[p->pathCount].throughput = (*path)->meanThroughput();
 			p->paths[p->pathCount].maxThroughput = (*path)->maxLifetimeThroughput();
-			p->paths[p->pathCount].allocation = (*path)->allocation();
+			p->paths[p->pathCount].allocation = (float)(*path)->allocation() / (float)255;
 			p->paths[p->pathCount].ifname = (*path)->getName();
 
 			++p->pathCount;
diff --git a/node/Path.hpp b/node/Path.hpp
index fb202306..cafff8cf 100644
--- a/node/Path.hpp
+++ b/node/Path.hpp
@@ -121,7 +121,7 @@ public:
 		_lastComputedStability(0.0),
 		_lastComputedRelativeQuality(0),
 		_lastComputedThroughputDistCoeff(0.0),
-		_lastAllocation(0.0)
+		_lastAllocation(0)
 	{
 		prepareBuffers();
 	}
@@ -153,7 +153,7 @@ public:
 		_lastComputedStability(0.0),
 		_lastComputedRelativeQuality(0),
 		_lastComputedThroughputDistCoeff(0.0),
-		_lastAllocation(0.0)
+		_lastAllocation(0)
 	{
 		prepareBuffers();
 		_phy->getIfName((PhySocket *)((uintptr_t)_localSocket), _ifname, 16);
@@ -316,12 +316,10 @@ public:
 	{
 		Mutex::Lock _l(_statistics_m);
 		if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) {
-			if (packetId % 2 == 0) { // even -> use for ACK
+			if ((packetId & (ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR - 1)) == 0) {
 				_unackedBytes += payloadLength;
 				// Take note that we're expecting a VERB_ACK on this path as of a specific time
 				_expectingAckAsOf = ackAge(now) > ZT_PATH_ACK_INTERVAL ? _expectingAckAsOf : now;
-			}
-			else { // odd -> use for QoS
 				if (_outQoSRecords.size() < ZT_PATH_MAX_OUTSTANDING_QOS_RECORDS) {
 					_outQoSRecords[packetId] = now;
 				}
@@ -341,11 +339,9 @@ public:
 	{
 		Mutex::Lock _l(_statistics_m);
 		if (verb != Packet::VERB_ACK && verb != Packet::VERB_QOS_MEASUREMENT) {
-			if (packetId % 2 == 0) { // even -> use for ACK
+			if ((packetId & (ZT_PATH_QOS_ACK_PROTOCOL_DIVISOR - 1)) == 0) {
 				_inACKRecords[packetId] = payloadLength;
 				_packetsReceivedSinceLastAck++;
-			}
-			else { // odd -> use for QoS
 				_inQoSRecords[packetId] = now;
 				_packetsReceivedSinceLastQoS++;
 			}
@@ -527,12 +523,12 @@ public:
 	 *
 	 * @param allocation Percentage of traffic to be sent over this path to a peer
 	 */
-	inline void updateComponentAllocationOfAggregateLink(float allocation) { _lastAllocation = allocation; }
+	inline void updateComponentAllocationOfAggregateLink(unsigned char allocation) { _lastAllocation = allocation; }
 
 	/**
 	 * @return Percentage of traffic allocated to this path in the aggregate link
 	 */
-	inline float allocation() { return _lastAllocation; }
+	inline unsigned char allocation() { return _lastAllocation; }
 
 	/**
 	 * @return Stability estimates can become expensive to compute, we cache the most recent result.
@@ -704,7 +700,9 @@ private:
 	float _lastComputedStability;
 	float _lastComputedRelativeQuality;
 	float _lastComputedThroughputDistCoeff;
-	float _lastAllocation;
+	unsigned char _lastAllocation;
+
+
 
 	// cached human-readable strings for tracing purposes
 	char _ifname[16];
diff --git a/node/Peer.cpp b/node/Peer.cpp
index 1d581ab8..21bbfabe 100644
--- a/node/Peer.cpp
+++ b/node/Peer.cpp
@@ -56,6 +56,12 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
 	_lastSentFullHello(0),
 	_lastACKWindowReset(0),
 	_lastQoSWindowReset(0),
+	_lastMultipathCompatibilityCheck(0),
+	_freeRandomByte(0),
+	_uniqueAlivePathCount(0),
+	_localMultipathSupported(false),
+	_remoteMultipathSupported(false),
+	_canUseMultipath(false),
 	_vProto(0),
 	_vMajor(0),
 	_vMinor(0),
@@ -69,6 +75,7 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident
 	_lastAggregateStatsReport(0),
 	_lastAggregateAllocation(0)
 {
+	Utils::getSecureRandom(&_freeRandomByte, 1);
 	if (!myIdentity.agree(peerIdentity,_key,ZT_PEER_SECRET_KEY_LENGTH))
 		throw ZT_EXCEPTION_INVALID_ARGUMENT;
 	_pathChoiceHist = new RingBuffer<int>(ZT_MULTIPATH_PROPORTION_WIN_SZ);
@@ -110,7 +117,7 @@ void Peer::received(
 
 		recordIncomingPacket(tPtr, path, packetId, payloadLength, verb, now);
 
-		if (canUseMultipath()) {
+		if (_canUseMultipath) {
 			if (path->needsToSendQoS(now)) {
 				sendQOS_MEASUREMENT(tPtr, path, path->localSocket(), path->address(), now);
 			}
@@ -145,17 +152,23 @@ void Peer::received(
 			// Paths are redundant if they duplicate an alive path to the same IP or
 			// with the same local socket and address family.
 			bool redundant = false;
+			unsigned int replacePath = ZT_MAX_PEER_NETWORK_PATHS;
 			for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 				if (_paths[i].p) {
-					if ( (_paths[i].p->alive(now)) && ( ((_paths[i].p->localSocket() == path->localSocket())&&(_paths[i].p->address().ss_family == path->address().ss_family)) || (_paths[i].p->address().ipsEqual2(path->address())) ) )  {
+					if ( (_paths[i].p->alive(now)) && ( ((_paths[i].p->localSocket() == path->localSocket())&&(_paths[i].p->address().ss_family == path->address().ss_family)) || (_paths[i].p->address().ipsEqual2(path->address())) ) ) {
 						redundant = true;
 						break;
 					}
+					// If the path is the same address and port, simply assume this is a replacement
+					if ( (_paths[i].p->address().ipsEqual2(path->address()) && (_paths[i].p->address().port() == path->address().port()))) {
+						replacePath = i;
+						break;
+					}
 				} else break;
 			}
-
-			if (!redundant) {
-				unsigned int replacePath = ZT_MAX_PEER_NETWORK_PATHS;
+			// If the path isn't a duplicate of the same localSocket AND we haven't already determined a replacePath,
+			// then find the worst path and replace it.
+			if (!redundant && replacePath == ZT_MAX_PEER_NETWORK_PATHS) {
 				int replacePathQuality = 0;
 				for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 					if (_paths[i].p) {
@@ -169,29 +182,15 @@ void Peer::received(
 						break;
 					}
 				}
-
-				// If we find a pre-existing path with the same address, just replace it.
-				// If we don't find anything we can replace, just use the replacePath that we previously decided on.
-				if (canUseMultipath()) {
-					for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-						if (_paths[i].p) {
-							if ( _paths[i].p->address().ss_family == path->address().ss_family && _paths[i].p->address().ipsEqual2(path->address())) {
-								replacePath = i;
-								break;
-							}
-						}
-					}
-				}
-
-				if (replacePath != ZT_MAX_PEER_NETWORK_PATHS) {
-					if (verb == Packet::VERB_OK) {
-						RR->t->peerLearnedNewPath(tPtr,networkId,*this,path,packetId);
-						_paths[replacePath].lr = now;
-						_paths[replacePath].p = path;
-						_paths[replacePath].priority = 1;
-					} else {
-						attemptToContact = true;
-					}
+			}
+			if (replacePath != ZT_MAX_PEER_NETWORK_PATHS) {
+				if (verb == Packet::VERB_OK) {
+					RR->t->peerLearnedNewPath(tPtr,networkId,*this,path,packetId);
+					_paths[replacePath].lr = now;
+					_paths[replacePath].p = path;
+					_paths[replacePath].priority = 1;
+				} else {
+					attemptToContact = true;
 				}
 			}
 		}
@@ -274,7 +273,9 @@ void Peer::received(
 void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t packetId,
 	uint16_t payloadLength, const Packet::Verb verb, int64_t now)
 {
-	if (localMultipathSupport()) {
+	// Grab second byte from packetId to use as a source of entropy in the next path selection
+	_freeRandomByte = (packetId & 0xFF00) >> 8;
+	if (_canUseMultipath) {
 		path->recordOutgoingPacket(now, packetId, payloadLength, verb);
 	}
 }
@@ -282,7 +283,7 @@ void Peer::recordOutgoingPacket(const SharedPtr<Path> &path, const uint64_t pack
 void Peer::recordIncomingPacket(void *tPtr, const SharedPtr<Path> &path, const uint64_t packetId,
 	uint16_t payloadLength, const Packet::Verb verb, int64_t now)
 {
-	if (localMultipathSupport()) {
+	if (_canUseMultipath) {
 		if (path->needsToSendAck(now)) {
 			sendACK(tPtr, path, path->localSocket(), path->address(), now);
 		}
@@ -323,6 +324,9 @@ void Peer::computeAggregateProportionalAllocation(int64_t now)
 				+ (fmax(1, relThroughput[i]) * ZT_PATH_CONTRIB_THROUGHPUT)
 				+ relScope * ZT_PATH_CONTRIB_SCOPE;
 			relQuality *= age_contrib;
+			// Arbitrary cutoffs
+			relQuality = relQuality > (1.00 / 100.0) ? relQuality : 0.0;
+			relQuality = relQuality < (99.0 / 100.0) ? relQuality : 1.0;
 			totalRelativeQuality += relQuality;
 			_paths[i].p->updateRelativeQuality(relQuality);
 		}
@@ -330,12 +334,12 @@ void Peer::computeAggregateProportionalAllocation(int64_t now)
 	// Convert set of relative performances into an allocation set
 	for(uint16_t i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 		if (_paths[i].p) {
-			_paths[i].p->updateComponentAllocationOfAggregateLink(_paths[i].p->relativeQuality() / totalRelativeQuality);
+			_paths[i].p->updateComponentAllocationOfAggregateLink((_paths[i].p->relativeQuality() / totalRelativeQuality) * 255);
 		}
 	}
 }
 
-float Peer::computeAggregateLinkPacketDelayVariance()
+int Peer::computeAggregateLinkPacketDelayVariance()
 {
 	float pdv = 0.0;
 	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
@@ -346,9 +350,9 @@ float Peer::computeAggregateLinkPacketDelayVariance()
 	return pdv;
 }
 
-float Peer::computeAggregateLinkMeanLatency()
+int Peer::computeAggregateLinkMeanLatency()
 {
-	float ml = 0.0;
+	int ml = 0;
 	int pathCount = 0;
 	for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 		if (_paths[i].p) {
@@ -396,7 +400,7 @@ SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired)
 	 * Send traffic across the highest quality path only. This algorithm will still
 	 * use the old path quality metric from protocol version 9.
 	 */
-	if (!canUseMultipath()) {
+	if (!_canUseMultipath) {
 		long bestPathQuality = 2147483647;
 		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 			if (_paths[i].p) {
@@ -443,15 +447,13 @@ SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired)
 				}
 			}
 		}
-		unsigned int r;
-		Utils::getSecureRandom(&r, 1);
+		unsigned int r = _freeRandomByte;
 		if (numAlivePaths > 0) {
-			// pick a random out of the set deemed "alive"
 			int rf = r % numAlivePaths;
 			return _paths[alivePaths[rf]].p;
 		}
 		else if(numStalePaths > 0) {
-			// resort to trying any non-expired path
+			// Resort to trying any non-expired path
 			int rf = r % numStalePaths;
 			return _paths[stalePaths[rf]].p;
 		}
@@ -461,40 +463,12 @@ SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired)
 	 * Proportionally allocate traffic according to dynamic path quality measurements
 	 */
 	if (RR->node->getMultipathMode() == ZT_MULTIPATH_PROPORTIONALLY_BALANCED) {
-		int numAlivePaths = 0;
-		int numStalePaths = 0;
-		int alivePaths[ZT_MAX_PEER_NETWORK_PATHS];
-		int stalePaths[ZT_MAX_PEER_NETWORK_PATHS];
-		memset(&alivePaths, -1, sizeof(alivePaths));
-		memset(&stalePaths, -1, sizeof(stalePaths));
-		// Attempt to find an excuse not to use the rest of this algorithm
-		// Alive or Stale?
-		for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
-			if (_paths[i].p) {
-				if (_paths[i].p->alive(now)) {
-					alivePaths[numAlivePaths] = i;
-					numAlivePaths++;
-				} else {
-					stalePaths[numStalePaths] = i;
-					numStalePaths++;
-				}
-				// Record a default path to use as a short-circuit for the rest of the algorithm (if needed)
-				bestPath = i;
-			}
-		}
 		if ((now - _lastAggregateAllocation) >= ZT_PATH_QUALITY_COMPUTE_INTERVAL) {
 			_lastAggregateAllocation = now;
 			computeAggregateProportionalAllocation(now);
 		}
-		if (numAlivePaths == 0 && numStalePaths == 0) {
-			return SharedPtr<Path>();
-		} if (numAlivePaths == 1 || numStalePaths == 1) {
-			return _paths[bestPath].p;
-		}
 		// Randomly choose path according to their allocations
-		unsigned int r;
-		Utils::getSecureRandom(&r, 1);
-		float rf = (float)(r %= 100) / 100;
+		float rf = _freeRandomByte;
 		for(int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
 			if (_paths[i].p) {
 				if (rf < _paths[i].p->allocation()) {
@@ -676,6 +650,41 @@ void Peer::introduce(void *const tPtr,const int64_t now,const SharedPtr<Peer> &o
 	}
 }
 
+inline void Peer::processBackgroundPeerTasks(int64_t now)
+{
+	// Determine current multipath compatibility with other peer
+	if ((now - _lastMultipathCompatibilityCheck) >= ZT_PATH_QUALITY_COMPUTE_INTERVAL) {
+		// Cache number of available paths so that we can short-circuit multipath logic elsewhere
+		//
+		// We also take notice of duplicate paths (same IP only) because we may have
+		// recently received a direct path push from a peer and our list might contain
+		// a dead path which hasn't been fully recognized as such. In this case we
+		// don't want the duplicate to trigger execution of multipath code prematurely.
+		//
+		// This is done to support the behavior of auto multipath enable/disable
+		// without user intervention.
+		int currAlivePathCount = 0;
+		int duplicatePathsFound = 0;
+		for (unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) {
+			if (_paths[i].p) {
+				currAlivePathCount++;
+				for (unsigned int j=0;j<ZT_MAX_PEER_NETWORK_PATHS;++j) {
+					if (_paths[i].p && _paths[j].p && _paths[i].p->address().ipsEqual2(_paths[j].p->address()) && i != j) {
+						duplicatePathsFound+=1;
+						break;
+					}
+				}
+			}
+		}
+		_uniqueAlivePathCount = (currAlivePathCount - (duplicatePathsFound / 2));
+		_lastMultipathCompatibilityCheck = now;
+		_localMultipathSupported = ((RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) && (ZT_PROTO_VERSION > 9));
+		_remoteMultipathSupported = _vProto > 9;
+		// If both peers support multipath and more than one path exist, we can use multipath logic
+		_canUseMultipath = _localMultipathSupported && _remoteMultipathSupported && (_uniqueAlivePathCount > 1);
+	}
+}
+
 void Peer::sendACK(void *tPtr,const SharedPtr<Path> &path,const int64_t localSocket,const InetAddress &atAddress,int64_t now)
 {
 	Packet outp(_id.address(),RR->identity.address(),Packet::VERB_ACK);
@@ -774,14 +783,15 @@ void Peer::tryMemorizedPath(void *tPtr,int64_t now)
 unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now)
 {
 	unsigned int sent = 0;
-
 	Mutex::Lock _l(_paths_m);
 
 	const bool sendFullHello = ((now - _lastSentFullHello) >= ZT_PEER_PING_PERIOD);
 	_lastSentFullHello = now;
 
+	processBackgroundPeerTasks(now);
+
 	// Emit traces regarding aggregate link status
-	if (canUseMultipath()) {
+	if (_canUseMultipath) {
 		int alivePathCount = aggregateLinkPhysicalPathCount();
 		if ((now - _lastAggregateStatsReport) > ZT_PATH_AGGREGATE_STATS_REPORT_INTERVAL) {
 			_lastAggregateStatsReport = now;
diff --git a/node/Peer.hpp b/node/Peer.hpp
index ddbe6f77..a32eaad0 100644
--- a/node/Peer.hpp
+++ b/node/Peer.hpp
@@ -203,12 +203,12 @@ public:
 	/**
 	 * @return The aggregate link Packet Delay Variance (PDV)
 	 */
-	float computeAggregateLinkPacketDelayVariance();
+	int computeAggregateLinkPacketDelayVariance();
 
 	/**
 	 * @return The aggregate link mean latency
 	 */
-	float computeAggregateLinkMeanLatency();
+	int computeAggregateLinkMeanLatency();
 
 	/**
 	 * @return The number of currently alive "physical" paths in the aggregate link
@@ -357,7 +357,7 @@ public:
 	 */
 	inline unsigned int latency(const int64_t now)
 	{
-		if (RR->node->getMultipathMode()) {
+		if (_canUseMultipath) {
 			return (int)computeAggregateLinkMeanLatency();
 		} else {
 			SharedPtr<Path> bp(getAppropriatePath(now,false));
@@ -417,6 +417,14 @@ public:
 
 	inline bool remoteVersionKnown() const { return ((_vMajor > 0)||(_vMinor > 0)||(_vRevision > 0)); }
 
+	/**
+	 * Periodically update known multipath activation constraints. This is done so that we know when and when
+	 * not to use multipath logic. Doing this once every few seconds is sufficient.
+	 *
+	 * @param now Current time
+	 */
+	inline void processBackgroundPeerTasks(int64_t now);
+
 	/**
 	 * Record that the remote peer does have multipath enabled. As is evident by the receipt of a VERB_ACK
 	 * or a VERB_QOS_MEASUREMENT packet at some point in the past. Until this flag is set, the local client
@@ -427,18 +435,18 @@ public:
 	/**
 	 * @return Whether the local client supports and is configured to use multipath
 	 */
-	inline bool localMultipathSupport() { return ((RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) && (ZT_PROTO_VERSION > 9)); }
+	inline bool localMultipathSupport() { return _localMultipathSupported; }
 
 	/**
 	 * @return Whether the remote peer supports and is configured to use multipath
 	 */
-	inline bool remoteMultipathSupport() { return (_remotePeerMultipathEnabled && (_vProto > 9)); }
+	inline bool remoteMultipathSupport() { return _remoteMultipathSupported; }
 
 	/**
 	 * @return Whether this client can use multipath to communicate with this peer. True if both peers are using
 	 * the correct protocol and if both peers have multipath enabled. False if otherwise.
 	 */
-	inline bool canUseMultipath() { return (localMultipathSupport() && remoteMultipathSupport()); }
+	inline bool canUseMultipath() { return _canUseMultipath; }
 
 	/**
 	 * @return True if peer has received a trust established packet (e.g. common network membership) in the past ZT_TRUST_EXPIRATION ms
@@ -557,6 +565,13 @@ public:
 		return (_QoSCutoffCount < ZT_PATH_QOS_ACK_CUTOFF_LIMIT);
 	}
 
+	/**
+	 * @return Whether this peer is reachable via an aggregate link
+	 */
+	inline bool hasAggregateLink() {
+		return _localMultipathSupported && _remoteMultipathSupported && _remotePeerMultipathEnabled;
+	}
+
 	/**
 	 * Serialize a peer for storage in local cache
 	 *
@@ -658,6 +673,15 @@ private:
 	int64_t _lastPathPrune;
 	int64_t _lastACKWindowReset;
 	int64_t _lastQoSWindowReset;
+	int64_t _lastMultipathCompatibilityCheck;
+
+	unsigned char _freeRandomByte;
+
+	int _uniqueAlivePathCount;
+
+	bool _localMultipathSupported;
+	bool _remoteMultipathSupported;
+	bool _canUseMultipath;
 
 	uint16_t _vProto;
 	uint16_t _vMajor;
diff --git a/service/OneService.cpp b/service/OneService.cpp
index a1a9d981..9b12f17b 100644
--- a/service/OneService.cpp
+++ b/service/OneService.cpp
@@ -1229,7 +1229,7 @@ public:
 						char peerAddrStr[256];
 						if (pl) {
 							for(unsigned long i=0;i<pl->peerCount;++i) {
-								if (pl->peers[i].role == ZT_PEER_ROLE_LEAF) {
+								if (pl->peers[i].hadAggregateLink) {
 									nlohmann::json pj;
 									_peerAggregateLinkToJson(pj,&(pl->peers[i]));
 									OSUtils::ztsnprintf(peerAddrStr,sizeof(peerAddrStr),"%.10llx",pl->peers[i].address);
-- 
cgit v1.2.3


From 28cb40529d04e61a188e9d4a2a316690703ea605 Mon Sep 17 00:00:00 2001
From: Joseph Henry <josephjah@gmail.com>
Date: Tue, 10 Jul 2018 16:50:12 -0700
Subject: Rough draft of fq-codel implementation

---
 node/Constants.hpp         |  35 ++++++
 node/Network.cpp           |  16 ++-
 node/Network.hpp           |  10 +-
 node/Node.cpp              |   1 +
 node/OutboundMulticast.cpp |   3 +-
 node/Switch.cpp            | 275 +++++++++++++++++++++++++++++++++++++++++++--
 node/Switch.hpp            |  94 ++++++++++++++++
 7 files changed, 419 insertions(+), 15 deletions(-)

(limited to 'node/Node.cpp')

diff --git a/node/Constants.hpp b/node/Constants.hpp
index 420343ad..5f21201e 100644
--- a/node/Constants.hpp
+++ b/node/Constants.hpp
@@ -410,6 +410,41 @@
  */
 #define ZT_PATH_IMBALANCE_THRESHOLD 0.20
 
+/**
+ * Max allowable time spent in any queue
+ */
+#define ZT_QOS_TARGET 5 // ms
+
+/**
+ * Time period where the time spent in the queue by a packet should fall below
+ * target at least once
+ */
+#define ZT_QOS_INTERVAL 100 // ms
+
+/**
+ * The number of bytes that each queue is allowed to send during each DRR cycle.
+ * This approximates a single-byte-based fairness queuing scheme
+ */
+#define ZT_QOS_QUANTUM ZT_DEFAULT_MTU
+
+/**
+ * The maximum total number of packets that can be queued among all
+ * active/inactive, old/new queues
+ */
+#define ZT_QOS_MAX_ENQUEUED_PACKETS 1024
+
+/**
+ * Number of QoS queues (buckets)
+ */
+#define ZT_QOS_NUM_BUCKETS 9
+
+/**
+ * All unspecified traffic is put in this bucket. Anything in a bucket with a smaller
+ * value is de-prioritized. Anything in a bucket with a higher value is prioritized over
+ * other traffic.
+ */
+#define ZT_QOS_DEFAULT_BUCKET 0
+
 /**
  * How frequently to send heartbeats over in-use paths
  */
diff --git a/node/Network.cpp b/node/Network.cpp
index a75d9fd1..a5c2fc3e 100644
--- a/node/Network.cpp
+++ b/node/Network.cpp
@@ -106,7 +106,8 @@ static _doZtFilterResult _doZtFilter(
 	const unsigned int ruleCount,
 	Address &cc, // MUTABLE -- set to TEE destination if TEE action is taken or left alone otherwise
 	unsigned int &ccLength, // MUTABLE -- set to length of packet payload to TEE
-	bool &ccWatch) // MUTABLE -- set to true for WATCH target as opposed to normal TEE
+	bool &ccWatch, // MUTABLE -- set to true for WATCH target as opposed to normal TEE
+	uint8_t &qosBucket) // MUTABLE -- set to the value of the argument provided to the matching action
 {
 	// Set to true if we are a TEE/REDIRECT/WATCH target
 	bool superAccept = false;
@@ -621,7 +622,8 @@ bool Network::filterOutgoingPacket(
 	const uint8_t *frameData,
 	const unsigned int frameLen,
 	const unsigned int etherType,
-	const unsigned int vlanId)
+	const unsigned int vlanId,
+	uint8_t &qosBucket)
 {
 	const int64_t now = RR->node->now();
 	Address ztFinalDest(ztDest);
@@ -636,7 +638,7 @@ bool Network::filterOutgoingPacket(
 
 	Membership *const membership = (ztDest) ? _memberships.get(ztDest) : (Membership *)0;
 
-	switch(_doZtFilter(RR,rrl,_config,membership,false,ztSource,ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.rules,_config.ruleCount,cc,ccLength,ccWatch)) {
+	switch(_doZtFilter(RR,rrl,_config,membership,false,ztSource,ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.rules,_config.ruleCount,cc,ccLength,ccWatch,qosBucket)) {
 
 		case DOZTFILTER_NO_MATCH: {
 			for(unsigned int c=0;c<_config.capabilityCount;++c) {
@@ -644,7 +646,7 @@ bool Network::filterOutgoingPacket(
 				Address cc2;
 				unsigned int ccLength2 = 0;
 				bool ccWatch2 = false;
-				switch (_doZtFilter(RR,crrl,_config,membership,false,ztSource,ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.capabilities[c].rules(),_config.capabilities[c].ruleCount(),cc2,ccLength2,ccWatch2)) {
+				switch (_doZtFilter(RR,crrl,_config,membership,false,ztSource,ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.capabilities[c].rules(),_config.capabilities[c].ruleCount(),cc2,ccLength2,ccWatch2,qosBucket)) {
 					case DOZTFILTER_NO_MATCH:
 					case DOZTFILTER_DROP: // explicit DROP in a capability just terminates its evaluation and is an anti-pattern
 						break;
@@ -759,11 +761,13 @@ int Network::filterIncomingPacket(
 	bool ccWatch = false;
 	const Capability *c = (Capability *)0;
 
+	uint8_t qosBucket = 255; // For incoming packets this is a dummy value
+
 	Mutex::Lock _l(_lock);
 
 	Membership &membership = _membership(sourcePeer->address());
 
-	switch (_doZtFilter(RR,rrl,_config,&membership,true,sourcePeer->address(),ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.rules,_config.ruleCount,cc,ccLength,ccWatch)) {
+	switch (_doZtFilter(RR,rrl,_config,&membership,true,sourcePeer->address(),ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,_config.rules,_config.ruleCount,cc,ccLength,ccWatch,qosBucket)) {
 
 		case DOZTFILTER_NO_MATCH: {
 			Membership::CapabilityIterator mci(membership,_config);
@@ -772,7 +776,7 @@ int Network::filterIncomingPacket(
 				Address cc2;
 				unsigned int ccLength2 = 0;
 				bool ccWatch2 = false;
-				switch(_doZtFilter(RR,crrl,_config,&membership,true,sourcePeer->address(),ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,c->rules(),c->ruleCount(),cc2,ccLength2,ccWatch2)) {
+				switch(_doZtFilter(RR,crrl,_config,&membership,true,sourcePeer->address(),ztFinalDest,macSource,macDest,frameData,frameLen,etherType,vlanId,c->rules(),c->ruleCount(),cc2,ccLength2,ccWatch2,qosBucket)) {
 					case DOZTFILTER_NO_MATCH:
 					case DOZTFILTER_DROP: // explicit DROP in a capability just terminates its evaluation and is an anti-pattern
 						break;
diff --git a/node/Network.hpp b/node/Network.hpp
index 38f03eb3..2baab511 100644
--- a/node/Network.hpp
+++ b/node/Network.hpp
@@ -132,7 +132,8 @@ public:
 		const uint8_t *frameData,
 		const unsigned int frameLen,
 		const unsigned int etherType,
-		const unsigned int vlanId);
+		const unsigned int vlanId,
+		uint8_t &qosBucket);
 
 	/**
 	 * Apply filters to an incoming packet
@@ -297,6 +298,13 @@ public:
 	 */
 	void learnBridgeRoute(const MAC &mac,const Address &addr);
 
+	/**
+	 * Whether QoS is in effect for this network
+	 */
+	bool QoSEnabled() {
+		return false;
+	}
+
 	/**
 	 * Learn a multicast group that is bridged to our tap device
 	 *
diff --git a/node/Node.cpp b/node/Node.cpp
index 9b10dfdd..576b2e4a 100644
--- a/node/Node.cpp
+++ b/node/Node.cpp
@@ -368,6 +368,7 @@ ZT_ResultCode Node::leave(uint64_t nwid,void **uptr,void *tptr)
 	{
 		Mutex::Lock _l(_networks_m);
 		SharedPtr<Network> *nw = _networks.get(nwid);
+		RR->sw->removeNetworkQoSControlBlock(nwid);
 		if (!nw)
 			return ZT_RESULT_OK;
 		if (uptr)
diff --git a/node/OutboundMulticast.cpp b/node/OutboundMulticast.cpp
index d7a7b4d8..2391771f 100644
--- a/node/OutboundMulticast.cpp
+++ b/node/OutboundMulticast.cpp
@@ -85,7 +85,8 @@ void OutboundMulticast::sendOnly(const RuntimeEnvironment *RR,void *tPtr,const A
 {
 	const SharedPtr<Network> nw(RR->node->network(_nwid));
 	const Address toAddr2(toAddr);
-	if ((nw)&&(nw->filterOutgoingPacket(tPtr,true,RR->identity.address(),toAddr2,_macSrc,_macDest,_frameData,_frameLen,_etherType,0))) {
+	uint8_t QoSBucket = 255; // Dummy value
+	if ((nw)&&(nw->filterOutgoingPacket(tPtr,true,RR->identity.address(),toAddr2,_macSrc,_macDest,_frameData,_frameLen,_etherType,0,QoSBucket))) {
 		_packet.newInitializationVector();
 		_packet.setDestination(toAddr2);
 		RR->node->expectReplyTo(_packet.packetId());
diff --git a/node/Switch.cpp b/node/Switch.cpp
index d53bf53e..fddbd581 100644
--- a/node/Switch.cpp
+++ b/node/Switch.cpp
@@ -266,6 +266,8 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 		}
 	}
 
+	uint8_t qosBucket = ZT_QOS_DEFAULT_BUCKET;
+
 	if (to.isMulticast()) {
 		MulticastGroup multicastGroup(to,0);
 
@@ -383,7 +385,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 			network->learnBridgedMulticastGroup(tPtr,multicastGroup,RR->node->now());
 
 		// First pass sets noTee to false, but noTee is set to true in OutboundMulticast to prevent duplicates.
-		if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),Address(),from,to,(const uint8_t *)data,len,etherType,vlanId)) {
+		if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),Address(),from,to,(const uint8_t *)data,len,etherType,vlanId,qosBucket)) {
 			RR->t->outgoingNetworkFrameDropped(tPtr,network,from,to,etherType,vlanId,len,"filter blocked");
 			return;
 		}
@@ -407,7 +409,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 		Address toZT(to.toAddress(network->id())); // since in-network MACs are derived from addresses and network IDs, we can reverse this
 		SharedPtr<Peer> toPeer(RR->topology->getPeer(tPtr,toZT));
 
-		if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),toZT,from,to,(const uint8_t *)data,len,etherType,vlanId)) {
+		if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),toZT,from,to,(const uint8_t *)data,len,etherType,vlanId,qosBucket)) {
 			RR->t->outgoingNetworkFrameDropped(tPtr,network,from,to,etherType,vlanId,len,"filter blocked");
 			return;
 		}
@@ -422,7 +424,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 			outp.append(data,len);
 			if (!network->config().disableCompression())
 				outp.compress();
-			send(tPtr,outp,true);
+			aqm_enqueue(tPtr,network,outp,true,qosBucket);
 		} else {
 			Packet outp(toZT,RR->identity.address(),Packet::VERB_FRAME);
 			outp.append(network->id());
@@ -430,7 +432,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 			outp.append(data,len);
 			if (!network->config().disableCompression())
 				outp.compress();
-			send(tPtr,outp,true);
+			aqm_enqueue(tPtr,network,outp,true,qosBucket);
 		}
 
 	} else {
@@ -439,7 +441,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 		// We filter with a NULL destination ZeroTier address first. Filtrations
 		// for each ZT destination are also done below. This is the same rationale
 		// and design as for multicast.
-		if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),Address(),from,to,(const uint8_t *)data,len,etherType,vlanId)) {
+		if (!network->filterOutgoingPacket(tPtr,false,RR->identity.address(),Address(),from,to,(const uint8_t *)data,len,etherType,vlanId,qosBucket)) {
 			RR->t->outgoingNetworkFrameDropped(tPtr,network,from,to,etherType,vlanId,len,"filter blocked");
 			return;
 		}
@@ -477,7 +479,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 		}
 
 		for(unsigned int b=0;b<numBridges;++b) {
-			if (network->filterOutgoingPacket(tPtr,true,RR->identity.address(),bridges[b],from,to,(const uint8_t *)data,len,etherType,vlanId)) {
+			if (network->filterOutgoingPacket(tPtr,true,RR->identity.address(),bridges[b],from,to,(const uint8_t *)data,len,etherType,vlanId,qosBucket)) {
 				Packet outp(bridges[b],RR->identity.address(),Packet::VERB_EXT_FRAME);
 				outp.append(network->id());
 				outp.append((uint8_t)0x00);
@@ -487,7 +489,7 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 				outp.append(data,len);
 				if (!network->config().disableCompression())
 					outp.compress();
-				send(tPtr,outp,true);
+				aqm_enqueue(tPtr,network,outp,true,qosBucket);
 			} else {
 				RR->t->outgoingNetworkFrameDropped(tPtr,network,from,to,etherType,vlanId,len,"filter blocked (bridge replication)");
 			}
@@ -495,6 +497,263 @@ void Switch::onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const
 	}
 }
 
+void Switch::aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &packet,bool encrypt,int qosBucket)
+{
+	if(!network->QoSEnabled()) {
+		send(tPtr, packet, encrypt);
+		return;
+	}
+	NetworkQoSControlBlock *nqcb = _netQueueControlBlock[network->id()];
+	if (!nqcb) {
+		// DEBUG_INFO("creating network QoS control block (NQCB) for network %llx", network->id());
+		nqcb = new NetworkQoSControlBlock();
+		_netQueueControlBlock[network->id()] = nqcb;
+		// Initialize ZT_QOS_NUM_BUCKETS queues and place them in the INACTIVE list
+		// These queues will be shuffled between the new/old/inactive lists by the enqueue/dequeue algorithm
+		for (int i=0; i<ZT_QOS_NUM_BUCKETS; i++) {
+			nqcb->inactiveQueues.push_back(new ManagedQueue(i));
+		}
+	}
+
+	if (packet.verb() != Packet::VERB_FRAME && packet.verb() != Packet::VERB_EXT_FRAME) {
+		// DEBUG_INFO("skipping, no QoS for this packet, verb=%x", packet.verb());
+		// just send packet normally, no QoS for ZT protocol traffic
+		send(tPtr, packet, encrypt);
+	} 
+
+	_aqm_m.lock();
+
+	// Enqueue packet and move queue to appropriate list
+
+	const Address dest(packet.destination());
+	TXQueueEntry *txEntry = new TXQueueEntry(dest,RR->node->now(),packet,encrypt);
+	
+	ManagedQueue *selectedQueue = nullptr;
+	for (int i=0; i<ZT_QOS_NUM_BUCKETS; i++) {
+		if (i < nqcb->oldQueues.size()) { // search old queues first (I think this is best since old would imply most recent usage of the queue)
+			if (nqcb->oldQueues[i]->id == qosBucket) {
+				selectedQueue = nqcb->oldQueues[i];
+			}
+		} if (i < nqcb->newQueues.size()) { // search new queues (this would imply not often-used queues)
+			if (nqcb->newQueues[i]->id == qosBucket) {
+				selectedQueue = nqcb->newQueues[i];
+			}
+		} if (i < nqcb->inactiveQueues.size()) { // search inactive queues
+			if (nqcb->inactiveQueues[i]->id == qosBucket) {
+				selectedQueue = nqcb->inactiveQueues[i];
+				// move queue to end of NEW queue list
+				selectedQueue->byteCredit = ZT_QOS_QUANTUM;
+				// DEBUG_INFO("moving q=%p from INACTIVE to NEW list", selectedQueue);
+				nqcb->newQueues.push_back(selectedQueue);
+				nqcb->inactiveQueues.erase(nqcb->inactiveQueues.begin() + i);
+			}
+		}
+	}
+	if (!selectedQueue) {
+		return;
+	}
+
+	selectedQueue->q.push_back(txEntry);
+	selectedQueue->byteLength+=txEntry->packet.payloadLength();
+	nqcb->_currEnqueuedPackets++;
+
+	// DEBUG_INFO("nq=%2lu, oq=%2lu, iq=%2lu, nqcb.size()=%3d, bucket=%2d, q=%p", nqcb->newQueues.size(), nqcb->oldQueues.size(), nqcb->inactiveQueues.size(), nqcb->_currEnqueuedPackets, qosBucket, selectedQueue);
+
+	// Drop a packet if necessary
+	ManagedQueue *selectedQueueToDropFrom = nullptr;
+	if (nqcb->_currEnqueuedPackets > ZT_QOS_MAX_ENQUEUED_PACKETS)
+	{
+		// DEBUG_INFO("too many enqueued packets (%d), finding packet to drop", nqcb->_currEnqueuedPackets);
+		int maxQueueLength = 0;
+		for (int i=0; i<ZT_QOS_NUM_BUCKETS; i++) {
+			if (i < nqcb->oldQueues.size()) {
+				if (nqcb->oldQueues[i]->byteLength > maxQueueLength) {
+					maxQueueLength = nqcb->oldQueues[i]->byteLength;
+					selectedQueueToDropFrom = nqcb->oldQueues[i];
+				}
+			} if (i < nqcb->newQueues.size()) {
+				if (nqcb->newQueues[i]->byteLength > maxQueueLength) {
+					maxQueueLength = nqcb->newQueues[i]->byteLength;
+					selectedQueueToDropFrom = nqcb->newQueues[i];
+				}
+			} if (i < nqcb->inactiveQueues.size()) {
+				if (nqcb->inactiveQueues[i]->byteLength > maxQueueLength) {
+					maxQueueLength = nqcb->inactiveQueues[i]->byteLength;
+					selectedQueueToDropFrom = nqcb->inactiveQueues[i];
+				}
+			}
+		}
+		if (selectedQueueToDropFrom) {
+			// DEBUG_INFO("dropping packet from head of largest queue (%d payload bytes)", maxQueueLength);
+			int sizeOfDroppedPacket = selectedQueueToDropFrom->q.front()->packet.payloadLength();
+			delete selectedQueueToDropFrom->q.front();
+			selectedQueueToDropFrom->q.pop_front();
+			selectedQueueToDropFrom->byteLength-=sizeOfDroppedPacket;
+			nqcb->_currEnqueuedPackets--;
+		}
+	}
+	_aqm_m.unlock();
+	aqm_dequeue(tPtr);
+}
+
+uint64_t Switch::control_law(uint64_t t, int count)
+{
+	return t + ZT_QOS_INTERVAL / sqrt(count);
+}
+
+Switch::dqr Switch::dodequeue(ManagedQueue *q, uint64_t now) 
+{
+	dqr r;
+	r.ok_to_drop = false;
+	r.p = q->q.front();
+
+	if (r.p == NULL) {
+		q->first_above_time = 0;
+		return r;
+	}
+	uint64_t sojourn_time = now - r.p->creationTime;
+	if (sojourn_time < ZT_QOS_TARGET || q->byteLength <= ZT_DEFAULT_MTU) {
+		// went below - stay below for at least interval
+		q->first_above_time = 0;
+	} else {
+		if (q->first_above_time == 0) {
+			// just went above from below. if still above at
+			// first_above_time, will say it's ok to drop.
+			q->first_above_time = now + ZT_QOS_INTERVAL;
+		} else if (now >= q->first_above_time) {
+			r.ok_to_drop = true;
+		}
+	}
+	return r;
+}
+
+Switch::TXQueueEntry * Switch::CoDelDequeue(ManagedQueue *q, bool isNew, uint64_t now)
+{
+	dqr r = dodequeue(q, now);
+
+	if (q->dropping) {
+		if (!r.ok_to_drop) {
+			q->dropping = false;
+		}
+		while (now >= q->drop_next && q->dropping) {
+			q->q.pop_front(); // drop
+			r = dodequeue(q, now);
+			if (!r.ok_to_drop) {
+				// leave dropping state
+				q->dropping = false;
+			} else {
+				++(q->count);
+				// schedule the next drop.
+				q->drop_next = control_law(q->drop_next, q->count);
+			}
+		}
+	} else if (r.ok_to_drop) {
+		q->q.pop_front(); // drop
+		r = dodequeue(q, now);
+		q->dropping = true;
+		q->count = (q->count > 2 && now - q->drop_next < 8*ZT_QOS_INTERVAL)?
+		q->count - 2 : 1;
+		q->drop_next = control_law(now, q->count);
+	}
+	return r.p;
+}
+
+void Switch::aqm_dequeue(void *tPtr)
+{
+	// Cycle through network-specific QoS control blocks
+	for(std::map<uint64_t,NetworkQoSControlBlock*>::iterator nqcb(_netQueueControlBlock.begin());nqcb!=_netQueueControlBlock.end();) {
+		if (!(*nqcb).second->_currEnqueuedPackets) {
+			return;
+		}
+
+		uint64_t now = RR->node->now();
+		TXQueueEntry *entryToEmit = nullptr;
+		std::vector<ManagedQueue*> *currQueues = &((*nqcb).second->newQueues);
+		std::vector<ManagedQueue*> *oldQueues = &((*nqcb).second->oldQueues);
+		std::vector<ManagedQueue*> *inactiveQueues = &((*nqcb).second->inactiveQueues);
+
+		_aqm_m.lock();
+
+		// Attempt dequeue from queues in NEW list
+		bool examiningNewQueues = true;
+		while (currQueues->size()) {
+			ManagedQueue *queueAtFrontOfList = currQueues->front();
+			if (queueAtFrontOfList->byteCredit < 0) {
+				queueAtFrontOfList->byteCredit += ZT_QOS_QUANTUM;
+				// Move to list of OLD queues
+				// DEBUG_INFO("moving q=%p from NEW to OLD list", queueAtFrontOfList);
+				oldQueues->push_back(queueAtFrontOfList);
+				currQueues->erase(currQueues->begin());
+			} else {
+				entryToEmit = CoDelDequeue(queueAtFrontOfList, examiningNewQueues, now);
+				if (!entryToEmit) {
+					// Move to end of list of OLD queues
+					// DEBUG_INFO("moving q=%p from NEW to OLD list", queueAtFrontOfList);
+					oldQueues->push_back(queueAtFrontOfList);
+					currQueues->erase(currQueues->begin());
+				}
+				else {
+					int len = entryToEmit->packet.payloadLength();
+					queueAtFrontOfList->byteLength -= len;
+					queueAtFrontOfList->byteCredit -= len;
+					// Send the packet!
+					queueAtFrontOfList->q.pop_front();
+					send(tPtr, entryToEmit->packet, entryToEmit->encrypt);
+					(*nqcb).second->_currEnqueuedPackets--;
+				}
+				if (queueAtFrontOfList) {
+					//DEBUG_INFO("dequeuing from q=%p, len=%lu in NEW list (byteCredit=%d)", queueAtFrontOfList, queueAtFrontOfList->q.size(), queueAtFrontOfList->byteCredit);
+				}
+				break;
+			}
+		}
+
+		// Attempt dequeue from queues in OLD list
+		examiningNewQueues = false;
+		currQueues = &((*nqcb).second->oldQueues);
+		while (currQueues->size()) {
+			ManagedQueue *queueAtFrontOfList = currQueues->front();
+			if (queueAtFrontOfList->byteCredit < 0) {
+				queueAtFrontOfList->byteCredit += ZT_QOS_QUANTUM;
+				oldQueues->push_back(queueAtFrontOfList);
+				currQueues->erase(currQueues->begin());
+			} else {
+				entryToEmit = CoDelDequeue(queueAtFrontOfList, examiningNewQueues, now);
+				if (!entryToEmit) {
+					//DEBUG_INFO("moving q=%p from OLD to INACTIVE list", queueAtFrontOfList);
+					// Move to inactive list of queues
+					inactiveQueues->push_back(queueAtFrontOfList);
+					currQueues->erase(currQueues->begin());
+				}
+				else {
+					int len = entryToEmit->packet.payloadLength();
+					queueAtFrontOfList->byteLength -= len;
+					queueAtFrontOfList->byteCredit -= len;
+					queueAtFrontOfList->q.pop_front();
+					send(tPtr, entryToEmit->packet, entryToEmit->encrypt);
+					(*nqcb).second->_currEnqueuedPackets--;
+				}
+				if (queueAtFrontOfList) {
+					//DEBUG_INFO("dequeuing from q=%p, len=%lu in OLD list (byteCredit=%d)", queueAtFrontOfList, queueAtFrontOfList->q.size(), queueAtFrontOfList->byteCredit);
+				}
+				break;
+			}
+		}
+		nqcb++;
+		_aqm_m.unlock();
+	}
+}
+
+void Switch::removeNetworkQoSControlBlock(uint64_t nwid)
+{
+	NetworkQoSControlBlock *nq = _netQueueControlBlock[nwid];
+	if (nq) {
+		_netQueueControlBlock.erase(nwid);
+		delete nq;
+		nq = NULL;
+	}
+}
+
 void Switch::send(void *tPtr,Packet &packet,bool encrypt)
 {
 	const Address dest(packet.destination());
@@ -550,6 +809,7 @@ void Switch::doAnythingWaitingForPeer(void *tPtr,const SharedPtr<Peer> &peer)
 
 	{
 		Mutex::Lock _l(_txQueue_m);
+
 		for(std::list< TXQueueEntry >::iterator txi(_txQueue.begin());txi!=_txQueue.end();) {
 			if (txi->dest == peer->address()) {
 				if (_trySend(tPtr,txi->packet,txi->encrypt)) {
@@ -574,6 +834,7 @@ unsigned long Switch::doTimerTasks(void *tPtr,int64_t now)
 	std::vector<Address> needWhois;
 	{
 		Mutex::Lock _l(_txQueue_m);
+
 		for(std::list< TXQueueEntry >::iterator txi(_txQueue.begin());txi!=_txQueue.end();) {
 			if (_trySend(tPtr,txi->packet,txi->encrypt)) {
 				_txQueue.erase(txi++);
diff --git a/node/Switch.hpp b/node/Switch.hpp
index 906f418e..5f60fc46 100644
--- a/node/Switch.hpp
+++ b/node/Switch.hpp
@@ -59,6 +59,14 @@ class Peer;
  */
 class Switch
 {
+	struct ManagedQueue;
+	struct TXQueueEntry;
+
+	typedef struct {
+		TXQueueEntry *p;
+		bool ok_to_drop;
+	} dqr;
+
 public:
 	Switch(const RuntimeEnvironment *renv);
 
@@ -87,6 +95,62 @@ public:
 	 */
 	void onLocalEthernet(void *tPtr,const SharedPtr<Network> &network,const MAC &from,const MAC &to,unsigned int etherType,unsigned int vlanId,const void *data,unsigned int len);
 
+	/**
+	 * Determines the next drop schedule for packets in the TX queue
+	 *
+	 * @param t Current time
+	 * @param count Number of packets dropped this round
+	 */
+	uint64_t control_law(uint64_t t, int count);
+
+	/**
+	 * Selects a packet eligible for transmission from a TX queue. According to the control law, multiple packets
+	 * may be intentionally dropped before a packet is returned to the AQM scheduler.
+	 *
+	 * @param q The TX queue that is being dequeued from
+	 * @param now Current time
+	 */
+	dqr dodequeue(ManagedQueue *q, uint64_t now);
+
+	/**
+	 * Presents a packet to the AQM scheduler.
+	 *
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 * @param network Network that the packet shall be sent over
+	 * @param packet Packet to be sent
+	 * @param encrypt Encrypt packet payload? (always true except for HELLO)
+	 * @param qosBucket Which bucket the rule-system determined this packet should fall into
+	 */
+	void aqm_enqueue(void *tPtr, const SharedPtr<Network> &network, Packet &packet,bool encrypt,int qosBucket);
+
+	/**
+	 * Performs a single AQM cycle and dequeues and transmits all eligible packets on all networks
+	 *
+	 * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call
+	 */
+	void aqm_dequeue(void *tPtr);
+
+	/**
+	 * Calls the dequeue mechanism and adjust queue state variables
+	 *
+	 * @param q The TX queue that is being dequeued from
+	 * @param isNew Whether or not this queue is in the NEW list
+	 * @param now Current time
+	 */
+	Switch::TXQueueEntry * CoDelDequeue(ManagedQueue *q, bool isNew, uint64_t now);
+
+	/**
+	 * Removes QoS Queues and flow state variables for a specific network. These queues are created
+	 * automatically upon the transmission of the first packet from this peer to another peer on the
+	 * given network.
+	 *
+	 * The reason for existence of queues and flow state variables specific to each network is so that
+	 * each network's QoS rules function independently.
+	 *
+	 * @param nwid Network ID
+	 */
+	void removeNetworkQoSControlBlock(uint64_t nwid);
+
 	/**
 	 * Send a packet to a ZeroTier address (destination in packet)
 	 *
@@ -199,6 +263,7 @@ private:
 	};
 	std::list< TXQueueEntry > _txQueue;
 	Mutex _txQueue_m;
+	Mutex _aqm_m;
 
 	// Tracks sending of VERB_RENDEZVOUS to relaying peers
 	struct _LastUniteKey
@@ -220,6 +285,35 @@ private:
 	};
 	Hashtable< _LastUniteKey,uint64_t > _lastUniteAttempt; // key is always sorted in ascending order, for set-like behavior
 	Mutex _lastUniteAttempt_m;
+
+	// Queue with additional flow state variables
+	struct ManagedQueue
+	{
+		ManagedQueue(int id) :
+			id(id),
+			byteCredit(ZT_QOS_QUANTUM),
+			byteLength(0),
+			dropping(false)
+		{}
+		int id;
+		int byteCredit;
+		int byteLength;
+		uint64_t first_above_time;
+		uint32_t count;
+		uint64_t drop_next;
+		bool dropping;
+		uint64_t drop_next_time;
+		std::list< TXQueueEntry *> q;
+	};
+	// To implement fq_codel we need to maintain a queue of queues
+	struct NetworkQoSControlBlock
+	{
+		int _currEnqueuedPackets;
+		std::vector<ManagedQueue *> newQueues;
+		std::vector<ManagedQueue *> oldQueues;
+		std::vector<ManagedQueue *> inactiveQueues;
+	};
+	std::map<uint64_t,NetworkQoSControlBlock*> _netQueueControlBlock;
 };
 
 } // namespace ZeroTier
-- 
cgit v1.2.3