diff options
author | Joseph Henry <josephjah@gmail.com> | 2018-05-01 16:32:15 -0700 |
---|---|---|
committer | Joseph Henry <josephjah@gmail.com> | 2018-05-01 16:32:15 -0700 |
commit | 6a2ba4baca326272c45930208b70cfedf8cb1638 (patch) | |
tree | 434403aecca63908909678bd234ef8b4ffb1d1e4 | |
parent | 836d897aecc193ec3477e67858237a3f97819024 (diff) | |
download | infinitytier-6a2ba4baca326272c45930208b70cfedf8cb1638.tar.gz infinitytier-6a2ba4baca326272c45930208b70cfedf8cb1638.zip |
Introduced basic multipath support
-rw-r--r-- | include/ZeroTierOne.h | 36 | ||||
-rw-r--r-- | node/Constants.hpp | 92 | ||||
-rw-r--r-- | node/IncomingPacket.cpp | 5 | ||||
-rw-r--r-- | node/Multicaster.cpp | 2 | ||||
-rw-r--r-- | node/Node.cpp | 4 | ||||
-rw-r--r-- | node/Node.hpp | 5 | ||||
-rw-r--r-- | node/Path.hpp | 265 | ||||
-rw-r--r-- | node/Peer.cpp | 289 | ||||
-rw-r--r-- | node/Peer.hpp | 38 | ||||
-rw-r--r-- | node/RingBuffer.hpp | 315 | ||||
-rw-r--r-- | node/Switch.cpp | 6 | ||||
-rw-r--r-- | node/Topology.hpp | 2 | ||||
-rw-r--r-- | node/Trace.cpp | 20 | ||||
-rw-r--r-- | node/Trace.hpp | 6 | ||||
-rw-r--r-- | osdep/Binder.hpp | 15 | ||||
-rw-r--r-- | osdep/Phy.hpp | 195 | ||||
-rw-r--r-- | service/OneService.cpp | 56 |
17 files changed, 1310 insertions, 41 deletions
diff --git a/include/ZeroTierOne.h b/include/ZeroTierOne.h index 6da53a73..15e15bd1 100644 --- a/include/ZeroTierOne.h +++ b/include/ZeroTierOne.h @@ -423,6 +423,42 @@ enum ZT_ResultCode #define ZT_ResultCode_isFatal(x) ((((int)(x)) >= 100)&&(((int)(x)) < 1000)) /** + * The multipath algorithm in use by this node. + */ +enum ZT_MultipathMode +{ + /** + * No active multipath. + * + * Traffic is merely sent over the strongest path. That being + * said, this mode will automatically failover in the event that a link goes down. + */ + ZT_MULTIPATH_NONE = 0, + + /** + * Traffic is randomly distributed among all active paths. + * + * Will cease sending traffic over links that appear to be stale. + */ + ZT_MULTIPATH_RANDOM = 1, + + /** + * Traffic is allocated across all active paths in proportion to their strength and + * reliability. + * + * Will cease sending traffic over links that appear to be stale. + */ + ZT_MULTIPATH_PROPORTIONALLY_BALANCED = 2, + + /** + * Traffic is allocated across a user-defined interface/allocation + * + * Will cease sending traffic over links that appear to be stale. + */ + ZT_MULTIPATH_MANUALLY_BALANCED = 3 +}; + +/** * Status codes sent to status update callback when things happen */ enum ZT_Event diff --git a/node/Constants.hpp b/node/Constants.hpp index e2a35dce..80083abe 100644 --- a/node/Constants.hpp +++ b/node/Constants.hpp @@ -268,6 +268,98 @@ #define ZT_PING_CHECK_INVERVAL 5000 /** + * Length of interface name + */ +#define ZT_PATH_INTERFACE_NAME_SZ 16 + +/** + * How frequently to check for changes to the system's network interfaces. When + * the service decides to use this constant it's because we want to react more + * quickly to new interfaces that pop up or go down. + */ +#define ZT_MULTIPATH_BINDER_REFRESH_PERIOD 5000 + +/** + * Path choice history window size. This is used to keep track of which paths were + * previously selected so that we can maintain a target allocation over time. + */ +#define ZT_MULTIPATH_PROPORTION_WIN_SZ 128 + +/** + * Threshold for flow to be considered balanced. + */ +#define ZT_MULTIPATH_FLOW_BALANCE_THESHOLD 0.80 + +/** + * Number of samples to consider when computing path statistics + */ +#define ZT_PATH_QUALITY_METRIC_WIN_SZ 128 + +/** + * How often important path metrics are sampled (in ms). These metrics are later used + * for path quality estimates + */ +#define ZT_PATH_QUALITY_SAMPLE_INTERVAL 100 + +/** + * How often new path quality estimates are computed + */ +#define ZT_PATH_QUALITY_ESTIMATE_INTERVAL 100 + +/** + * How often we will sample packet latency. Should be at least greater than ZT_PING_CHECK_INVERVAL + * since we will record a 0 bit/s measurement if no valid latency measurement was made within this + * window of time. + */ +#define ZT_PATH_LATENCY_SAMPLE_INTERVAL ZT_PING_CHECK_INVERVAL * 2 + +/** + * Interval used for rate-limiting the computation of path quality estimates. Set at 0 + * to compute as new packets arrive with no delay. + */ +#define ZT_PATH_QUALITY_COMPUTE_INTERVAL 0 + +/** + * Path error rate history window size. This is used to keep track of packet error + * measurements over a path's medium-term history. + */ +#define ZT_PATH_ERROR_HIST_WIN_SZ 10 + +/** + * The number of packet error measurements in each sample + */ +#define ZT_PATH_ERROR_SAMPLE_WIN_SZ 1024 + +/** + * How often a peer will prune its own paths. Pruning is important when multipath is + * enabled because we want to prevent the allocation algorithms from sending anything + * out on known dead paths. Additionally, quickly marking paths as dead helps when + * a new path is learned and needs to replace an older path. + */ +#define ZT_CLOSED_PATH_PRUNING_INTERVAL 1000 + +/** + * Datagram used to test link throughput. Contents are random. + */ +#define ZT_LINK_TEST_DATAGRAM_SZ 1024 + +/** + * Size of datagram expected as a reply to a link speed test + */ +#define ZT_LINK_TEST_DATAGRAM_RESPONSE_SZ 8 + +/** + * Time before a link test datagram is considered lost. Any corresponding + * timing records that would have been used to compute a RTT are purged. + */ +#define ZT_LINK_TEST_TIMEOUT 10000 + +/** + * How often the service tests the link throughput. + */ +#define ZT_LINK_SPEED_TEST_INTERVAL 1000 + +/** * How frequently to send heartbeats over in-use paths */ #define ZT_PATH_HEARTBEAT_PERIOD 14000 diff --git a/node/IncomingPacket.cpp b/node/IncomingPacket.cpp index ff4fc94b..8f6dda63 100644 --- a/node/IncomingPacket.cpp +++ b/node/IncomingPacket.cpp @@ -80,6 +80,7 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr) if (!trusted) { if (!dearmor(peer->key())) { RR->t->incomingPacketMessageAuthenticationFailure(tPtr,_path,packetId(),sourceAddress,hops(),"invalid MAC"); + _path->recordPacket(false); return true; } } @@ -89,6 +90,8 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR,void *tPtr) return true; } + _path->recordPacket(true); + const Packet::Verb v = verb(); switch(v) { //case Packet::VERB_NOP: @@ -446,7 +449,7 @@ bool IncomingPacket::_doOK(const RuntimeEnvironment *RR,void *tPtr,const SharedP } if (!hops()) - _path->updateLatency((unsigned int)latency); + _path->updateLatency((unsigned int)latency, RR->node->now()); peer->setRemoteVersion(vProto,vMajor,vMinor,vRevision); diff --git a/node/Multicaster.cpp b/node/Multicaster.cpp index 753e4ee0..b2a5a205 100644 --- a/node/Multicaster.cpp +++ b/node/Multicaster.cpp @@ -190,7 +190,7 @@ void Multicaster::send( for(unsigned int i=0;i<multicastReplicatorCount;++i) { const SharedPtr<Peer> p(RR->topology->getPeerNoCache(multicastReplicators[i])); if ((p)&&(p->isAlive(now))) { - const SharedPtr<Path> pp(p->getBestPath(now,false)); + const SharedPtr<Path> pp(p->getAppropriatePath(now,false)); if ((pp)&&(pp->latency() < bestMulticastReplicatorLatency)) { bestMulticastReplicatorLatency = pp->latency(); bestMulticastReplicatorPath = pp; diff --git a/node/Node.cpp b/node/Node.cpp index db511430..71e5b6a7 100644 --- a/node/Node.cpp +++ b/node/Node.cpp @@ -234,7 +234,7 @@ public: } if ((!contacted)&&(_bestCurrentUpstream)) { - const SharedPtr<Path> up(_bestCurrentUpstream->getBestPath(_now,true)); + const SharedPtr<Path> up(_bestCurrentUpstream->getAppropriatePath(_now,true)); if (up) p->sendHELLO(_tPtr,up->localSocket(),up->address(),_now); } @@ -465,7 +465,7 @@ ZT_PeerList *Node::peers() const p->role = RR->topology->role(pi->second->identity().address()); std::vector< SharedPtr<Path> > paths(pi->second->paths(_now)); - SharedPtr<Path> bestp(pi->second->getBestPath(_now,false)); + SharedPtr<Path> bestp(pi->second->getAppropriatePath(_now,false)); p->pathCount = 0; for(std::vector< SharedPtr<Path> >::iterator path(paths.begin());path!=paths.end();++path) { ZT_FAST_MEMCPY(&(p->paths[p->pathCount].address),&((*path)->address()),sizeof(struct sockaddr_storage)); diff --git a/node/Node.hpp b/node/Node.hpp index 79284b63..4ad0e3cd 100644 --- a/node/Node.hpp +++ b/node/Node.hpp @@ -260,6 +260,9 @@ public: inline const Address &remoteTraceTarget() const { return _remoteTraceTarget; } inline Trace::Level remoteTraceLevel() const { return _remoteTraceLevel; } + inline void setMultipathMode(uint8_t mode) { _multipathMode = mode; } + inline uint8_t getMultipathMode() { return _multipathMode; } + private: RuntimeEnvironment _RR; RuntimeEnvironment *RR; @@ -284,6 +287,8 @@ private: Address _remoteTraceTarget; enum Trace::Level _remoteTraceLevel; + uint8_t _multipathMode; + volatile int64_t _now; int64_t _lastPingCheck; int64_t _lastHousekeepingRun; diff --git a/node/Path.hpp b/node/Path.hpp index e12328ff..5751d326 100644 --- a/node/Path.hpp +++ b/node/Path.hpp @@ -39,6 +39,9 @@ #include "SharedPtr.hpp" #include "AtomicCounter.hpp" #include "Utils.hpp" +#include "RingBuffer.hpp" + +#include "../osdep/Phy.hpp" /** * Maximum return value of preferenceRank() @@ -55,6 +58,7 @@ class RuntimeEnvironment; class Path { friend class SharedPtr<Path>; + Phy<Path *> *_phy; public: /** @@ -93,22 +97,71 @@ public: _lastOut(0), _lastIn(0), _lastTrustEstablishedPacketReceived(0), + _lastPathQualityComputeTime(0), _localSocket(-1), _latency(0xffff), _addr(), - _ipScope(InetAddress::IP_SCOPE_NONE) + _ipScope(InetAddress::IP_SCOPE_NONE), + _currentPacketSampleCounter(0), + _meanPacketErrorRatio(0.0), + _meanLatency(0.0), + _lastLatencyUpdate(0), + _jitter(0.0), + _lastPathQualitySampleTime(0), + _lastComputedQuality(0.0), + _lastPathQualityEstimate(0), + _meanAge(0.0), + _meanThroughput(0.0), + _packetLossRatio(0) { + memset(_ifname, 0, sizeof(_ifname)); + memset(_addrString, 0, sizeof(_addrString)); + _throughputSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ); + _ageSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ); + _latencySamples = new RingBuffer<uint32_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ); + _errSamples = new RingBuffer<float>(ZT_PATH_QUALITY_METRIC_WIN_SZ); } Path(const int64_t localSocket,const InetAddress &addr) : _lastOut(0), _lastIn(0), _lastTrustEstablishedPacketReceived(0), + _lastPathQualityComputeTime(0), _localSocket(localSocket), _latency(0xffff), _addr(addr), - _ipScope(addr.ipScope()) + _ipScope(addr.ipScope()), + _currentPacketSampleCounter(0), + _meanPacketErrorRatio(0.0), + _meanLatency(0.0), + _lastLatencyUpdate(0), + _jitter(0.0), + _lastPathQualitySampleTime(0), + _lastComputedQuality(0.0), + _lastPathQualityEstimate(0), + _meanAge(0.0), + _meanThroughput(0.0), + _packetLossRatio(0) + { + memset(_ifname, 0, sizeof(_ifname)); + memset(_addrString, 0, sizeof(_addrString)); + _throughputSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ); + _ageSamples = new RingBuffer<uint64_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ); + _latencySamples = new RingBuffer<uint32_t>(ZT_PATH_QUALITY_METRIC_WIN_SZ); + _errSamples = new RingBuffer<float>(ZT_PATH_QUALITY_METRIC_WIN_SZ); + } + + ~Path() { + delete _throughputSamples; + delete _ageSamples; + delete _latencySamples; + delete _errSamples; + + _throughputSamples = NULL; + _ageSamples = NULL; + _latencySamples = NULL; + _errSamples = NULL; } /** @@ -147,12 +200,17 @@ public: * * @param l Measured latency */ - inline void updateLatency(const unsigned int l) + inline void updateLatency(const unsigned int l, int64_t now) { unsigned int pl = _latency; - if (pl < 0xffff) + if (pl < 0xffff) { _latency = (pl + l) / 2; - else _latency = l; + } + else { + _latency = l; + } + _lastLatencyUpdate = now; + _latencySamples->push(l); } /** @@ -241,11 +299,180 @@ public: } /** + * @return An estimate of path quality -- higher is better. + */ + inline float computeQuality(const int64_t now) + { + float latency_contrib = _meanLatency ? 1.0 / _meanLatency : 0; + float jitter_contrib = _jitter ? 1.0 / _jitter : 0; + float throughput_contrib = _meanThroughput ? _meanThroughput / 1000000 : 0; // in Mbps + float age_contrib = _meanAge > 0 ? (float)sqrt(_meanAge) : 1; + float error_contrib = 1.0 - _meanPacketErrorRatio; + float sum = (latency_contrib + jitter_contrib + throughput_contrib + error_contrib) / age_contrib; + _lastComputedQuality = sum * (long)((_ipScope) + 1); + return _lastComputedQuality; + } + + /** + * Since quality estimates can become expensive we should cache the most recent result for traffic allocation + * algorithms which may need to reference this value multiple times through the course of their execution. + */ + inline float lastComputedQuality() { + return _lastComputedQuality; + } + + /** + * @return A pointer to a cached copy of the human-readable name of the interface this Path's localSocket is bound to + */ + inline char *getName() { return _ifname; } + + /** + * @return Estimated throughput in bps of this link + */ + inline uint64_t getThroughput() { return _phy->getThroughput((PhySocket *)((uintptr_t)_localSocket)); } + + /** + * @return Packet delay varience + */ + inline float jitter() { return _jitter; } + + /** + * @return Previously-computed mean latency + */ + inline float meanLatency() { return _meanLatency; } + + /** + * @return Packet loss rate + */ + inline float packetLossRatio() { return _packetLossRatio; } + + /** + * @return Mean packet error ratio + */ + inline float meanPacketErrorRatio() { return _meanPacketErrorRatio; } + + /** + * @return Current packet error ratio (possibly incomplete sample set) + */ + inline float currentPacketErrorRatio() { + int errorsPerSample = 0; + for (int i=0; i<_currentPacketSampleCounter; i++) { + if (_packetValidity[i] == false) { + errorsPerSample++; + } + } + return (float)errorsPerSample / (float)ZT_PATH_ERROR_SAMPLE_WIN_SZ; + } + + /** + * @return Whether the Path's local socket is in a CLOSED state + */ + inline bool isClosed() { return _phy->isClosed((PhySocket *)((uintptr_t)_localSocket)); } + + /** + * @return The state of a Path's local socket + */ + inline int getState() { return _phy->getState((PhySocket *)((uintptr_t)_localSocket)); } + + /** + * @return Whether this socket may have been erased by the virtual physical link layer + */ + inline bool isValidState() { return _phy->isValidState((PhySocket *)((uintptr_t)_localSocket)); } + + /** + * @return Whether the path quality monitors have collected enough data to provide a quality value + * TODO: expand this + */ + inline bool monitorsReady() { + return _latencySamples->count() && _ageSamples->count() && _throughputSamples->count(); + } + + /** + * @return A pointer to a cached copy of the address string for this Path (For debugging only) + */ + inline char *getAddressString() { return _addrString; } + + /** + * Handle path sampling, computation of quality estimates, and other periodic tasks + * @param now Current time + */ + inline void measureLink(int64_t now) { + // Sample path properties and store them in a continuously-revolving buffer + if (now - _lastPathQualitySampleTime > ZT_PATH_QUALITY_SAMPLE_INTERVAL) { + _lastPathQualitySampleTime = now; + _throughputSamples->push(getThroughput()); // Thoughtput in bits/s + _ageSamples->push(now - _lastIn); // Age (time since last received packet) + if (now - _lastLatencyUpdate > ZT_PATH_LATENCY_SAMPLE_INTERVAL) { + _lastLatencyUpdate = now; + // Record 0 bp/s. Since we're using this to detect possible packet loss + updateLatency(0, now); + } + } + // Compute statistical values for use in link quality estimates + if (now - _lastPathQualityComputeTime > ZT_PATH_QUALITY_COMPUTE_INTERVAL) { + _lastPathQualityComputeTime = now; + // Cache Path address string + address().toString(_addrString); + _phy->getIfName((PhySocket *)((uintptr_t)_localSocket), _ifname, ZT_PATH_INTERFACE_NAME_SZ); // Cache Interface name + // Derived values + if (_throughputSamples->count()) { + _packetLossRatio = (float)_throughputSamples->zeroCount() / (float)_throughputSamples->count(); + } + _meanThroughput = _throughputSamples->mean(); + _meanAge = _ageSamples->mean(); + _meanLatency = _latencySamples->mean(); + // Jitter + // SEE: RFC 3393, RFC 4689 + _jitter = _latencySamples->stddev(); + _meanPacketErrorRatio = _errSamples->mean(); // Packet Error Ratio (PER) + } + // Periodically compute a path quality estimate + if (now - _lastPathQualityEstimate > ZT_PATH_QUALITY_ESTIMATE_INTERVAL) { + computeQuality(now); + } + } + + /** + * Record whether a packet is considered invalid by MAC/compression/cipher checks. This + * could be an indication of a bit error. This function will keep a running counter of + * up to a given window size and with each counter overflow it will compute a mean error rate + * and store that in a continuously shifting sample window. + * + * @param isValid Whether the packet in question is considered invalid + */ + inline void recordPacket(bool isValid) { + if (_currentPacketSampleCounter < ZT_PATH_ERROR_SAMPLE_WIN_SZ) { + _packetValidity[_currentPacketSampleCounter] = isValid; + _currentPacketSampleCounter++; + } + else { + // Sample array is full, compute an mean and stick it in the ring buffer for trend analysis + _errSamples->push(currentPacketErrorRatio()); + _currentPacketSampleCounter=0; + } + } + + /** + * @return The mean age (in ms) of this link + */ + inline float meanAge() { return _meanAge; } + + /** + * @return The mean throughput (in bits/s) of this link + */ + inline float meanThroughput() { return _meanThroughput; } + + /** * @return True if this path is alive (receiving heartbeats) */ inline bool alive(const int64_t now) const { return ((now - _lastIn) < (ZT_PATH_HEARTBEAT_PERIOD + 5000)); } /** + * @return True if this path hasn't received a packet in a "significant" amount of time + */ + inline bool stale(const int64_t now) const { return ((now - _lastIn) > ZT_LINK_SPEED_TEST_INTERVAL * 10); } + + /** * @return True if this path needs a heartbeat */ inline bool needsHeartbeat(const int64_t now) const { return ((now - _lastOut) >= ZT_PATH_HEARTBEAT_PERIOD); } @@ -269,11 +496,39 @@ private: volatile int64_t _lastOut; volatile int64_t _lastIn; volatile int64_t _lastTrustEstablishedPacketReceived; + volatile int64_t _lastPathQualityComputeTime; int64_t _localSocket; volatile unsigned int _latency; InetAddress _addr; InetAddress::IpScope _ipScope; // memoize this since it's a computed value checked often AtomicCounter __refCount; + + // Packet Error Ratio (PER) + int _packetValidity[ZT_PATH_ERROR_SAMPLE_WIN_SZ]; + int _currentPacketSampleCounter; + volatile float _meanPacketErrorRatio; + + // Latency and Jitter + volatile float _meanLatency; + int64_t _lastLatencyUpdate; + volatile float _jitter; + + int64_t _lastPathQualitySampleTime; + float _lastComputedQuality; + int64_t _lastPathQualityEstimate; + float _meanAge; + float _meanThroughput; + + // Circular buffers used to efficiently store large time series + RingBuffer<uint64_t> *_throughputSamples; + RingBuffer<uint32_t> *_latencySamples; + RingBuffer<uint64_t> *_ageSamples; + RingBuffer<float> *_errSamples; + + float _packetLossRatio; + + char _ifname[ZT_PATH_INTERFACE_NAME_SZ]; + char _addrString[256]; }; } // namespace ZeroTier diff --git a/node/Peer.cpp b/node/Peer.cpp index 71afd852..862a4529 100644 --- a/node/Peer.cpp +++ b/node/Peer.cpp @@ -35,6 +35,7 @@ #include "Packet.hpp" #include "Trace.hpp" #include "InetAddress.hpp" +#include "RingBuffer.hpp" namespace ZeroTier { @@ -59,10 +60,14 @@ Peer::Peer(const RuntimeEnvironment *renv,const Identity &myIdentity,const Ident _vRevision(0), _id(peerIdentity), _directPathPushCutoffCount(0), - _credentialsCutoffCount(0) + _credentialsCutoffCount(0), + _linkBalanceStatus(false), + _linkRedundancyStatus(false) { if (!myIdentity.agree(peerIdentity,_key,ZT_PEER_SECRET_KEY_LENGTH)) throw ZT_EXCEPTION_INVALID_ARGUMENT; + _pathChoiceHist = new RingBuffer<int>(ZT_MULTIPATH_PROPORTION_WIN_SZ); + _flowBalanceHist = new RingBuffer<float>(ZT_MULTIPATH_PROPORTION_WIN_SZ); } void Peer::received( @@ -95,6 +100,18 @@ void Peer::received( path->trustedPacketReceived(now); } + if (RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) { + if ((now - _lastPathPrune) > ZT_CLOSED_PATH_PRUNING_INTERVAL) { + _lastPathPrune = now; + prunePaths(); + } + for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { + if (_paths[i].p) { + _paths[i].p->measureLink(now); + } + } + } + if (hops == 0) { // If this is a direct packet (no hops), update existing paths or learn new ones @@ -232,26 +249,246 @@ void Peer::received( } } -SharedPtr<Path> Peer::getBestPath(int64_t now,bool includeExpired) const +SharedPtr<Path> Peer::getAppropriatePath(int64_t now, bool includeExpired) { Mutex::Lock _l(_paths_m); - unsigned int bestPath = ZT_MAX_PEER_NETWORK_PATHS; - long bestPathQuality = 2147483647; + + /** + * Send traffic across the highest quality path only. This algorithm will still + * use the old path quality metric. + */ + if (RR->node->getMultipathMode() == ZT_MULTIPATH_NONE) { + long bestPathQuality = 2147483647; + for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { + if (_paths[i].p && _paths[i].p->isValidState()) { + if ((includeExpired)||((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION)) { + const long q = _paths[i].p->quality(now) / _paths[i].priority; + if (q <= bestPathQuality) { + bestPathQuality = q; + bestPath = i; + } + } + } else break; + } + if (bestPath != ZT_MAX_PEER_NETWORK_PATHS) { + return _paths[bestPath].p; + } + return SharedPtr<Path>(); + } + + if ((now - _lastPathPrune) > ZT_CLOSED_PATH_PRUNING_INTERVAL) { + _lastPathPrune = now; + prunePaths(); + } for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { if (_paths[i].p) { - if ((includeExpired)||((now - _paths[i].lr) < ZT_PEER_PATH_EXPIRATION)) { - const long q = _paths[i].p->quality(now) / _paths[i].priority; - if (q <= bestPathQuality) { - bestPathQuality = q; + _paths[i].p->measureLink(now); + } + } + + /** + * Randomly distribute traffic across all paths + * + * Behavior: + * - If path DOWN: Stop randomly choosing that path + * - If path UP: Start randomly choosing that path + * - If all paths are unresponsive: randomly choose from all paths + */ + int numAlivePaths = 0; + int numStalePaths = 0; + if (RR->node->getMultipathMode() == ZT_MULTIPATH_RANDOM) { + int alivePaths[ZT_MAX_PEER_NETWORK_PATHS]; + int stalePaths[ZT_MAX_PEER_NETWORK_PATHS]; + memset(&alivePaths, -1, sizeof(alivePaths)); + memset(&stalePaths, -1, sizeof(stalePaths)); + for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { + if (_paths[i].p) { + if (_paths[i].p->isValidState()) { + if (_paths[i].p->alive(now)) { + alivePaths[numAlivePaths] = i; + numAlivePaths++; + } + else { + stalePaths[numStalePaths] = i; + numStalePaths++; + } + } + } + } + unsigned int r; + Utils::getSecureRandom(&r, 1); + if (numAlivePaths > 0) { + // pick a random out of the set deemed "alive" + int rf = (float)(r %= numAlivePaths); + return _paths[alivePaths[rf]].p; + } + else if(numStalePaths > 0) { + // resort to trying any non-expired path + int rf = (float)(r %= numStalePaths); + return _paths[stalePaths[rf]].p; + } + } + + /** + * Proportionally allocate traffic according to dynamic path quality measurements + */ + if (RR->node->getMultipathMode() == ZT_MULTIPATH_PROPORTIONALLY_BALANCED) { + float relq[ZT_MAX_PEER_NETWORK_PATHS]; + memset(&relq, 0, sizeof(relq)); + float alloc[ZT_MAX_PEER_NETWORK_PATHS]; + memset(&alloc, 0, sizeof(alloc)); + + // Survey + // + // Take a survey of all available link qualities. We use this to determine if we + // can skip this algorithm altogether and if not, to establish baseline for physical + // link quality used in later calculations. + // + // We find the min/max quality of our currently-active links so + // that we can form a relative scale to rank each link proportionally + // to each other link. + uint16_t alivePaths[ZT_MAX_PEER_NETWORK_PATHS]; + uint16_t stalePaths[ZT_MAX_PEER_NETWORK_PATHS]; + memset(&alivePaths, -1, sizeof(alivePaths)); + memset(&stalePaths, -1, sizeof(stalePaths)); + uint16_t numAlivePaths = 0; + uint16_t numStalePaths = 0; + float minQuality = 10000; + float maxQuality = -1; + float currQuality; + for(uint16_t i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { + if (_paths[i].p && _paths[i].p->isValidState()) { + if (!_paths[i].p->monitorsReady()) { + // TODO: This should fix itself anyway but we should test whether forcing the use of a new path will + // aid in establishing flow balance more quickly. + } + // Compute quality here, going forward we will use lastComputedQuality() + currQuality = _paths[i].p->computeQuality(now); + if (!_paths[i].p->stale(now)) { + alivePaths[i] = currQuality; + numAlivePaths++; + } + else { + stalePaths[i] = currQuality; + numStalePaths++; + } + if (currQuality > maxQuality) { + maxQuality = currQuality; bestPath = i; } + if (currQuality < minQuality) { + minQuality = currQuality; + } + relq[i] = currQuality; } - } else break; + } + + // Attempt to find an excuse not to use the rest of this algorithm + if (bestPath == ZT_MAX_PEER_NETWORK_PATHS || (numAlivePaths == 0 && numStalePaths == 0)) { + return SharedPtr<Path>(); + } if (numAlivePaths == 1) { + return _paths[bestPath].p; + } if (numStalePaths == 1) { + return _paths[bestPath].p; + } + + // Relative quality + // + // The strongest link will have a value of 1.0 whereas every other + // link will have a value which represents some fraction of the strongest link. + float totalRelativeQuality = 0; + for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { + if (_paths[i].p && _paths[i].p->isValidState()) { + relq[i] /= maxQuality ? maxQuality : 1; + totalRelativeQuality += relq[i]; + } + } + + // Convert the relative quality values into flow allocations. + // Additionally, determine whether each path in the flow is + // contributing more or less than its target allocation. If + // it is contributing more than required, don't allow it to be + // randomly selected for the next packet. If however the path + // needs to contribute more to the flow, we should record + float imbalance = 0; + float qualityScalingFactor = 1.0 / totalRelativeQuality; + for(uint16_t i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { + // Out of the last N packets to this peer, how many were sent by this path? + int numPktSentWithinWin = (int)_pathChoiceHist->countValue((float)i); + // Compute traffic allocation for each path in the flow + if (_paths[i].p && _paths[i].p->isValidState()) { + // Allocation + // This is the percentage of traffic we want to send over a given path + alloc[i] = relq[i] * qualityScalingFactor; + float currProportion = numPktSentWithinWin / (float)ZT_MULTIPATH_PROPORTION_WIN_SZ; + float targetProportion = alloc[i]; + float diffProportion = currProportion - targetProportion; + // Imbalance + // + // This is the sum of the distances of each path's currently observed flow contributions + // from its most recent target allocation. In other words, this is a measure of how closely we + // are adhering to our desired allocations. It is worth noting that this value can be greater + // than 1.0 if a significant change to allocations is made by the algorithm, this will + // eventually correct itself. + imbalance += fabs(diffProportion); + if (diffProportion < 0) { + alloc[i] = targetProportion; + } + else { + alloc[i] = targetProportion; + } + } + } + + // Compute and record current flow balance + float balance = 1.0 - imbalance; + if (balance >= ZT_MULTIPATH_FLOW_BALANCE_THESHOLD) { + if (!_linkBalanceStatus) { + _linkBalanceStatus = true; + RR->t->peerLinkBalanced(NULL,0,*this); + } + } + else { + if (_linkBalanceStatus) { + _linkBalanceStatus = false; + RR->t->peerLinkImbalanced(NULL,0,*this); + } + } + + // Record the current flow balance. Later used for computing a mean flow balance value. + _flowBalanceHist->push(balance); + + // Randomly choose path from allocated candidates + unsigned int r; + Utils::getSecureRandom(&r, 1); + float rf = (float)(r %= 100) / 100; + for(int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { + if (_paths[i].p && _paths[i].p->isValidState() && _paths[i].p->address().isV4()) { + if (alloc[i] > 0 && rf < alloc[i]) { + bestPath = i; + _pathChoiceHist->push(bestPath); // Record which path we chose + break; + } + if (alloc[i] > 0) { + rf -= alloc[i]; + } + else { + rf -= alloc[i]*-1; + } + } + } + if (bestPath < ZT_MAX_PEER_NETWORK_PATHS) { + return _paths[bestPath].p; + } + return SharedPtr<Path>(); + } + + // Adhere to a user-defined interface/allocation scheme + if (RR->node->getMultipathMode() == ZT_MULTIPATH_MANUALLY_BALANCED) { + // TODO } - if (bestPath != ZT_MAX_PEER_NETWORK_PATHS) - return _paths[bestPath].p; return SharedPtr<Path>(); } @@ -477,16 +714,34 @@ unsigned int Peer::doPingAndKeepalive(void *tPtr,int64_t now) } } else break; } - while(j < ZT_MAX_PEER_NETWORK_PATHS) { - _paths[j].lr = 0; - _paths[j].p.zero(); - _paths[j].priority = 1; - ++j; + if (RR->node->getMultipathMode() != ZT_MULTIPATH_NONE) { + while(j < ZT_MAX_PEER_NETWORK_PATHS) { + _paths[j].lr = 0; + _paths[j].p.zero(); + _paths[j].priority = 1; + ++j; + } } - return sent; } +unsigned int Peer::prunePaths() +{ + Mutex::Lock _l(_paths_m); + unsigned int pruned = 0; + for(unsigned int i=0;i<ZT_MAX_PEER_NETWORK_PATHS;++i) { + if (_paths[i].p) { + if(_paths[i].p->isClosed() || !_paths[i].p->isValidState()) { + _paths[i].lr = 0; + _paths[i].p.zero(); + _paths[i].priority = 1; + pruned++; + } + } + } + return pruned; +} + void Peer::clusterRedirect(void *tPtr,const SharedPtr<Path> &originatingPath,const InetAddress &remoteAddress,const int64_t now) { SharedPtr<Path> np(RR->topology->getPath(originatingPath->localSocket(),remoteAddress)); diff --git a/node/Peer.hpp b/node/Peer.hpp index b6f3c695..9873729b 100644 --- a/node/Peer.hpp +++ b/node/Peer.hpp @@ -65,7 +65,13 @@ private: Peer() {} // disabled to prevent bugs -- should not be constructed uninitialized public: - ~Peer() { Utils::burn(_key,sizeof(_key)); } + ~Peer() { + Utils::burn(_key,sizeof(_key)); + delete _pathChoiceHist; + delete _flowBalanceHist; + _pathChoiceHist = NULL; + _flowBalanceHist = NULL; + } /** * Construct a new peer @@ -145,20 +151,20 @@ public: */ inline bool sendDirect(void *tPtr,const void *data,unsigned int len,int64_t now,bool force) { - SharedPtr<Path> bp(getBestPath(now,force)); + SharedPtr<Path> bp(getAppropriatePath(now,force)); if (bp) return bp->send(RR,tPtr,data,len,now); return false; } /** - * Get the best current direct path + * Get the most appropriate direct path based on current multipath configuration * * @param now Current time * @param includeExpired If true, include even expired paths * @return Best current path or NULL if none */ - SharedPtr<Path> getBestPath(int64_t now,bool includeExpired) const; + SharedPtr<Path> getAppropriatePath(int64_t now, bool includeExpired); /** * Send VERB_RENDEZVOUS to this and another peer via the best common IP scope and path @@ -213,6 +219,16 @@ public: unsigned int doPingAndKeepalive(void *tPtr,int64_t now); /** + * Clear paths whose localSocket(s) are in a CLOSED state or have an otherwise INVALID state. + * This should be called frequently so that we can detect and remove unproductive or invalid paths. + * + * Under the hood this is done periodically based on ZT_CLOSED_PATH_PRUNING_INTERVAL. + * + * @return Number of paths that were pruned this round + */ + unsigned int prunePaths(); + + /** * Process a cluster redirect sent by this peer * * @param tPtr Thread pointer to be handed through to any callbacks called as a result of this call @@ -270,9 +286,9 @@ public: /** * @return Latency in milliseconds of best path or 0xffff if unknown / no paths */ - inline unsigned int latency(const int64_t now) const + inline unsigned int latency(const int64_t now) { - SharedPtr<Path> bp(getBestPath(now,false)); + SharedPtr<Path> bp(getAppropriatePath(now,false)); if (bp) return bp->latency(); return 0xffff; @@ -289,7 +305,7 @@ public: * * @return Relay quality score computed from latency and other factors, lower is better */ - inline unsigned int relayQuality(const int64_t now) const + inline unsigned int relayQuality(const int64_t now) { const uint64_t tsr = now - _lastReceive; if (tsr >= ZT_PEER_ACTIVITY_TIMEOUT) @@ -515,6 +531,7 @@ private: int64_t _lastCredentialsReceived; int64_t _lastTrustEstablishedPacketReceived; int64_t _lastSentFullHello; + int64_t _lastPathPrune; uint16_t _vProto; uint16_t _vMajor; @@ -530,6 +547,13 @@ private: unsigned int _credentialsCutoffCount; AtomicCounter __refCount; + + RingBuffer<int> *_pathChoiceHist; + RingBuffer<float> *_flowBalanceHist; + + bool _linkBalanceStatus; + bool _linkRedundancyStatus; + }; } // namespace ZeroTier diff --git a/node/RingBuffer.hpp b/node/RingBuffer.hpp new file mode 100644 index 00000000..62f1cf47 --- /dev/null +++ b/node/RingBuffer.hpp @@ -0,0 +1,315 @@ +/* + * ZeroTier One - Network Virtualization Everywhere + * Copyright (C) 2011-2018 ZeroTier, Inc. https://www.zerotier.com/ + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * -- + * + * You can be released from the requirements of the license by purchasing + * a commercial license. Buying such a license is mandatory as soon as you + * develop commercial closed-source software that incorporates or links + * directly against ZeroTier software without disclosing the source code + * of your own application. + */ + +#ifndef ZT_RINGBUFFER_H +#define ZT_RINGBUFFER_H + +#include <typeinfo> +#include <cstdint> +#include <stdlib.h> +#include <memory.h> +#include <algorithm> +#include <math.h> + +namespace ZeroTier { + +/** + * A revolving (ring) buffer. + * + * For fast handling of continuously-evolving variables (such as path quality metrics). + * Using this, we can maintain longer sliding historical windows for important path + * metrics without the need for potentially expensive calls to memcpy/memmove. + * + * Some basic statistical functionality is implemented here in an attempt + * to reduce the complexity of code needed to interact with this type of buffer. + */ + +template <class T> +class RingBuffer +{ +private: + T * buf; + size_t size; + size_t begin; + size_t end; + bool wrap; + +public: + + /** + * create a RingBuffer with space for up to size elements. + */ + explicit RingBuffer(size_t size) + : size(size), + begin(0), + end(0), + wrap(false) + { + buf = new T[size]; + memset(buf, 0, sizeof(T) * size); + } + + /** + * @return A pointer to the underlying buffer + */ + T* get_buf() + { + return buf + begin; + } + + /** + * Adjust buffer index pointer as if we copied data in + * @param n Number of elements to copy in + * @return Number of elements we copied in + */ + size_t produce(size_t n) + { + n = std::min(n, getFree()); + if (n == 0) { + return n; + } + const size_t first_chunk = std::min(n, size - end); + end = (end + first_chunk) % size; + if (first_chunk < n) { + const size_t second_chunk = n - first_chunk; + end = (end + second_chunk) % size; + } + if (begin == end) { + wrap = true; + } + return n; + } + + /** + * Fast erase, O(1). + * Merely reset the buffer pointer, doesn't erase contents + */ + void reset() + { + consume(count()); + } + + /** + * adjust buffer index pointer as if we copied data out + * @param n Number of elements we copied from the buffer + * @return Number of elements actually available from the buffer + */ + size_t consume(size_t n) + { + n = std::min(n, count()); + if (n == 0) { + return n; + } + if (wrap) { + wrap = false; + } + const size_t first_chunk = std::min(n, size - begin); + begin = (begin + first_chunk) % size; + if (first_chunk < n) { + const size_t second_chunk = n - first_chunk; + begin = (begin + second_chunk) % size; + } + return n; + } + + /** + * @param data Buffer that is to be written to the ring + * @param n Number of elements to write to the buffer + */ + size_t write(const T * data, size_t n) + { + n = std::min(n, getFree()); + if (n == 0) { + return n; + } + const size_t first_chunk = std::min(n, size - end); + memcpy(buf + end, data, first_chunk * sizeof(T)); + end = (end + first_chunk) % size; + if (first_chunk < n) { + const size_t second_chunk = n - first_chunk; + memcpy(buf + end, data + first_chunk, second_chunk * sizeof(T)); + end = (end + second_chunk) % size; + } + if (begin == end) { + wrap = true; + } + return n; + } + + /** + * Place a single value on the buffer. If the buffer is full, consume a value first. + * + * @param value A single value to be placed in the buffer + */ + void push(const T value) + { + if (count() == size) { + consume(1); + } + write(&value, 1); + } + + /** + * @param dest Destination buffer + * @param n Size (in terms of number of elements) of the destination buffer + * @return Number of elements read from the buffer + */ + size_t read(T * dest, size_t n) + { + n = std::min(n, count()); + if (n == 0) { + return n; + } + if (wrap) { + wrap = false; + } + const size_t first_chunk = std::min(n, size - begin); + memcpy(dest, buf + begin, first_chunk * sizeof(T)); + begin = (begin + first_chunk) % size; + if (first_chunk < n) { + const size_t second_chunk = n - first_chunk; + memcpy(dest + first_chunk, buf + begin, second_chunk * sizeof(T)); + begin = (begin + second_chunk) % size; + } + return n; + } + + /** + * Return how many elements are in the buffer, O(1). + * + * @return The number of elements in the buffer + */ + size_t count() + { + if (end == begin) { + return wrap ? size : 0; + } + else if (end > begin) { + return end - begin; + } + else { + return size + end - begin; + } + } + + /** + * @return The number of slots that are unused in the buffer + */ + size_t getFree() + { + return size - count(); + } + + /** + * @return The arithmetic mean of the contents of the buffer + */ + T mean() + { + size_t iterator = begin; + T mean = 0; + for (int i=0; i<size; i++) { + iterator = (iterator + size - 1) % size; + mean += *(buf + iterator); + } + return count() ? mean / (T)count() : 0; + } + + /** + * @return The sample standard deviation of the contents of the ring buffer + */ + T stddev() + { + size_t iterator = begin; + T cached_mean = mean(); + if (size) { + T sum_of_squared_deviations = 0; + for (int i=0; i<size; i++) { + iterator = (iterator + size - 1) % size; + T deviation = (buf[i] - cached_mean); + T sdev = deviation*deviation; + sum_of_squared_deviations += sdev; + } + T variance = sum_of_squared_deviations / (size - 1); + T sd = sqrt(variance); + return sd; + } + return 0; + } + + /** + * @return The number of elements of zero value, O(n) + */ + size_t zeroCount() + { + size_t iterator = begin; + size_t zeros = 0; + for (int i=0; i<size; i++) { + iterator = (iterator + size - 1) % size; + if (*(buf + iterator) == 0) { + zeros++; + } + } + return zeros; + } + + /** + * @param value Value to match against in buffer + * @return The number of values held in the ring buffer which match a given value + */ + size_t countValue(T value) + { + size_t iterator = begin; + size_t count = 0; + for (int i=0; i<size; i++) { + iterator = (iterator + size - 1) % size; + if (*(buf + iterator) == value) { + count++; + } + } + return count; + } + + /** + * Print the contents of the buffer + */ + void dump() + { + size_t iterator = begin; + for (int i=0; i<size; i++) { + iterator = (iterator + size - 1) % size; + if (typeid(T) == typeid(int)) { + // DEBUG_INFO("buf[%2zu]=%2d", iterator, (int)*(buf + iterator)); + } + else { + // DEBUG_INFO("buf[%2zu]=%2f", iterator, (float)*(buf + iterator)); + } + } + } +}; + +} // namespace ZeroTier + +#endif diff --git a/node/Switch.cpp b/node/Switch.cpp index eb1ebadb..3f6c4e41 100644 --- a/node/Switch.cpp +++ b/node/Switch.cpp @@ -646,12 +646,12 @@ bool Switch::_trySend(void *tPtr,Packet &packet,bool encrypt) const SharedPtr<Peer> peer(RR->topology->getPeer(tPtr,destination)); if (peer) { - viaPath = peer->getBestPath(now,false); + viaPath = peer->getAppropriatePath(now,false); if (!viaPath) { peer->tryMemorizedPath(tPtr,now); // periodically attempt memorized or statically defined paths, if any are known const SharedPtr<Peer> relay(RR->topology->getUpstreamPeer()); - if ( (!relay) || (!(viaPath = relay->getBestPath(now,false))) ) { - if (!(viaPath = peer->getBestPath(now,true))) + if ( (!relay) || (!(viaPath = relay->getAppropriatePath(now,false))) ) { + if (!(viaPath = peer->getAppropriatePath(now,true))) return false; } } diff --git a/node/Topology.hpp b/node/Topology.hpp index 63946a32..5d726007 100644 --- a/node/Topology.hpp +++ b/node/Topology.hpp @@ -299,7 +299,7 @@ public: Address *a = (Address *)0; SharedPtr<Peer> *p = (SharedPtr<Peer> *)0; while (i.next(a,p)) { - const SharedPtr<Path> pp((*p)->getBestPath(now,false)); + const SharedPtr<Path> pp((*p)->getAppropriatePath(now,false)); if (pp) ++cnt; } diff --git a/node/Trace.cpp b/node/Trace.cpp index 386edaac..01a8da55 100644 --- a/node/Trace.cpp +++ b/node/Trace.cpp @@ -106,6 +106,26 @@ void Trace::peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId, } } +void Trace::peerLinkNowRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath) +{ + ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is fully redundant",peer.address().toInt(),networkId); +} + +void Trace::peerLinkNoLongerRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath) +{ + ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is no longer redundant",peer.address().toInt(),networkId); +} + +void Trace::peerLinkBalanced(void *const tPtr,const uint64_t networkId,Peer &peer) +{ + ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is balanced",peer.address().toInt(),networkId); +} + +void Trace::peerLinkImbalanced(void *const tPtr,const uint64_t networkId,Peer &peer) +{ + ZT_LOCAL_TRACE(tPtr,RR,"link to peer %.10llx on network %.16llx is unbalanced",peer.address().toInt(),networkId); +} + void Trace::peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId) { char tmp[128]; diff --git a/node/Trace.hpp b/node/Trace.hpp index 2a2fca6c..b01163d6 100644 --- a/node/Trace.hpp +++ b/node/Trace.hpp @@ -121,6 +121,12 @@ public: void resettingPathsInScope(void *const tPtr,const Address &reporter,const InetAddress &reporterPhysicalAddress,const InetAddress &myPhysicalAddress,const InetAddress::IpScope scope); void peerConfirmingUnknownPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &path,const uint64_t packetId,const Packet::Verb verb); + + void peerLinkNowRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath); + void peerLinkNoLongerRedundant(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath); + void peerLinkBalanced(void *const tPtr,const uint64_t networkId,Peer &peer); + void peerLinkImbalanced(void *const tPtr,const uint64_t networkId,Peer &peer); + void peerLearnedNewPath(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath,const uint64_t packetId); void peerRedirected(void *const tPtr,const uint64_t networkId,Peer &peer,const SharedPtr<Path> &newPath); diff --git a/osdep/Binder.hpp b/osdep/Binder.hpp index 93fad9f1..6e13836c 100644 --- a/osdep/Binder.hpp +++ b/osdep/Binder.hpp @@ -388,6 +388,7 @@ public: _bindings[_bindingCount].udpSock = udps; _bindings[_bindingCount].tcpListenSock = tcps; _bindings[_bindingCount].address = ii->first; + phy.setIfName(udps, (char*)ii->second.c_str(), ii->second.length()); ++_bindingCount; } } else { @@ -455,6 +456,20 @@ public: return false; } + /** + * Get a list of socket pointers for all bindings. + * + * @return A list of socket pointers for current bindings + */ + inline std::vector<PhySocket*> getBoundSockets() + { + std::vector<PhySocket*> sockets; + for (int i=0; i<ZT_BINDER_MAX_BINDINGS; i++) { + sockets.push_back(_bindings[i].udpSock); + } + return sockets; + } + private: _Binding _bindings[ZT_BINDER_MAX_BINDINGS]; std::atomic<unsigned int> _bindingCount; diff --git a/osdep/Phy.hpp b/osdep/Phy.hpp index e359ccdd..d4f68681 100644 --- a/osdep/Phy.hpp +++ b/osdep/Phy.hpp @@ -27,6 +27,8 @@ #ifndef ZT_PHY_HPP #define ZT_PHY_HPP +#include "../osdep/OSUtils.hpp" + #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -86,6 +88,22 @@ namespace ZeroTier { */ typedef void PhySocket; +struct link_test_record +{ + link_test_record(PhySocket *_s, uint64_t _id, uint64_t _egress_time, uint32_t _length) : + s(_s), + id(_id), + egress_time(_egress_time), + length(_length) + { + // + } + PhySocket *s; + uint64_t id; + uint64_t egress_time; + uint32_t length; +}; + /** * Simple templated non-blocking sockets implementation * @@ -154,10 +172,17 @@ private: struct PhySocketImpl { + PhySocketImpl() : + throughput(0) + { + memset(ifname, 0, sizeof(ifname)); + } PhySocketType type; ZT_PHY_SOCKFD_TYPE sock; void *uptr; // user-settable pointer ZT_PHY_SOCKADDR_STORAGE_TYPE saddr; // remote for TCP_OUT and TCP_IN, local for TCP_LISTEN, RAW, and UDP + char ifname[16]; + uint64_t throughput; }; std::list<PhySocketImpl> _socks; @@ -173,6 +198,7 @@ private: bool _noDelay; bool _noCheck; + std::vector<struct link_test_record*> link_test_records; public: /** @@ -250,6 +276,173 @@ public: static inline void** getuptr(PhySocket *s) throw() { return &(reinterpret_cast<PhySocketImpl *>(s)->uptr); } /** + * @param s Socket object + * @param nameBuf Buffer to store name of interface which this Socket object is bound to + * @param buflen Length of buffer to copy name into + */ + static inline void getIfName(PhySocket *s, char *nameBuf, int buflen) + { + memcpy(nameBuf, reinterpret_cast<PhySocketImpl *>(s)->ifname, buflen); + } + + /** + * @param s Socket object + * @param ifname Buffer containing name of interface that this Socket object is bound to + * @param len Length of name of interface + */ + static inline void setIfName(PhySocket *s, char *ifname, int len) + { + memcpy(&(reinterpret_cast<PhySocketImpl *>(s)->ifname), ifname, len); + } + + /** + * Get result of most recent throughput test + * + * @param s Socket object + */ + inline uint64_t getThroughput(PhySocket *s) + { + PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s)); + return sws ? sws->throughput : 0; + } + + /** + * Whether or not the socket object is in a closed state + * + * @param s Socket object + * @return true if socket is closed, false if otherwise + */ + inline bool isClosed(PhySocket *s) + { + PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s)); + return sws->type == ZT_PHY_SOCKET_CLOSED; + } + + /** + * Get state of socket object + * + * @param s Socket object + * @return State of socket + */ + inline int getState(PhySocket *s) + { + PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s)); + return sws->type; + } + + /** + * In the event that this socket is erased, we need a way to convey to the multipath logic + * that this path is no longer valid. + * + * @param s Socket object + * @return Whether the state of this socket is within an acceptable range of values + */ + inline bool isValidState(PhySocket *s) + { + PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s)); + return sws->type >= ZT_PHY_SOCKET_CLOSED && sws->type <= ZT_PHY_SOCKET_UNIX_LISTEN; + } + + /** + * Send a datagram of a known size to a selected peer and record egress time. The peer + * shall eventually respond by echoing back a smaller datagram. + * + * @param s Socket object + * @param remoteAddress Address of remote peer to receive link test packet + * @param data Buffer containing random packet data + * @param len Length of packet data buffer + * @return Number of bytes successfully written to socket + */ + inline int test_link_speed(PhySocket *s, const struct sockaddr *to, void *data, uint32_t len) { + if (!reinterpret_cast<PhySocketImpl *>(s)) { + return 0; + } + uint64_t *buf = (uint64_t*)data; + uint64_t id = buf[0]; + if (to->sa_family != AF_INET && to->sa_family != AF_INET6) { + return 0; + } + uint64_t egress_time = OSUtils::now(); + PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s)); +#if defined(_WIN32) || defined(_WIN64) + return ((long)::sendto(sws->sock,reinterpret_cast<const char *>(data),len,0,to,(to->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in)) == (long)len); +#else + int w = ::sendto(sws->sock,data,len,0,to,(to->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in)); +#endif + if (w > 0) { + link_test_records.push_back(new link_test_record(s, id, egress_time, len)); + } + return w; + } + + /** + * Remove link speed test records which have timed-out and record a 0 bits/s measurement + */ + inline void refresh_link_speed_records() + { + for(int i=0;i<link_test_records.size();i++) { + if(OSUtils::now() - link_test_records[i]->egress_time > ZT_LINK_TEST_TIMEOUT) { + PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(link_test_records[i]->s)); + if (sws) { + sws->throughput = 0; + } + link_test_records.erase(link_test_records.begin() + i); + } + } + } + + /** + * Upon receipt of a link speed test datagram we echo back only the identification portion + * + * @param s Socket object + * @param from Address of remote peer that sent this datagram + * @param data Buffer containing datagram's contents + * @param len Length of datagram + * @return Number of bytes successfully written to socket in response + */ + inline int respond_to_link_test(PhySocket *s,const struct sockaddr *from,void *data,unsigned long len) { + PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s)); + uint64_t *id = (uint64_t*)data; +#if defined(_WIN32) || defined(_WIN64) + return ((long)::sendto(sws->sock,reinterpret_cast<const char *>(data),len,0,from,(from->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in)) == (long)len); +#else + int w = ::sendto(sws->sock,id,sizeof(id[0]),0,from,(from->sa_family == AF_INET6) ? sizeof(struct sockaddr_in6) : sizeof(struct sockaddr_in)); +#endif + return w; + } + + /** + * Upon receipt of a response to our original link test datagram, correlate this new datagram with the record + * of the one we sent. Compute the transit time and update the throughput field of the relevant socket. This + * value will later be read by the path quality estimation logic located in Path.hpp. + * + * @param s Socket object + * @param from Address of remote peer that sent this datagram + * @param data Buffer containing datagram contents (ID of original link test datagram) + * @param len Length of datagram + * @return true if datagram correponded to previous record, false if otherwise + */ + inline bool handle_link_test_response(PhySocket *s,const struct sockaddr *from,void *data,unsigned long len) { + uint64_t *id = (uint64_t*)data; + for(int i=0;i<link_test_records.size();i++) { + if(link_test_records[i]->id == id[0]) { + float rtt = (OSUtils::now()-link_test_records[i]->egress_time) / (float)1000; // s + uint32_t sz = (link_test_records[i]->length) * 8; // bits + float transit_time = rtt / 2.0; + int64_t raw = sz / transit_time; + PhySocketImpl *sws = (reinterpret_cast<PhySocketImpl *>(s)); + if (sws) { + sws->throughput = raw; + } + delete link_test_records[i]; + link_test_records.erase(link_test_records.begin() + i); + return true; + } + } + return false; + } + + /** * Cause poll() to stop waiting immediately * * This can be used to reset the polling loop after changes that require @@ -985,7 +1178,7 @@ public: ZT_PHY_SOCKFD_TYPE sock = s->sock; // if closed, s->sock becomes invalid as s is no longer dereferencable if ((FD_ISSET(sock,&wfds))&&(FD_ISSET(sock,&_writefds))) { try { - _handler->phyOnUnixWritable((PhySocket *)&(*s),&(s->uptr),false); + _handler->phyOnUnixWritable((PhySocket *)&(*s),&(s->uptr)); } catch ( ... ) {} } if (FD_ISSET(sock,&rfds)) { diff --git a/service/OneService.cpp b/service/OneService.cpp index 80b42818..24456f71 100644 --- a/service/OneService.cpp +++ b/service/OneService.cpp @@ -37,6 +37,7 @@ #include "../version.h" #include "../include/ZeroTierOne.h" +#include "../include/ZeroTierDebug.h" #include "../node/Constants.hpp" #include "../node/Mutex.hpp" @@ -417,6 +418,7 @@ public: PhySocket *_localControlSocket6; bool _updateAutoApply; bool _allowTcpFallbackRelay; + unsigned int _multipathMode; unsigned int _primaryPort; volatile unsigned int _udpPortPickerCounter; @@ -455,6 +457,9 @@ public: // Last potential sleep/wake event uint64_t _lastRestart; + // Last time link throughput was tested + uint64_t _lastLinkSpeedTest; + // Deadline for the next background task service function volatile int64_t _nextBackgroundTaskDeadline; @@ -818,6 +823,7 @@ public: _lastRestart = clockShouldBe; int64_t lastTapMulticastGroupCheck = 0; int64_t lastBindRefresh = 0; + int64_t lastMultipathModeUpdate = 0; int64_t lastUpdateCheck = clockShouldBe; int64_t lastCleanedPeersDb = 0; int64_t lastLocalInterfaceAddressCheck = (clockShouldBe - ZT_LOCAL_INTERFACE_CHECK_INTERVAL) + 15000; // do this in 15s to give portmapper time to configure and other things time to settle @@ -849,8 +855,9 @@ public: _updater->apply(); } - // Refresh bindings in case device's interfaces have changed, and also sync routes to update any shadow routes (e.g. shadow default) - if (((now - lastBindRefresh) >= ZT_BINDER_REFRESH_PERIOD)||(restarted)) { + // Refresh bindings + int interfaceRefreshPeriod = _multipathMode ? ZT_MULTIPATH_BINDER_REFRESH_PERIOD : ZT_BINDER_REFRESH_PERIOD; + if (((now - lastBindRefresh) >= interfaceRefreshPeriod)||(restarted)) { lastBindRefresh = now; unsigned int p[3]; unsigned int pc = 0; @@ -867,6 +874,34 @@ public: } } } + // Update multipath mode (if needed) + if (((now - lastMultipathModeUpdate) >= interfaceRefreshPeriod)||(restarted)) { + lastMultipathModeUpdate = now; + _node->setMultipathMode(_multipathMode); + } + + // Test link speeds + // TODO: This logic should eventually find its way into the core or as part of a passive + // measure within the protocol. + if (_multipathMode && ((now - _lastLinkSpeedTest) >= ZT_LINK_SPEED_TEST_INTERVAL)) { + _phy.refresh_link_speed_records(); + _lastLinkSpeedTest = now; + // Generate random data to fill UDP packet + uint64_t pktBuf[ZT_LINK_TEST_DATAGRAM_SZ / sizeof(uint64_t)]; + Utils::getSecureRandom(pktBuf, ZT_LINK_TEST_DATAGRAM_SZ); + ZT_PeerList *pl = _node->peers(); + // get bindings (specifically just the sockets) + std::vector<PhySocket*> sockets = _binder.getBoundSockets(); + // interfaces + for (int i=0; i<ZT_BINDER_MAX_BINDINGS; i++) { + for(int j=0;j<pl->peerCount;++j) { + for (int k=0; k<(ZT_MAX_PEER_NETWORK_PATHS/4); k++) { + Utils::getSecureRandom(pktBuf, 8); // generate one random integer for unique id + _phy.test_link_speed(sockets[i], (struct sockaddr*)&(pl->peers[j].paths[k].address), pktBuf, ZT_LINK_TEST_DATAGRAM_SZ); + } + } + } + } // Run background task processor in core if it's time to do so int64_t dl = _nextBackgroundTaskDeadline; @@ -1190,6 +1225,7 @@ public: json &settings = res["config"]["settings"]; settings["primaryPort"] = OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff; settings["allowTcpFallbackRelay"] = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],_allowTcpFallbackRelay); + settings["multipathMode"] = OSUtils::jsonInt(settings["multipathMode"],_multipathMode); #ifdef ZT_USE_MINIUPNPC settings["portMappingEnabled"] = OSUtils::jsonBool(settings["portMappingEnabled"],true); #else @@ -1518,6 +1554,11 @@ public: _primaryPort = (unsigned int)OSUtils::jsonInt(settings["primaryPort"],(uint64_t)_primaryPort) & 0xffff; _allowTcpFallbackRelay = OSUtils::jsonBool(settings["allowTcpFallbackRelay"],true); + _multipathMode = OSUtils::jsonInt(settings["multipathMode"],0); + if (_multipathMode != 0 && _allowTcpFallbackRelay) { + fprintf(stderr,"WARNING: multipathMode cannot be used with allowTcpFallbackRelay. Disabling allowTcpFallbackRelay"); + _allowTcpFallbackRelay = false; + } _portMappingEnabled = OSUtils::jsonBool(settings["portMappingEnabled"],true); json &ignoreIfs = settings["interfacePrefixBlacklist"]; @@ -1758,6 +1799,15 @@ public: inline void phyOnDatagram(PhySocket *sock,void **uptr,const struct sockaddr *localAddr,const struct sockaddr *from,void *data,unsigned long len) { + if (_multipathMode) { + // Handle link test packets (should eventually be moved into the protocol itself) + if (len == ZT_LINK_TEST_DATAGRAM_SZ) { + _phy.respond_to_link_test(sock, from, data, len); + } + if (len == ZT_LINK_TEST_DATAGRAM_RESPONSE_SZ) { + _phy.handle_link_test_response(sock, from, data, len); + } + } if ((len >= 16)&&(reinterpret_cast<const InetAddress *>(from)->ipScope() == InetAddress::IP_SCOPE_GLOBAL)) _lastDirectReceiveFromGlobal = OSUtils::now(); const ZT_ResultCode rc = _node->processWirePacket( @@ -2007,7 +2057,7 @@ public: inline void phyOnUnixAccept(PhySocket *sockL,PhySocket *sockN,void **uptrL,void **uptrN) {} inline void phyOnUnixClose(PhySocket *sock,void **uptr) {} inline void phyOnUnixData(PhySocket *sock,void **uptr,void *data,unsigned long len) {} - inline void phyOnUnixWritable(PhySocket *sock,void **uptr,bool lwip_invoked) {} + inline void phyOnUnixWritable(PhySocket *sock,void **uptr) {} inline int nodeVirtualNetworkConfigFunction(uint64_t nwid,void **nuptr,enum ZT_VirtualNetworkConfigOperation op,const ZT_VirtualNetworkConfig *nwc) { |