From 0d0039674fefbfc14eb41f3bdbec7bf3e6dba9a5 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Wed, 30 Sep 2015 14:48:07 -0700 Subject: Add new verb names, and fix some Mac compiler flags. --- node/Packet.cpp | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'node/Packet.cpp') diff --git a/node/Packet.cpp b/node/Packet.cpp index 2c73a087..f69e4e79 100644 --- a/node/Packet.cpp +++ b/node/Packet.cpp @@ -31,6 +31,8 @@ namespace ZeroTier { const unsigned char Packet::ZERO_KEY[32] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; +#ifdef ZT_TRACE + const char *Packet::verbString(Verb v) throw() { @@ -52,6 +54,8 @@ const char *Packet::verbString(Verb v) case VERB_MULTICAST_FRAME: return "MULTICAST_FRAME"; case VERB_SET_EPHEMERAL_KEY: return "SET_EPHEMERAL_KEY"; case VERB_PUSH_DIRECT_PATHS: return "PUSH_DIRECT_PATHS"; + case VERB_CIRCUIT_TEST: return "CIRCUIT_TEST"; + case VERB_CIRCUIT_TEST_REPORT: return "CIRCUIT_TEST_REPORT"; } return "(unknown)"; } @@ -73,6 +77,8 @@ const char *Packet::errorString(ErrorCode e) return "(unknown)"; } +#endif // ZT_TRACE + void Packet::armor(const void *key,bool encryptPayload) { unsigned char mangledKey[32]; -- cgit v1.2.3 From e5f168f599ba053ee5e6029387dd7ad4b95a7d28 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Wed, 7 Oct 2015 13:35:46 -0700 Subject: Add proof of work request for future DDOS mitigation use. --- node/IncomingPacket.cpp | 121 ++++++++++++++++++++++++++++++++++++++++++++++++ node/IncomingPacket.hpp | 22 +++++++++ node/Packet.cpp | 2 +- node/Packet.hpp | 102 ++++++++++++++++++++++++++++++++-------- selftest.cpp | 14 ++++++ 5 files changed, 240 insertions(+), 21 deletions(-) (limited to 'node/Packet.cpp') diff --git a/node/IncomingPacket.cpp b/node/IncomingPacket.cpp index 19bea243..de901779 100644 --- a/node/IncomingPacket.cpp +++ b/node/IncomingPacket.cpp @@ -41,6 +41,8 @@ #include "Peer.hpp" #include "NetworkController.hpp" #include "SelfAwareness.hpp" +#include "Salsa20.hpp" +#include "SHA512.hpp" namespace ZeroTier { @@ -90,6 +92,7 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR) case Packet::VERB_PUSH_DIRECT_PATHS: return _doPUSH_DIRECT_PATHS(RR,peer); case Packet::VERB_CIRCUIT_TEST: return _doCIRCUIT_TEST(RR,peer); case Packet::VERB_CIRCUIT_TEST_REPORT: return _doCIRCUIT_TEST_REPORT(RR,peer); + case Packet::VERB_REQUEST_PROOF_OF_WORK: return _doREQUEST_PROOF_OF_WORK(RR,peer); } } else { RR->sw->requestWhois(sourceAddress); @@ -1042,6 +1045,124 @@ bool IncomingPacket::_doCIRCUIT_TEST_REPORT(const RuntimeEnvironment *RR,const S return true; } +bool IncomingPacket::_doREQUEST_PROOF_OF_WORK(const RuntimeEnvironment *RR,const SharedPtr &peer) +{ + try { + // Right now this is only allowed from root servers -- may be allowed from controllers and relays later. 
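+		// Payload layout (see VERB_REQUEST_PROOF_OF_WORK in Packet.hpp):
+		// <[1] 8-bit proof of work type> <[1] 8-bit difficulty>
+		// <[2] 16-bit challenge length> <[...] challenge>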
+ if (RR->topology->isRoot(peer->identity())) { + const unsigned int difficulty = (*this)[ZT_PACKET_IDX_PAYLOAD + 1]; + const unsigned int challengeLength = at(ZT_PACKET_IDX_PAYLOAD + 2); + if (challengeLength > ZT_PROTO_MAX_PACKET_LENGTH) + return true; // sanity check, drop invalid size + const unsigned char *challenge = field(ZT_PACKET_IDX_PAYLOAD + 4,challengeLength); + + switch((*this)[ZT_PACKET_IDX_PAYLOAD]) { + + // Salsa20/12+SHA512 hashcash + case 0x01: { + unsigned char result[16]; + computeSalsa2012Sha512ProofOfWork(difficulty,challenge,challengeLength,result); + + Packet outp(peer->address(),RR->identity.address(),Packet::VERB_OK); + outp.append((unsigned char)Packet::VERB_REQUEST_PROOF_OF_WORK); + outp.append(packetId()); + outp.append((uint16_t)sizeof(result)); + outp.append(result,sizeof(result)); + outp.armor(peer->key(),true); + RR->node->putPacket(_localAddress,_remoteAddress,outp.data(),outp.size()); + } break; + + default: + TRACE("dropped REQUEST_PROO_OF_WORK from %s(%s): unrecognized proof of work type",peer->address().toString().c_str(),_remoteAddress.toString().c_str()); + break; + } + } else { + TRACE("dropped REQUEST_PROO_OF_WORK from %s(%s): not trusted enough",peer->address().toString().c_str(),_remoteAddress.toString().c_str()); + } + } catch ( ... ) { + TRACE("dropped REQUEST_PROOF_OF_WORK from %s(%s): unexpected exception",peer->address().toString().c_str(),_remoteAddress.toString().c_str()); + } + return true; +} + +void IncomingPacket::computeSalsa2012Sha512ProofOfWork(unsigned int difficulty,const void *challenge,unsigned int challengeLength,unsigned char result[16]) +{ + unsigned char salsabuf[131072]; // 131072 == protocol constant, size of memory buffer for this proof of work function + char candidatebuf[ZT_PROTO_MAX_PACKET_LENGTH + 256]; + unsigned char shabuf[ZT_SHA512_DIGEST_LEN]; + const uint64_t s20iv = 0; // zero IV for Salsa20 + char *const candidate = (char *)(( ((uintptr_t)&(candidatebuf[0])) | 0xf ) + 1); // align to 16-byte boundary to ensure that uint64_t type punning of initial nonce is okay + Salsa20 s20; + unsigned int d; + unsigned char *p; + + Utils::getSecureRandom(candidate,16); + memcpy(candidate + 16,challenge,challengeLength); + + if (difficulty > 512) + difficulty = 512; // sanity check + +try_salsa2012sha512_again: + ++*(reinterpret_cast(candidate)); + + SHA512::hash(shabuf,candidate,16 + challengeLength); + s20.init(shabuf,256,&s20iv,12); + memset(salsabuf,0,sizeof(salsabuf)); + s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf)); + SHA512::hash(shabuf,salsabuf,sizeof(salsabuf)); + + d = difficulty; + p = shabuf; + while (d >= 8) { + if (*(p++)) + goto try_salsa2012sha512_again; + d -= 8; + } + if (d > 0) { + if ( ((((unsigned int)*p) << d) & 0xff00) != 0 ) + goto try_salsa2012sha512_again; + } + + memcpy(result,candidate,16); +} + +bool IncomingPacket::testSalsa2012Sha512ProofOfWorkResult(unsigned int difficulty,const void *challenge,unsigned int challengeLength,const unsigned char proposedResult[16]) +{ + unsigned char salsabuf[131072]; // 131072 == protocol constant, size of memory buffer for this proof of work function + char candidate[ZT_PROTO_MAX_PACKET_LENGTH + 256]; + unsigned char shabuf[ZT_SHA512_DIGEST_LEN]; + const uint64_t s20iv = 0; // zero IV for Salsa20 + Salsa20 s20; + unsigned int d; + unsigned char *p; + + if (difficulty > 512) + difficulty = 512; // sanity check + + memcpy(candidate,proposedResult,16); + memcpy(candidate + 16,challenge,challengeLength); + + SHA512::hash(shabuf,candidate,16 + challengeLength); + 
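+	// Recompute the composite hash over (proposed nonce + challenge): the SHA-512 just
+	// above, then a Salsa20/12 keystream over the 128 KiB zero buffer, then SHA-512
+	// again; the first [difficulty] bits of the final digest must be zero.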
s20.init(shabuf,256,&s20iv,12); + memset(salsabuf,0,sizeof(salsabuf)); + s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf)); + SHA512::hash(shabuf,salsabuf,sizeof(salsabuf)); + + d = difficulty; + p = shabuf; + while (d >= 8) { + if (*(p++)) + return false; + d -= 8; + } + if (d > 0) { + if ( ((((unsigned int)*p) << d) & 0xff00) != 0 ) + return false; + } + + return true; +} + void IncomingPacket::_sendErrorNeedCertificate(const RuntimeEnvironment *RR,const SharedPtr &peer,uint64_t nwid) { Packet outp(source(),RR->identity.address(),Packet::VERB_ERROR); diff --git a/node/IncomingPacket.hpp b/node/IncomingPacket.hpp index 06220c4b..fd7a06c0 100644 --- a/node/IncomingPacket.hpp +++ b/node/IncomingPacket.hpp @@ -107,6 +107,27 @@ public: */ inline uint64_t receiveTime() const throw() { return _receiveTime; } + /** + * Compute the Salsa20/12+SHA512 proof of work function + * + * @param difficulty Difficulty in bits (max: 64) + * @param challenge Challenge string + * @param challengeLength Length of challenge in bytes (max allowed: ZT_PROTO_MAX_PACKET_LENGTH) + * @param result Buffer to fill with 16-byte result + */ + static void computeSalsa2012Sha512ProofOfWork(unsigned int difficulty,const void *challenge,unsigned int challengeLength,unsigned char result[16]); + + /** + * Verify the result of Salsa20/12+SHA512 proof of work + * + * @param difficulty Difficulty in bits (max: 64) + * @param challenge Challenge bytes + * @param challengeLength Length of challenge in bytes (max allowed: ZT_PROTO_MAX_PACKET_LENGTH) + * @param proposedResult Result supplied by client + * @return True if result is valid + */ + static bool testSalsa2012Sha512ProofOfWorkResult(unsigned int difficulty,const void *challenge,unsigned int challengeLength,const unsigned char proposedResult[16]); + private: // These are called internally to handle packet contents once it has // been authenticated, decrypted, decompressed, and classified. 
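	// Usage sketch for the two helpers above (values borrowed from the
	// commented-out benchmark added to selftest.cpp in this same patch;
	// not part of the original header):
	//   unsigned char result[16];
	//   IncomingPacket::computeSalsa2012Sha512ProofOfWork(10,"foobarbaz",9,result);
	//   bool ok = IncomingPacket::testSalsa2012Sha512ProofOfWorkResult(10,"foobarbaz",9,result);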
@@ -126,6 +147,7 @@ private: bool _doPUSH_DIRECT_PATHS(const RuntimeEnvironment *RR,const SharedPtr &peer); bool _doCIRCUIT_TEST(const RuntimeEnvironment *RR,const SharedPtr &peer); bool _doCIRCUIT_TEST_REPORT(const RuntimeEnvironment *RR,const SharedPtr &peer); + bool _doREQUEST_PROOF_OF_WORK(const RuntimeEnvironment *RR,const SharedPtr &peer); // Send an ERROR_NEED_MEMBERSHIP_CERTIFICATE to a peer indicating that an updated cert is needed to communicate void _sendErrorNeedCertificate(const RuntimeEnvironment *RR,const SharedPtr &peer,uint64_t nwid); diff --git a/node/Packet.cpp b/node/Packet.cpp index f69e4e79..2d973dff 100644 --- a/node/Packet.cpp +++ b/node/Packet.cpp @@ -45,7 +45,6 @@ const char *Packet::verbString(Verb v) case VERB_RENDEZVOUS: return "RENDEZVOUS"; case VERB_FRAME: return "FRAME"; case VERB_EXT_FRAME: return "EXT_FRAME"; - case VERB_P5_MULTICAST_FRAME: return "P5_MULTICAST_FRAME"; case VERB_MULTICAST_LIKE: return "MULTICAST_LIKE"; case VERB_NETWORK_MEMBERSHIP_CERTIFICATE: return "NETWORK_MEMBERSHIP_CERTIFICATE"; case VERB_NETWORK_CONFIG_REQUEST: return "NETWORK_CONFIG_REQUEST"; @@ -56,6 +55,7 @@ const char *Packet::verbString(Verb v) case VERB_PUSH_DIRECT_PATHS: return "PUSH_DIRECT_PATHS"; case VERB_CIRCUIT_TEST: return "CIRCUIT_TEST"; case VERB_CIRCUIT_TEST_REPORT: return "CIRCUIT_TEST_REPORT"; + case VERB_REQUEST_PROOF_OF_WORK: return "REQUEST_PROOF_OF_WORK"; } return "(unknown)"; } diff --git a/node/Packet.hpp b/node/Packet.hpp index 1413db10..93b594e5 100644 --- a/node/Packet.hpp +++ b/node/Packet.hpp @@ -523,10 +523,13 @@ public: */ enum Verb /* Max value: 32 (5 bits) */ { - /* No operation, payload ignored, no reply */ + /** + * No operation (ignored, no reply) + */ VERB_NOP = 0, - /* Announcement of a node's existence: + /** + * Announcement of a node's existence: * <[1] protocol version> * <[1] software major version> * <[1] software minor version> @@ -564,7 +567,8 @@ public: */ VERB_HELLO = 1, - /* Error response: + /** + * Error response: * <[1] in-re verb> * <[8] in-re packet ID> * <[1] error code> @@ -572,14 +576,16 @@ public: */ VERB_ERROR = 2, - /* Success response: + /** + * Success response: * <[1] in-re verb> * <[8] in-re packet ID> * <[...] request-specific payload> */ VERB_OK = 3, - /* Query an identity by address: + /** + * Query an identity by address: * <[5] address to look up> * * OK response payload: @@ -590,7 +596,8 @@ public: */ VERB_WHOIS = 4, - /* Meet another node at a given protocol address: + /** + * Meet another node at a given protocol address: * <[1] flags (unused, currently 0)> * <[5] ZeroTier address of peer that might be found at this address> * <[2] 16-bit protocol address port> @@ -613,7 +620,8 @@ public: */ VERB_RENDEZVOUS = 5, - /* ZT-to-ZT unicast ethernet frame (shortened EXT_FRAME): + /** + * ZT-to-ZT unicast ethernet frame (shortened EXT_FRAME): * <[8] 64-bit network ID> * <[2] 16-bit ethertype> * <[...] ethernet payload> @@ -628,7 +636,8 @@ public: */ VERB_FRAME = 6, - /* Full Ethernet frame with MAC addressing and optional fields: + /** + * Full Ethernet frame with MAC addressing and optional fields: * <[8] 64-bit network ID> * <[1] flags> * [<[...] 
certificate of network membership>] @@ -652,9 +661,10 @@ public: VERB_EXT_FRAME = 7, /* DEPRECATED */ - VERB_P5_MULTICAST_FRAME = 8, + //VERB_P5_MULTICAST_FRAME = 8, - /* Announce interest in multicast group(s): + /** + * Announce interest in multicast group(s): * <[8] 64-bit network ID> * <[6] multicast Ethernet address> * <[4] multicast additional distinguishing information (ADI)> @@ -667,7 +677,8 @@ public: */ VERB_MULTICAST_LIKE = 9, - /* Network member certificate replication/push: + /** + * Network member certificate replication/push: * <[...] serialized certificate of membership> * [ ... additional certificates may follow ...] * @@ -678,7 +689,8 @@ public: */ VERB_NETWORK_MEMBERSHIP_CERTIFICATE = 10, - /* Network configuration request: + /** + * Network configuration request: * <[8] 64-bit network ID> * <[2] 16-bit length of request meta-data dictionary> * <[...] string-serialized request meta-data> @@ -713,7 +725,8 @@ public: */ VERB_NETWORK_CONFIG_REQUEST = 11, - /* Network configuration refresh request: + /** + * Network configuration refresh request: * <[...] array of 64-bit network IDs> * * This can be sent by the network controller to inform a node that it @@ -724,7 +737,8 @@ public: */ VERB_NETWORK_CONFIG_REFRESH = 12, - /* Request endpoints for multicast distribution: + /** + * Request endpoints for multicast distribution: * <[8] 64-bit network ID> * <[1] flags> * <[6] MAC address of multicast group being queried> @@ -762,7 +776,8 @@ public: */ VERB_MULTICAST_GATHER = 13, - /* Multicast frame: + /** + * Multicast frame: * <[8] 64-bit network ID> * <[1] flags> * [<[...] network certificate of membership>] @@ -803,7 +818,8 @@ public: */ VERB_MULTICAST_FRAME = 14, - /* Ephemeral (PFS) key push: (UNFINISHED, NOT IMPLEMENTED YET) + /** + * Ephemeral (PFS) key push: (UNFINISHED, NOT IMPLEMENTED YET) * <[2] flags (unused and reserved, must be 0)> * <[2] length of padding / extra field section> * <[...] padding / extra field section> @@ -859,7 +875,8 @@ public: */ VERB_SET_EPHEMERAL_KEY = 15, - /* Push of potential endpoints for direct communication: + /** + * Push of potential endpoints for direct communication: * <[2] 16-bit number of paths> * <[...] paths> * @@ -899,7 +916,8 @@ public: */ VERB_PUSH_DIRECT_PATHS = 16, - /* Source-routed circuit test message: + /** + * Source-routed circuit test message: * <[5] address of originator of circuit test> * <[2] 16-bit flags> * <[8] 64-bit timestamp> @@ -977,7 +995,8 @@ public: */ VERB_CIRCUIT_TEST = 17, - /* Circuit test hop report: + /** + * Circuit test hop report: * <[8] 64-bit timestamp (from original test)> * <[8] 64-bit test ID (from original test)> * <[8] 64-bit reporter timestamp (reporter's clock, 0 if unspec)> @@ -1010,7 +1029,50 @@ public: * If a test report is received and no circuit test was sent, it should be * ignored. This message generates no OK or ERROR response. */ - VERB_CIRCUIT_TEST_REPORT = 18 + VERB_CIRCUIT_TEST_REPORT = 18, + + /** + * Request proof of work: + * <[1] 8-bit proof of work type> + * <[1] 8-bit proof of work difficulty> + * <[2] 16-bit length of proof of work challenge> + * <[...] proof of work challenge> + * + * This requests that a peer perform a proof of work calucation. It can be + * sent by highly trusted peers (e.g. root servers, network controllers) + * under suspected denial of service conditions in an attempt to filter + * out "non-serious" peers and remain responsive to those proving their + * intent to actually communicate. 
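+	 *
+	 * (For scale: difficulty d asks for d leading zero bits in the final
+	 * SHA-512, so a compliant peer tries on the order of 2^d candidate
+	 * nonces; e.g. difficulty 16 means each candidate succeeds with
+	 * probability about 1 in 65536.)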
+ * + * If the peer obliges to perform the work, it does so and responds with + * an OK containing the result. Otherwise it may ignore the message or + * response with an ERROR_INVALID_REQUEST or ERROR_UNSUPPORTED_OPERATION. + * + * Proof of work type IDs: + * 0x01 - Salsa20/12+SHA512 hashcash function + * + * Salsa20/12+SHA512 is based on the following composite hash function: + * + * (1) Compute SHA512(candidate) + * (2) Use the first 256 bits of the result of #1 as a key to encrypt + * 131072 zero bytes with Salsa20/12 (with a zero IV). + * (3) Compute SHA512(the result of step #2) + * (4) Accept this candiate if the first [difficulty] bits of the result + * from step #3 are zero. Otherwise generate a new candidate and try + * again. + * + * This is performed repeatedly on candidates generated by appending the + * supplied challenge to an arbitrary nonce until a valid candidate + * is found. This chosen prepended nonce is then returned as the result + * in OK. + * + * OK payload: + * <[2] 16-bit length of result> + * <[...] computed proof of work> + * + * ERROR has no payload. + */ + VERB_REQUEST_PROOF_OF_WORK = 19 }; /** diff --git a/selftest.cpp b/selftest.cpp index e938cf4d..090839ee 100644 --- a/selftest.cpp +++ b/selftest.cpp @@ -52,6 +52,7 @@ #include "node/CertificateOfMembership.hpp" #include "node/Defaults.hpp" #include "node/Node.hpp" +#include "node/IncomingPacket.hpp" #include "osdep/OSUtils.hpp" #include "osdep/Phy.hpp" @@ -285,6 +286,19 @@ static int testCrypto() ::free((void *)bb); } + /* + for(unsigned int d=8;d<=10;++d) { + for(int k=0;k<8;++k) { + std::cout << "[crypto] computeSalsa2012Sha512ProofOfWork(" << d << ",\"foobarbaz\",9) == "; std::cout.flush(); + unsigned char result[16]; + uint64_t start = OSUtils::now(); + IncomingPacket::computeSalsa2012Sha512ProofOfWork(d,"foobarbaz",9,result); + uint64_t end = OSUtils::now(); + std::cout << Utils::hex(result,16) << " -- valid: " << IncomingPacket::testSalsa2012Sha512ProofOfWorkResult(d,"foobarbaz",9,result) << ", " << (end - start) << "ms" << std::endl; + } + } + */ + std::cout << "[crypto] Testing C25519 and Ed25519 against test vectors... "; std::cout.flush(); for(int k=0;k Date: Wed, 7 Oct 2015 16:11:50 -0700 Subject: Finally add an ECHO. --- node/IncomingPacket.cpp | 15 ++++++++++++++- node/IncomingPacket.hpp | 1 + node/Packet.cpp | 1 + node/Packet.hpp | 28 ++++++++++++++++------------ 4 files changed, 32 insertions(+), 13 deletions(-) (limited to 'node/Packet.cpp') diff --git a/node/IncomingPacket.cpp b/node/IncomingPacket.cpp index cfe5a6c3..083c47f8 100644 --- a/node/IncomingPacket.cpp +++ b/node/IncomingPacket.cpp @@ -83,6 +83,7 @@ bool IncomingPacket::tryDecode(const RuntimeEnvironment *RR) case Packet::VERB_RENDEZVOUS: return _doRENDEZVOUS(RR,peer); case Packet::VERB_FRAME: return _doFRAME(RR,peer); case Packet::VERB_EXT_FRAME: return _doEXT_FRAME(RR,peer); + case Packet::VERB_ECHO: return _doECHO(RR,peer); case Packet::VERB_MULTICAST_LIKE: return _doMULTICAST_LIKE(RR,peer); case Packet::VERB_NETWORK_MEMBERSHIP_CERTIFICATE: return _doNETWORK_MEMBERSHIP_CERTIFICATE(RR,peer); case Packet::VERB_NETWORK_CONFIG_REQUEST: return _doNETWORK_CONFIG_REQUEST(RR,peer); @@ -569,6 +570,18 @@ bool IncomingPacket::_doEXT_FRAME(const RuntimeEnvironment *RR,const SharedPtr
<Peer> &peer) +bool IncomingPacket::_doECHO(const RuntimeEnvironment *RR,const SharedPtr<Peer>
&peer) +{ + try { + Packet outp(peer->address(),RR->identity.address(),Packet::VERB_OK); + outp.append((unsigned char)Packet::VERB_ECHO); + outp.append(packetId()); + outp.append(field(ZT_PACKET_IDX_PAYLOAD,size() - ZT_PACKET_IDX_PAYLOAD),size() - ZT_PACKET_IDX_PAYLOAD); + RR->sw->send(outp,true,0); + } catch ( ... ) {} + return true; +} + bool IncomingPacket::_doMULTICAST_LIKE(const RuntimeEnvironment *RR,const SharedPtr &peer) { try { @@ -636,7 +649,7 @@ bool IncomingPacket::_doNETWORK_CONFIG_REQUEST(const RuntimeEnvironment *RR,cons outp.append(netconfStr.data(),(unsigned int)netconfStr.length()); outp.compress(); outp.armor(peer->key(),true); - if (outp.size() > ZT_PROTO_MAX_PACKET_LENGTH) { + if (outp.size() > ZT_PROTO_MAX_PACKET_LENGTH) { // sanity check TRACE("NETWORK_CONFIG_REQUEST failed: internal error: netconf size %u is too large",(unsigned int)netconfStr.length()); } else { RR->node->putPacket(_localAddress,_remoteAddress,outp.data(),outp.size()); diff --git a/node/IncomingPacket.hpp b/node/IncomingPacket.hpp index fd7a06c0..f5dd4b27 100644 --- a/node/IncomingPacket.hpp +++ b/node/IncomingPacket.hpp @@ -138,6 +138,7 @@ private: bool _doRENDEZVOUS(const RuntimeEnvironment *RR,const SharedPtr &peer); bool _doFRAME(const RuntimeEnvironment *RR,const SharedPtr &peer); bool _doEXT_FRAME(const RuntimeEnvironment *RR,const SharedPtr &peer); + bool _doECHO(const RuntimeEnvironment *RR,const SharedPtr &peer); bool _doMULTICAST_LIKE(const RuntimeEnvironment *RR,const SharedPtr &peer); bool _doNETWORK_MEMBERSHIP_CERTIFICATE(const RuntimeEnvironment *RR,const SharedPtr &peer); bool _doNETWORK_CONFIG_REQUEST(const RuntimeEnvironment *RR,const SharedPtr &peer); diff --git a/node/Packet.cpp b/node/Packet.cpp index 2d973dff..2fb7d488 100644 --- a/node/Packet.cpp +++ b/node/Packet.cpp @@ -45,6 +45,7 @@ const char *Packet::verbString(Verb v) case VERB_RENDEZVOUS: return "RENDEZVOUS"; case VERB_FRAME: return "FRAME"; case VERB_EXT_FRAME: return "EXT_FRAME"; + case VERB_ECHO: return "ECHO"; case VERB_MULTICAST_LIKE: return "MULTICAST_LIKE"; case VERB_NETWORK_MEMBERSHIP_CERTIFICATE: return "NETWORK_MEMBERSHIP_CERTIFICATE"; case VERB_NETWORK_CONFIG_REQUEST: return "NETWORK_CONFIG_REQUEST"; diff --git a/node/Packet.hpp b/node/Packet.hpp index 93b594e5..01aadad0 100644 --- a/node/Packet.hpp +++ b/node/Packet.hpp @@ -46,22 +46,20 @@ #include "../ext/lz4/lz4.h" /** - * Protocol version -- incremented only for MAJOR changes + * Protocol version -- incremented only for major changes * * 1 - 0.2.0 ... 0.2.5 * 2 - 0.3.0 ... 0.4.5 - * * Added signature and originating peer to multicast frame - * * Double size of multicast frame bloom filter + * + Added signature and originating peer to multicast frame + * + Double size of multicast frame bloom filter * 3 - 0.5.0 ... 0.6.0 - * * Yet another multicast redesign - * * New crypto completely changes key agreement cipher + * + Yet another multicast redesign + * + New crypto completely changes key agreement cipher * 4 - 0.6.0 ... 1.0.6 - * * New identity format based on hashcash design + * + New identity format based on hashcash design * 5 - 1.0.6 ... CURRENT - * * Supports CIRCUIT_TEST and friends, otherwise compatibie w/v4 - * - * This isn't going to change again for a long time unless your - * author wakes up again at 4am with another great idea. 
:P + * + Supports circuit test, proof of work, and echo + * + Otherwise backward compatible with 4 */ #define ZT_PROTO_VERSION 5 @@ -660,8 +658,14 @@ public: */ VERB_EXT_FRAME = 7, - /* DEPRECATED */ - //VERB_P5_MULTICAST_FRAME = 8, + /** + * ECHO request (a.k.a. ping): + * <[...] arbitrary payload to be echoed back> + * + * This generates OK with a copy of the transmitted payload. No ERROR + * is generated. Response to ECHO requests is optional. + */ + VERB_ECHO = 8, /** * Announce interest in multicast group(s): -- cgit v1.2.3 From 0c498556d5b11c101d2b18cf85cff2d53aa97d58 Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Fri, 9 Oct 2015 09:39:27 -0700 Subject: Unroll Salsa20 fully for a little more speed (non-SSE now almost as fast as SSE) --- node/Identity.cpp | 9 +- node/IncomingPacket.cpp | 8 +- node/Node.cpp | 6 +- node/Packet.cpp | 12 +- node/Salsa20.cpp | 1206 ++++++++++++++++++++++++++++++++++++++++++----- node/Salsa20.hpp | 40 +- selftest.cpp | 42 +- 7 files changed, 1138 insertions(+), 185 deletions(-) (limited to 'node/Packet.cpp') diff --git a/node/Identity.cpp b/node/Identity.cpp index 8765da51..e5aaf13d 100644 --- a/node/Identity.cpp +++ b/node/Identity.cpp @@ -41,7 +41,6 @@ #define ZT_IDENTITY_GEN_HASHCASH_FIRST_BYTE_LESS_THAN 17 #define ZT_IDENTITY_GEN_MEMORY 2097152 -#define ZT_IDENTITY_GEN_SALSA20_ROUNDS 20 namespace ZeroTier { @@ -55,8 +54,8 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub // ordinary Salsa20 is randomly seekable. This is good for a cipher // but is not what we want for sequential memory-harndess. memset(genmem,0,ZT_IDENTITY_GEN_MEMORY); - Salsa20 s20(digest,256,(char *)digest + 32,ZT_IDENTITY_GEN_SALSA20_ROUNDS); - s20.encrypt((char *)genmem,(char *)genmem,64); + Salsa20 s20(digest,256,(char *)digest + 32); + s20.encrypt20((char *)genmem,(char *)genmem,64); for(unsigned long i=64;i(candidate)); SHA512::hash(shabuf,candidate,16 + challengeLength); - s20.init(shabuf,256,&s20iv,12); + s20.init(shabuf,256,&s20iv); memset(salsabuf,0,sizeof(salsabuf)); - s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf)); + s20.encrypt12(salsabuf,salsabuf,sizeof(salsabuf)); SHA512::hash(shabuf,salsabuf,sizeof(salsabuf)); d = difficulty; @@ -1186,9 +1186,9 @@ bool IncomingPacket::testSalsa2012Sha512ProofOfWorkResult(unsigned int difficult memcpy(candidate + 16,challenge,challengeLength); SHA512::hash(shabuf,candidate,16 + challengeLength); - s20.init(shabuf,256,&s20iv,12); + s20.init(shabuf,256,&s20iv); memset(salsabuf,0,sizeof(salsabuf)); - s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf)); + s20.encrypt12(salsabuf,salsabuf,sizeof(salsabuf)); SHA512::hash(shabuf,salsabuf,sizeof(salsabuf)); d = difficulty; diff --git a/node/Node.cpp b/node/Node.cpp index 7f469b97..84452146 100644 --- a/node/Node.cpp +++ b/node/Node.cpp @@ -88,9 +88,9 @@ Node::Node( { char foo[32]; Utils::getSecureRandom(foo,32); - _prng.init(foo,256,foo,8); + _prng.init(foo,256,foo); memset(_prngStream,0,sizeof(_prngStream)); - _prng.encrypt(_prngStream,_prngStream,sizeof(_prngStream)); + _prng.encrypt12(_prngStream,_prngStream,sizeof(_prngStream)); } std::string idtmp(dataStoreGet("identity.secret")); @@ -574,7 +574,7 @@ uint64_t Node::prng() { unsigned int p = (++_prngStreamPtr % (sizeof(_prngStream) / sizeof(uint64_t))); if (!p) - _prng.encrypt(_prngStream,_prngStream,sizeof(_prngStream)); + _prng.encrypt12(_prngStream,_prngStream,sizeof(_prngStream)); return _prngStream[p]; } diff --git a/node/Packet.cpp b/node/Packet.cpp index 2fb7d488..f11ae1b8 100644 --- 
a/node/Packet.cpp +++ b/node/Packet.cpp @@ -92,14 +92,14 @@ void Packet::armor(const void *key,bool encryptPayload) setCipher(encryptPayload ? ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012 : ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE); _salsa20MangleKey((const unsigned char *)key,mangledKey); - Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8),ZT_PROTO_SALSA20_ROUNDS); + Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8)/*,ZT_PROTO_SALSA20_ROUNDS*/); // MAC key is always the first 32 bytes of the Salsa20 key stream // This is the same construction DJB's NaCl library uses - s20.encrypt(ZERO_KEY,macKey,sizeof(macKey)); + s20.encrypt12(ZERO_KEY,macKey,sizeof(macKey)); if (encryptPayload) - s20.encrypt(payload,payload,payloadLen); + s20.encrypt12(payload,payload,payloadLen); Poly1305::compute(mac,payload,payloadLen,macKey); memcpy(field(ZT_PACKET_IDX_MAC,8),mac,8); @@ -116,15 +116,15 @@ bool Packet::dearmor(const void *key) if ((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)) { _salsa20MangleKey((const unsigned char *)key,mangledKey); - Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8),ZT_PROTO_SALSA20_ROUNDS); + Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8)/*,ZT_PROTO_SALSA20_ROUNDS*/); - s20.encrypt(ZERO_KEY,macKey,sizeof(macKey)); + s20.encrypt12(ZERO_KEY,macKey,sizeof(macKey)); Poly1305::compute(mac,payload,payloadLen,macKey); if (!Utils::secureEq(mac,field(ZT_PACKET_IDX_MAC,8),8)) return false; if (cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012) - s20.decrypt(payload,payload,payloadLen); + s20.decrypt12(payload,payload,payloadLen); return true; } else return false; // unrecognized cipher suite diff --git a/node/Salsa20.cpp b/node/Salsa20.cpp index dec14faf..3aa19ac6 100644 --- a/node/Salsa20.cpp +++ b/node/Salsa20.cpp @@ -66,7 +66,7 @@ static const _s20sseconsts _S20SSECONSTANTS; namespace ZeroTier { -void Salsa20::init(const void *key,unsigned int kbits,const void *iv,unsigned int rounds) +void Salsa20::init(const void *key,unsigned int kbits,const void *iv) throw() { #ifdef ZT_SALSA20_SSE @@ -121,11 +121,9 @@ void Salsa20::init(const void *key,unsigned int kbits,const void *iv,unsigned in _state.i[15] = U8TO32_LITTLE(constants + 12); _state.i[0] = U8TO32_LITTLE(constants + 0); #endif - - _roundsDiv4 = rounds / 4; } -void Salsa20::encrypt(const void *in,void *out,unsigned int bytes) +void Salsa20::encrypt12(const void *in,void *out,unsigned int bytes) throw() { uint8_t tmp[64]; @@ -181,61 +179,149 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes) __m128i X2s = X2; __m128i X3s = X3; - for (i=0;i<_roundsDiv4;++i) { - T = _mm_add_epi32(X0, X3); - X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X1, X0); - X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X1); - X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X3, X2); - X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); - - X1 = _mm_shuffle_epi32(X1, 0x93); - X2 = _mm_shuffle_epi32(X2, 0x4E); - X3 = _mm_shuffle_epi32(X3, 0x39); - - T = _mm_add_epi32(X0, X1); - X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X3, X0); - X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X3); - X1 = _mm_xor_si128(_mm_xor_si128(X1, 
_mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X1, X2); - X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); - - X1 = _mm_shuffle_epi32(X1, 0x39); - X2 = _mm_shuffle_epi32(X2, 0x4E); - X3 = _mm_shuffle_epi32(X3, 0x93); - - // -- - - T = _mm_add_epi32(X0, X3); - X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X1, X0); - X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X1); - X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X3, X2); - X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); - - X1 = _mm_shuffle_epi32(X1, 0x93); - X2 = _mm_shuffle_epi32(X2, 0x4E); - X3 = _mm_shuffle_epi32(X3, 0x39); - - T = _mm_add_epi32(X0, X1); - X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); - T = _mm_add_epi32(X3, X0); - X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); - T = _mm_add_epi32(X2, X3); - X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); - T = _mm_add_epi32(X1, X2); - X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); - - X1 = _mm_shuffle_epi32(X1, 0x39); - X2 = _mm_shuffle_epi32(X2, 0x4E); - X3 = _mm_shuffle_epi32(X3, 0x93); - } + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, 
X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = 
_mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); X0 = _mm_add_epi32(X0s,X0); X1 = _mm_add_epi32(X1s,X1); @@ -273,76 +359,942 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes) x14 = j14; x15 = j15; - for(i=0;i<_roundsDiv4;++i) { - x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); - x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); - x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); - x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); - x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); - x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); - x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); - x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); - x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); - x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); - x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); - x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); - x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); - x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); - x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); - x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); - x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); - x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); - x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); - x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); - x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); - x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); - x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); - x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); - x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); - x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); - x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); - x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); - x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); - x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); - x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); - x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); - - // -- - - x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); - x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); - x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); - x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); - x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); - x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); - x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); - x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); - x14 = 
XOR(x14,ROTATE(PLUS(x10, x6), 7)); - x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); - x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); - x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); - x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); - x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); - x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); - x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); - x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); - x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); - x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); - x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); - x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); - x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); - x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); - x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); - x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); - x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); - x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); - x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); - x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); - x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); - x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); - x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = 
XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( 
x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + x0 = PLUS(x0,j0); + x1 = PLUS(x1,j1); + x2 = PLUS(x2,j2); + x3 = PLUS(x3,j3); + x4 = PLUS(x4,j4); + x5 = PLUS(x5,j5); + x6 = PLUS(x6,j6); + x7 = PLUS(x7,j7); + x8 = PLUS(x8,j8); + x9 = PLUS(x9,j9); + x10 = PLUS(x10,j10); + x11 = PLUS(x11,j11); + x12 = PLUS(x12,j12); + x13 = PLUS(x13,j13); + x14 = PLUS(x14,j14); + x15 = PLUS(x15,j15); + + U32TO8_LITTLE(c + 0,XOR(x0,U8TO32_LITTLE(m + 0))); + U32TO8_LITTLE(c + 4,XOR(x1,U8TO32_LITTLE(m + 4))); + U32TO8_LITTLE(c + 8,XOR(x2,U8TO32_LITTLE(m + 8))); + U32TO8_LITTLE(c + 12,XOR(x3,U8TO32_LITTLE(m + 12))); + U32TO8_LITTLE(c + 16,XOR(x4,U8TO32_LITTLE(m + 16))); + U32TO8_LITTLE(c + 20,XOR(x5,U8TO32_LITTLE(m + 20))); + U32TO8_LITTLE(c + 24,XOR(x6,U8TO32_LITTLE(m + 24))); + U32TO8_LITTLE(c + 28,XOR(x7,U8TO32_LITTLE(m + 28))); + U32TO8_LITTLE(c + 32,XOR(x8,U8TO32_LITTLE(m + 32))); + U32TO8_LITTLE(c + 36,XOR(x9,U8TO32_LITTLE(m + 36))); + U32TO8_LITTLE(c + 40,XOR(x10,U8TO32_LITTLE(m + 40))); + U32TO8_LITTLE(c + 44,XOR(x11,U8TO32_LITTLE(m + 44))); + U32TO8_LITTLE(c + 
48,XOR(x12,U8TO32_LITTLE(m + 48))); + U32TO8_LITTLE(c + 52,XOR(x13,U8TO32_LITTLE(m + 52))); + U32TO8_LITTLE(c + 56,XOR(x14,U8TO32_LITTLE(m + 56))); + U32TO8_LITTLE(c + 60,XOR(x15,U8TO32_LITTLE(m + 60))); + + if (!(++j8)) { + ++j9; + /* stopping at 2^70 bytes per nonce is user's responsibility */ + } +#endif + + if (bytes <= 64) { + if (bytes < 64) { + for (i = 0;i < bytes;++i) + ctarget[i] = c[i]; + } + +#ifndef ZT_SALSA20_SSE + _state.i[8] = j8; + _state.i[9] = j9; +#endif + + return; + } + + bytes -= 64; + c += 64; + m += 64; + } +} + +void Salsa20::encrypt20(const void *in,void *out,unsigned int bytes) + throw() +{ + uint8_t tmp[64]; + const uint8_t *m = (const uint8_t *)in; + uint8_t *c = (uint8_t *)out; + uint8_t *ctarget = c; + unsigned int i; + +#ifndef ZT_SALSA20_SSE + uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; + uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; +#endif + + if (!bytes) + return; + +#ifndef ZT_SALSA20_SSE + j0 = _state.i[0]; + j1 = _state.i[1]; + j2 = _state.i[2]; + j3 = _state.i[3]; + j4 = _state.i[4]; + j5 = _state.i[5]; + j6 = _state.i[6]; + j7 = _state.i[7]; + j8 = _state.i[8]; + j9 = _state.i[9]; + j10 = _state.i[10]; + j11 = _state.i[11]; + j12 = _state.i[12]; + j13 = _state.i[13]; + j14 = _state.i[14]; + j15 = _state.i[15]; +#endif + + for (;;) { + if (bytes < 64) { + for (i = 0;i < bytes;++i) + tmp[i] = m[i]; + m = tmp; + ctarget = c; + c = tmp; } +#ifdef ZT_SALSA20_SSE + __m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0])); + __m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1])); + __m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2])); + __m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3])); + __m128i T; + __m128i X0s = X0; + __m128i X1s = X1; + __m128i X2s = X2; + __m128i X3s = X3; + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, 
_mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, 
_mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, 
_mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + // 2X round ------------------------------------------------------------- + T = _mm_add_epi32(X0, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X1, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X3, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x93); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x39); + T = _mm_add_epi32(X0, X1); + X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25)); + T = _mm_add_epi32(X3, X0); + X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23)); + T = _mm_add_epi32(X2, X3); + X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19)); + T = _mm_add_epi32(X1, X2); + X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14)); + X1 = _mm_shuffle_epi32(X1, 0x39); + X2 = _mm_shuffle_epi32(X2, 0x4E); + X3 = _mm_shuffle_epi32(X3, 0x93); + + X0 = _mm_add_epi32(X0s,X0); + X1 = _mm_add_epi32(X1s,X1); + X2 = _mm_add_epi32(X2s,X2); + X3 = _mm_add_epi32(X3s,X3); + + __m128i k02 = 
_mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3)); + __m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3)); + __m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32)); + __m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32)); + _mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m)))))); + _mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 4))))); + _mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 8))))); + _mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 12))))); + + if (!(++_state.i[8])) { + ++_state.i[5]; // state reordered for SSE + /* stopping at 2^70 bytes per nonce is user's responsibility */ + } +#else + x0 = j0; + x1 = j1; + x2 = j2; + x3 = j3; + x4 = j4; + x5 = j5; + x6 = j6; + x7 = j7; + x8 = j8; + x9 = j9; + x10 = j10; + x11 = j11; + x12 = j12; + x13 = j13; + x14 = j14; + x15 = j15; + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 =
XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = 
XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( 
x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = 
XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + + // 2X round ------------------------------------------------------------- + x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7)); + x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9)); + x12 = XOR(x12,ROTATE(PLUS( x8, x4),13)); + x0 = XOR( x0,ROTATE(PLUS(x12, x8),18)); + x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7)); + x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9)); + x1 = XOR( x1,ROTATE(PLUS(x13, x9),13)); + x5 = XOR( x5,ROTATE(PLUS( x1,x13),18)); + x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7)); + x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9)); + x6 = XOR( x6,ROTATE(PLUS( x2,x14),13)); + x10 = XOR(x10,ROTATE(PLUS( x6, x2),18)); + x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7)); + x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9)); + x11 = XOR(x11,ROTATE(PLUS( x7, x3),13)); + x15 = XOR(x15,ROTATE(PLUS(x11, x7),18)); + x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7)); + x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9)); + x3 = XOR( x3,ROTATE(PLUS( x2, x1),13)); + x0 = XOR( x0,ROTATE(PLUS( x3, x2),18)); + x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7)); + x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9)); + x4 = XOR( x4,ROTATE(PLUS( x7, x6),13)); + x5 = XOR( x5,ROTATE(PLUS( x4, x7),18)); + x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7)); + x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9)); + x9 = XOR( x9,ROTATE(PLUS( x8,x11),13)); + x10 = XOR(x10,ROTATE(PLUS( x9, x8),18)); + x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7)); + x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9)); + x14 = XOR(x14,ROTATE(PLUS(x13,x12),13)); + x15 = XOR(x15,ROTATE(PLUS(x14,x13),18)); + x0 = PLUS(x0,j0); x1 = PLUS(x1,j1); x2 = PLUS(x2,j2); diff --git a/node/Salsa20.hpp b/node/Salsa20.hpp index 84baf3da..a2082bea 100644 --- a/node/Salsa20.hpp +++ b/node/Salsa20.hpp @@ -35,12 +35,11 @@ public: * @param key Key bits * @param kbits Number of key bits: 128 or 256 (recommended) * @param iv 64-bit initialization vector - * @param rounds Number of rounds: 8, 12, or 20 */ - Salsa20(const void *key,unsigned int kbits,const void *iv,unsigned int rounds) + Salsa20(const void *key,unsigned int kbits,const void *iv) throw() { - init(key,kbits,iv,rounds); + init(key,kbits,iv); } /** @@ -49,21 +48,43 @@ public: * @param key Key bits * @param kbits Number of key bits: 128 or 256 (recommended) * @param iv 64-bit initialization vector - * @param rounds Number of rounds: 8, 12, or 20 */ - void init(const void *key,unsigned int kbits,const void *iv,unsigned int rounds) + void init(const void *key,unsigned int kbits,const void *iv) throw(); /** - * Encrypt data + * Encrypt data using Salsa20/12 * * @param in Input data * @param out Output buffer * @param bytes Length of data */ - void encrypt(const void *in,void *out,unsigned int bytes) + void encrypt12(const void *in,void *out,unsigned int bytes) throw(); + /** + * Encrypt data using Salsa20/20 + * + * @param in Input data + * @param out Output buffer + * @param bytes Length of data + */ + void encrypt20(const void *in,void *out,unsigned int bytes) + throw(); + + /** + * Decrypt data + * + * @param in Input data + * @param out Output buffer + * @param bytes Length of data + */ + inline void decrypt12(const void *in,void *out,unsigned int bytes) + throw() + { + encrypt12(in,out,bytes); + } + /** * Decrypt data * @@ -71,10 +92,10 @@ public: * @param out Output buffer * @param bytes Length of data */ - inline void decrypt(const void *in,void *out,unsigned int bytes) + inline void decrypt20(const void *in,void *out,unsigned int bytes) throw() { - encrypt(in,out,bytes); + 
encrypt20(in,out,bytes); } private: @@ -84,7 +105,6 @@ private: #endif // ZT_SALSA20_SSE uint32_t i[16]; } _state; - unsigned int _roundsDiv4; }; } // namespace ZeroTier diff --git a/selftest.cpp b/selftest.cpp index 090839ee..4ba76c0b 100644 --- a/selftest.cpp +++ b/selftest.cpp @@ -162,27 +162,27 @@ static int testCrypto() memset(buf2,0,sizeof(buf2)); memset(buf3,0,sizeof(buf3)); Salsa20 s20; - s20.init("12345678123456781234567812345678",256,"12345678",20); - s20.encrypt(buf1,buf2,sizeof(buf1)); - s20.init("12345678123456781234567812345678",256,"12345678",20); - s20.decrypt(buf2,buf3,sizeof(buf2)); + s20.init("12345678123456781234567812345678",256,"12345678"); + s20.encrypt20(buf1,buf2,sizeof(buf1)); + s20.init("12345678123456781234567812345678",256,"12345678"); + s20.decrypt20(buf2,buf3,sizeof(buf2)); if (memcmp(buf1,buf3,sizeof(buf1))) { std::cout << "FAIL (encrypt/decrypt test)" << std::endl; return -1; } } - Salsa20 s20(s20TV0Key,256,s20TV0Iv,20); + Salsa20 s20(s20TV0Key,256,s20TV0Iv); memset(buf1,0,sizeof(buf1)); memset(buf2,0,sizeof(buf2)); - s20.encrypt(buf1,buf2,64); + s20.encrypt20(buf1,buf2,64); if (memcmp(buf2,s20TV0Ks,64)) { std::cout << "FAIL (test vector 0)" << std::endl; return -1; } - s20.init(s2012TV0Key,256,s2012TV0Iv,12); + s20.init(s2012TV0Key,256,s2012TV0Iv); memset(buf1,0,sizeof(buf1)); memset(buf2,0,sizeof(buf2)); - s20.encrypt(buf1,buf2,64); + s20.encrypt12(buf1,buf2,64); if (memcmp(buf2,s2012TV0Ks,64)) { std::cout << "FAIL (test vector 1)" << std::endl; return -1; @@ -195,34 +195,16 @@ static int testCrypto() std::cout << "[crypto] Salsa20 SSE: DISABLED" << std::endl; #endif - std::cout << "[crypto] Benchmarking Salsa20/8... "; std::cout.flush(); - { - unsigned char *bb = (unsigned char *)::malloc(1234567); - for(unsigned int i=0;i<1234567;++i) - bb[i] = (unsigned char)i; - Salsa20 s20(s20TV0Key,256,s20TV0Iv,8); - double bytes = 0.0; - uint64_t start = OSUtils::now(); - for(unsigned int i=0;i<200;++i) { - s20.encrypt(bb,bb,1234567); - bytes += 1234567.0; - } - uint64_t end = OSUtils::now(); - SHA512::hash(buf1,bb,1234567); - std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (" << Utils::hex(buf1,16) << ')' << std::endl; - ::free((void *)bb); - } - std::cout << "[crypto] Benchmarking Salsa20/12... "; std::cout.flush(); { unsigned char *bb = (unsigned char *)::malloc(1234567); for(unsigned int i=0;i<1234567;++i) bb[i] = (unsigned char)i; - Salsa20 s20(s20TV0Key,256,s20TV0Iv,12); + Salsa20 s20(s20TV0Key,256,s20TV0Iv); double bytes = 0.0; uint64_t start = OSUtils::now(); for(unsigned int i=0;i<200;++i) { - s20.encrypt(bb,bb,1234567); + s20.encrypt12(bb,bb,1234567); bytes += 1234567.0; } uint64_t end = OSUtils::now(); @@ -236,11 +218,11 @@ static int testCrypto() unsigned char *bb = (unsigned char *)::malloc(1234567); for(unsigned int i=0;i<1234567;++i) bb[i] = (unsigned char)i; - Salsa20 s20(s20TV0Key,256,s20TV0Iv,20); + Salsa20 s20(s20TV0Key,256,s20TV0Iv); double bytes = 0.0; uint64_t start = OSUtils::now(); for(unsigned int i=0;i<200;++i) { - s20.encrypt(bb,bb,1234567); + s20.encrypt20(bb,bb,1234567); bytes += 1234567.0; } uint64_t end = OSUtils::now(); -- cgit v1.2.3
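Editor's note: the encrypt20() body above (both the SSE path and the portable path) is a fully unrolled sequence of ten Salsa20 double rounds, and encrypt12() runs the same construction for six. As a readability aid only, here is a compact C++ sketch of what each "2X round" block computes; the helper names (ROTL, quarterRound, doubleRound) are mine and this is not the shipped implementation, which stays unrolled for speed.

#include <stdint.h>

static inline uint32_t ROTL(uint32_t v,unsigned int c) { return (v << c) | (v >> (32 - c)); }

static inline void quarterRound(uint32_t &a,uint32_t &b,uint32_t &c,uint32_t &d)
{
	b ^= ROTL(a + d, 7);
	c ^= ROTL(b + a, 9);
	d ^= ROTL(c + b,13);
	a ^= ROTL(d + c,18);
}

// One "2X round" == one column round followed by one row round on the 4x4 word state.
// Salsa20/12 applies 6 of these per 64-byte block, Salsa20/20 applies 10.
static inline void doubleRound(uint32_t x[16])
{
	quarterRound(x[0], x[4], x[8], x[12]);   // column rounds
	quarterRound(x[5], x[9], x[13],x[1]);
	quarterRound(x[10],x[14],x[2], x[6]);
	quarterRound(x[15],x[3], x[7], x[11]);
	quarterRound(x[0], x[1], x[2], x[3]);    // row rounds
	quarterRound(x[5], x[6], x[7], x[4]);
	quarterRound(x[10],x[11],x[8], x[9]);
	quarterRound(x[15],x[12],x[13],x[14]);
}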
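Editor's note: after the Salsa20.hpp and selftest.cpp changes in this commit, the rounds parameter is gone from the constructor and init(), and each call site selects the variant explicitly via encrypt12()/decrypt12() or encrypt20()/decrypt20() (the per-object _roundsDiv4 state is removed). A minimal usage sketch follows; the key, IV, and buffer values mirror the selftest style and are illustrative only, and the include path is an assumption.

// Sketch only: assumes node/Salsa20.hpp as modified by this commit.
#include "node/Salsa20.hpp"
#include <cstring>
#include <cassert>

int main()
{
	unsigned char plain[64],cipher[64],check[64];
	std::memset(plain,0x42,sizeof(plain));

	// 256-bit key and 64-bit IV, passed as raw bytes (same values the selftest uses)
	ZeroTier::Salsa20 s20("12345678123456781234567812345678",256,"12345678");
	s20.encrypt12(plain,cipher,sizeof(plain));

	// Re-init with the same key/IV and decrypt; decrypt12() simply calls encrypt12()
	s20.init("12345678123456781234567812345678",256,"12345678");
	s20.decrypt12(cipher,check,sizeof(cipher));

	assert(std::memcmp(plain,check,sizeof(plain)) == 0);
	return 0;
}

Making the round count a property of the call rather than of the object matches how the updated selftest benchmarks Salsa20/12 and Salsa20/20 with a single unchanged constructor.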