summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--node/Identity.cpp9
-rw-r--r--node/IncomingPacket.cpp8
-rw-r--r--node/Node.cpp6
-rw-r--r--node/Packet.cpp12
-rw-r--r--node/Salsa20.cpp1206
-rw-r--r--node/Salsa20.hpp40
-rw-r--r--selftest.cpp42
7 files changed, 1138 insertions, 185 deletions
diff --git a/node/Identity.cpp b/node/Identity.cpp
index 8765da51..e5aaf13d 100644
--- a/node/Identity.cpp
+++ b/node/Identity.cpp
@@ -41,7 +41,6 @@
#define ZT_IDENTITY_GEN_HASHCASH_FIRST_BYTE_LESS_THAN 17
#define ZT_IDENTITY_GEN_MEMORY 2097152
-#define ZT_IDENTITY_GEN_SALSA20_ROUNDS 20
namespace ZeroTier {
@@ -55,8 +54,8 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
// ordinary Salsa20 is randomly seekable. This is good for a cipher
// but is not what we want for sequential memory-hardness.
memset(genmem,0,ZT_IDENTITY_GEN_MEMORY);
- Salsa20 s20(digest,256,(char *)digest + 32,ZT_IDENTITY_GEN_SALSA20_ROUNDS);
- s20.encrypt((char *)genmem,(char *)genmem,64);
+ Salsa20 s20(digest,256,(char *)digest + 32);
+ s20.encrypt20((char *)genmem,(char *)genmem,64);
for(unsigned long i=64;i<ZT_IDENTITY_GEN_MEMORY;i+=64) {
unsigned long k = i - 64;
*((uint64_t *)((char *)genmem + i)) = *((uint64_t *)((char *)genmem + k));
@@ -67,7 +66,7 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
*((uint64_t *)((char *)genmem + i + 40)) = *((uint64_t *)((char *)genmem + k + 40));
*((uint64_t *)((char *)genmem + i + 48)) = *((uint64_t *)((char *)genmem + k + 48));
*((uint64_t *)((char *)genmem + i + 56)) = *((uint64_t *)((char *)genmem + k + 56));
- s20.encrypt((char *)genmem + i,(char *)genmem + i,64);
+ s20.encrypt20((char *)genmem + i,(char *)genmem + i,64);
}
// Render final digest using genmem as a lookup table
@@ -77,7 +76,7 @@ static inline void _computeMemoryHardHash(const void *publicKey,unsigned int pub
uint64_t tmp = ((uint64_t *)genmem)[idx2];
((uint64_t *)genmem)[idx2] = ((uint64_t *)digest)[idx1];
((uint64_t *)digest)[idx1] = tmp;
- s20.encrypt(digest,digest,64);
+ s20.encrypt20(digest,digest,64);
}
}
diff --git a/node/IncomingPacket.cpp b/node/IncomingPacket.cpp
index 0aadc104..5d31a5d4 100644
--- a/node/IncomingPacket.cpp
+++ b/node/IncomingPacket.cpp
@@ -1149,9 +1149,9 @@ try_salsa2012sha512_again:
++*(reinterpret_cast<volatile uint64_t *>(candidate));
SHA512::hash(shabuf,candidate,16 + challengeLength);
- s20.init(shabuf,256,&s20iv,12);
+ s20.init(shabuf,256,&s20iv);
memset(salsabuf,0,sizeof(salsabuf));
- s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf));
+ s20.encrypt12(salsabuf,salsabuf,sizeof(salsabuf));
SHA512::hash(shabuf,salsabuf,sizeof(salsabuf));
d = difficulty;
@@ -1186,9 +1186,9 @@ bool IncomingPacket::testSalsa2012Sha512ProofOfWorkResult(unsigned int difficult
memcpy(candidate + 16,challenge,challengeLength);
SHA512::hash(shabuf,candidate,16 + challengeLength);
- s20.init(shabuf,256,&s20iv,12);
+ s20.init(shabuf,256,&s20iv);
memset(salsabuf,0,sizeof(salsabuf));
- s20.encrypt(salsabuf,salsabuf,sizeof(salsabuf));
+ s20.encrypt12(salsabuf,salsabuf,sizeof(salsabuf));
SHA512::hash(shabuf,salsabuf,sizeof(salsabuf));
d = difficulty;
diff --git a/node/Node.cpp b/node/Node.cpp
index 7f469b97..84452146 100644
--- a/node/Node.cpp
+++ b/node/Node.cpp
@@ -88,9 +88,9 @@ Node::Node(
{
char foo[32];
Utils::getSecureRandom(foo,32);
- _prng.init(foo,256,foo,8);
+ _prng.init(foo,256,foo);
memset(_prngStream,0,sizeof(_prngStream));
- _prng.encrypt(_prngStream,_prngStream,sizeof(_prngStream));
+ _prng.encrypt12(_prngStream,_prngStream,sizeof(_prngStream));
}
std::string idtmp(dataStoreGet("identity.secret"));
@@ -574,7 +574,7 @@ uint64_t Node::prng()
{
unsigned int p = (++_prngStreamPtr % (sizeof(_prngStream) / sizeof(uint64_t)));
if (!p)
- _prng.encrypt(_prngStream,_prngStream,sizeof(_prngStream));
+ _prng.encrypt12(_prngStream,_prngStream,sizeof(_prngStream));
return _prngStream[p];
}
diff --git a/node/Packet.cpp b/node/Packet.cpp
index 2fb7d488..f11ae1b8 100644
--- a/node/Packet.cpp
+++ b/node/Packet.cpp
@@ -92,14 +92,14 @@ void Packet::armor(const void *key,bool encryptPayload)
setCipher(encryptPayload ? ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012 : ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE);
_salsa20MangleKey((const unsigned char *)key,mangledKey);
- Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8),ZT_PROTO_SALSA20_ROUNDS);
+ Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8)/*,ZT_PROTO_SALSA20_ROUNDS*/);
// MAC key is always the first 32 bytes of the Salsa20 key stream
// This is the same construction DJB's NaCl library uses
- s20.encrypt(ZERO_KEY,macKey,sizeof(macKey));
+ s20.encrypt12(ZERO_KEY,macKey,sizeof(macKey));
if (encryptPayload)
- s20.encrypt(payload,payload,payloadLen);
+ s20.encrypt12(payload,payload,payloadLen);
Poly1305::compute(mac,payload,payloadLen,macKey);
memcpy(field(ZT_PACKET_IDX_MAC,8),mac,8);
@@ -116,15 +116,15 @@ bool Packet::dearmor(const void *key)
if ((cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_NONE)||(cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)) {
_salsa20MangleKey((const unsigned char *)key,mangledKey);
- Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8),ZT_PROTO_SALSA20_ROUNDS);
+ Salsa20 s20(mangledKey,256,field(ZT_PACKET_IDX_IV,8)/*,ZT_PROTO_SALSA20_ROUNDS*/);
- s20.encrypt(ZERO_KEY,macKey,sizeof(macKey));
+ s20.encrypt12(ZERO_KEY,macKey,sizeof(macKey));
Poly1305::compute(mac,payload,payloadLen,macKey);
if (!Utils::secureEq(mac,field(ZT_PACKET_IDX_MAC,8),8))
return false;
if (cs == ZT_PROTO_CIPHER_SUITE__C25519_POLY1305_SALSA2012)
- s20.decrypt(payload,payload,payloadLen);
+ s20.decrypt12(payload,payload,payloadLen);
return true;
} else return false; // unrecognized cipher suite
diff --git a/node/Salsa20.cpp b/node/Salsa20.cpp
index dec14faf..3aa19ac6 100644
--- a/node/Salsa20.cpp
+++ b/node/Salsa20.cpp
@@ -66,7 +66,7 @@ static const _s20sseconsts _S20SSECONSTANTS;
namespace ZeroTier {
-void Salsa20::init(const void *key,unsigned int kbits,const void *iv,unsigned int rounds)
+void Salsa20::init(const void *key,unsigned int kbits,const void *iv)
throw()
{
#ifdef ZT_SALSA20_SSE
@@ -121,11 +121,9 @@ void Salsa20::init(const void *key,unsigned int kbits,const void *iv,unsigned in
_state.i[15] = U8TO32_LITTLE(constants + 12);
_state.i[0] = U8TO32_LITTLE(constants + 0);
#endif
-
- _roundsDiv4 = rounds / 4;
}
-void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
+void Salsa20::encrypt12(const void *in,void *out,unsigned int bytes)
throw()
{
uint8_t tmp[64];
@@ -181,61 +179,149 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
__m128i X2s = X2;
__m128i X3s = X3;
- for (i=0;i<_roundsDiv4;++i) {
- T = _mm_add_epi32(X0, X3);
- X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
- T = _mm_add_epi32(X1, X0);
- X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
- T = _mm_add_epi32(X2, X1);
- X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
- T = _mm_add_epi32(X3, X2);
- X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
-
- X1 = _mm_shuffle_epi32(X1, 0x93);
- X2 = _mm_shuffle_epi32(X2, 0x4E);
- X3 = _mm_shuffle_epi32(X3, 0x39);
-
- T = _mm_add_epi32(X0, X1);
- X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
- T = _mm_add_epi32(X3, X0);
- X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
- T = _mm_add_epi32(X2, X3);
- X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
- T = _mm_add_epi32(X1, X2);
- X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
-
- X1 = _mm_shuffle_epi32(X1, 0x39);
- X2 = _mm_shuffle_epi32(X2, 0x4E);
- X3 = _mm_shuffle_epi32(X3, 0x93);
-
- // --
-
- T = _mm_add_epi32(X0, X3);
- X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
- T = _mm_add_epi32(X1, X0);
- X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
- T = _mm_add_epi32(X2, X1);
- X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
- T = _mm_add_epi32(X3, X2);
- X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
-
- X1 = _mm_shuffle_epi32(X1, 0x93);
- X2 = _mm_shuffle_epi32(X2, 0x4E);
- X3 = _mm_shuffle_epi32(X3, 0x39);
-
- T = _mm_add_epi32(X0, X1);
- X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
- T = _mm_add_epi32(X3, X0);
- X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
- T = _mm_add_epi32(X2, X3);
- X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
- T = _mm_add_epi32(X1, X2);
- X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
-
- X1 = _mm_shuffle_epi32(X1, 0x39);
- X2 = _mm_shuffle_epi32(X2, 0x4E);
- X3 = _mm_shuffle_epi32(X3, 0x93);
- }
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
X0 = _mm_add_epi32(X0s,X0);
X1 = _mm_add_epi32(X1s,X1);
@@ -273,76 +359,942 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
x14 = j14;
x15 = j15;
- for(i=0;i<_roundsDiv4;++i) {
- x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
- x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
- x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
- x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
- x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
- x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
- x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
- x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
- x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
- x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
- x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
- x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
- x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
- x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
- x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
- x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
- x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
- x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
- x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
- x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
- x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
- x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
- x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
- x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
- x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
- x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
- x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
- x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
- x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
- x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
- x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
- x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
-
- // --
-
- x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
- x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
- x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
- x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
- x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
- x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
- x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
- x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
- x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
- x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
- x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
- x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
- x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
- x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
- x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
- x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
- x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
- x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
- x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
- x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
- x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
- x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
- x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
- x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
- x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
- x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
- x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
- x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
- x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
- x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
- x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
- x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ x0 = PLUS(x0,j0);
+ x1 = PLUS(x1,j1);
+ x2 = PLUS(x2,j2);
+ x3 = PLUS(x3,j3);
+ x4 = PLUS(x4,j4);
+ x5 = PLUS(x5,j5);
+ x6 = PLUS(x6,j6);
+ x7 = PLUS(x7,j7);
+ x8 = PLUS(x8,j8);
+ x9 = PLUS(x9,j9);
+ x10 = PLUS(x10,j10);
+ x11 = PLUS(x11,j11);
+ x12 = PLUS(x12,j12);
+ x13 = PLUS(x13,j13);
+ x14 = PLUS(x14,j14);
+ x15 = PLUS(x15,j15);
+
+ U32TO8_LITTLE(c + 0,XOR(x0,U8TO32_LITTLE(m + 0)));
+ U32TO8_LITTLE(c + 4,XOR(x1,U8TO32_LITTLE(m + 4)));
+ U32TO8_LITTLE(c + 8,XOR(x2,U8TO32_LITTLE(m + 8)));
+ U32TO8_LITTLE(c + 12,XOR(x3,U8TO32_LITTLE(m + 12)));
+ U32TO8_LITTLE(c + 16,XOR(x4,U8TO32_LITTLE(m + 16)));
+ U32TO8_LITTLE(c + 20,XOR(x5,U8TO32_LITTLE(m + 20)));
+ U32TO8_LITTLE(c + 24,XOR(x6,U8TO32_LITTLE(m + 24)));
+ U32TO8_LITTLE(c + 28,XOR(x7,U8TO32_LITTLE(m + 28)));
+ U32TO8_LITTLE(c + 32,XOR(x8,U8TO32_LITTLE(m + 32)));
+ U32TO8_LITTLE(c + 36,XOR(x9,U8TO32_LITTLE(m + 36)));
+ U32TO8_LITTLE(c + 40,XOR(x10,U8TO32_LITTLE(m + 40)));
+ U32TO8_LITTLE(c + 44,XOR(x11,U8TO32_LITTLE(m + 44)));
+ U32TO8_LITTLE(c + 48,XOR(x12,U8TO32_LITTLE(m + 48)));
+ U32TO8_LITTLE(c + 52,XOR(x13,U8TO32_LITTLE(m + 52)));
+ U32TO8_LITTLE(c + 56,XOR(x14,U8TO32_LITTLE(m + 56)));
+ U32TO8_LITTLE(c + 60,XOR(x15,U8TO32_LITTLE(m + 60)));
+
+ if (!(++j8)) {
+ ++j9;
+ /* stopping at 2^70 bytes per nonce is user's responsibility */
+ }
+#endif
+
+ if (bytes <= 64) {
+ if (bytes < 64) {
+ for (i = 0;i < bytes;++i)
+ ctarget[i] = c[i];
+ }
+
+#ifndef ZT_SALSA20_SSE
+ _state.i[8] = j8;
+ _state.i[9] = j9;
+#endif
+
+ return;
+ }
+
+ bytes -= 64;
+ c += 64;
+ m += 64;
+ }
+}
+
+void Salsa20::encrypt20(const void *in,void *out,unsigned int bytes)
+ throw()
+{
+ uint8_t tmp[64];
+ const uint8_t *m = (const uint8_t *)in;
+ uint8_t *c = (uint8_t *)out;
+ uint8_t *ctarget = c;
+ unsigned int i;
+
+#ifndef ZT_SALSA20_SSE
+ uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
+ uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
+#endif
+
+ if (!bytes)
+ return;
+
+#ifndef ZT_SALSA20_SSE
+ j0 = _state.i[0];
+ j1 = _state.i[1];
+ j2 = _state.i[2];
+ j3 = _state.i[3];
+ j4 = _state.i[4];
+ j5 = _state.i[5];
+ j6 = _state.i[6];
+ j7 = _state.i[7];
+ j8 = _state.i[8];
+ j9 = _state.i[9];
+ j10 = _state.i[10];
+ j11 = _state.i[11];
+ j12 = _state.i[12];
+ j13 = _state.i[13];
+ j14 = _state.i[14];
+ j15 = _state.i[15];
+#endif
+
+ for (;;) {
+ if (bytes < 64) {
+ for (i = 0;i < bytes;++i)
+ tmp[i] = m[i];
+ m = tmp;
+ ctarget = c;
+ c = tmp;
}
+#ifdef ZT_SALSA20_SSE
+ __m128i X0 = _mm_loadu_si128((const __m128i *)&(_state.v[0]));
+ __m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
+ __m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
+ __m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
+ __m128i T;
+ __m128i X0s = X0;
+ __m128i X1s = X1;
+ __m128i X2s = X2;
+ __m128i X3s = X3;
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ // 2X round -------------------------------------------------------------
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X1, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X3, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x93);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x39);
+ T = _mm_add_epi32(X0, X1);
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X3, X0);
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
+ T = _mm_add_epi32(X2, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
+ T = _mm_add_epi32(X1, X2);
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
+ X1 = _mm_shuffle_epi32(X1, 0x39);
+ X2 = _mm_shuffle_epi32(X2, 0x4E);
+ X3 = _mm_shuffle_epi32(X3, 0x93);
+
+ X0 = _mm_add_epi32(X0s,X0);
+ X1 = _mm_add_epi32(X1s,X1);
+ X2 = _mm_add_epi32(X2s,X2);
+ X3 = _mm_add_epi32(X3s,X3);
+
+ __m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
+ __m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
+ __m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
+ __m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
+ _mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m))))));
+ _mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 4)))));
+ _mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 8)))));
+ _mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 12)))));
+
+ if (!(++_state.i[8])) {
+ ++_state.i[5]; // state reordered for SSE
+ /* stopping at 2^70 bytes per nonce is user's responsibility */
+ }
+#else
+ x0 = j0;
+ x1 = j1;
+ x2 = j2;
+ x3 = j3;
+ x4 = j4;
+ x5 = j5;
+ x6 = j6;
+ x7 = j7;
+ x8 = j8;
+ x9 = j9;
+ x10 = j10;
+ x11 = j11;
+ x12 = j12;
+ x13 = j13;
+ x14 = j14;
+ x15 = j15;
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
+ // 2X round -------------------------------------------------------------
+ x4 = XOR( x4,ROTATE(PLUS( x0,x12), 7));
+ x8 = XOR( x8,ROTATE(PLUS( x4, x0), 9));
+ x12 = XOR(x12,ROTATE(PLUS( x8, x4),13));
+ x0 = XOR( x0,ROTATE(PLUS(x12, x8),18));
+ x9 = XOR( x9,ROTATE(PLUS( x5, x1), 7));
+ x13 = XOR(x13,ROTATE(PLUS( x9, x5), 9));
+ x1 = XOR( x1,ROTATE(PLUS(x13, x9),13));
+ x5 = XOR( x5,ROTATE(PLUS( x1,x13),18));
+ x14 = XOR(x14,ROTATE(PLUS(x10, x6), 7));
+ x2 = XOR( x2,ROTATE(PLUS(x14,x10), 9));
+ x6 = XOR( x6,ROTATE(PLUS( x2,x14),13));
+ x10 = XOR(x10,ROTATE(PLUS( x6, x2),18));
+ x3 = XOR( x3,ROTATE(PLUS(x15,x11), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x3,x15), 9));
+ x11 = XOR(x11,ROTATE(PLUS( x7, x3),13));
+ x15 = XOR(x15,ROTATE(PLUS(x11, x7),18));
+ x1 = XOR( x1,ROTATE(PLUS( x0, x3), 7));
+ x2 = XOR( x2,ROTATE(PLUS( x1, x0), 9));
+ x3 = XOR( x3,ROTATE(PLUS( x2, x1),13));
+ x0 = XOR( x0,ROTATE(PLUS( x3, x2),18));
+ x6 = XOR( x6,ROTATE(PLUS( x5, x4), 7));
+ x7 = XOR( x7,ROTATE(PLUS( x6, x5), 9));
+ x4 = XOR( x4,ROTATE(PLUS( x7, x6),13));
+ x5 = XOR( x5,ROTATE(PLUS( x4, x7),18));
+ x11 = XOR(x11,ROTATE(PLUS(x10, x9), 7));
+ x8 = XOR( x8,ROTATE(PLUS(x11,x10), 9));
+ x9 = XOR( x9,ROTATE(PLUS( x8,x11),13));
+ x10 = XOR(x10,ROTATE(PLUS( x9, x8),18));
+ x12 = XOR(x12,ROTATE(PLUS(x15,x14), 7));
+ x13 = XOR(x13,ROTATE(PLUS(x12,x15), 9));
+ x14 = XOR(x14,ROTATE(PLUS(x13,x12),13));
+ x15 = XOR(x15,ROTATE(PLUS(x14,x13),18));
+
x0 = PLUS(x0,j0);
x1 = PLUS(x1,j1);
x2 = PLUS(x2,j2);
diff --git a/node/Salsa20.hpp b/node/Salsa20.hpp
index 84baf3da..a2082bea 100644
--- a/node/Salsa20.hpp
+++ b/node/Salsa20.hpp
@@ -35,12 +35,11 @@ public:
* @param key Key bits
* @param kbits Number of key bits: 128 or 256 (recommended)
* @param iv 64-bit initialization vector
- * @param rounds Number of rounds: 8, 12, or 20
*/
- Salsa20(const void *key,unsigned int kbits,const void *iv,unsigned int rounds)
+ Salsa20(const void *key,unsigned int kbits,const void *iv)
throw()
{
- init(key,kbits,iv,rounds);
+ init(key,kbits,iv);
}
/**
@@ -49,32 +48,54 @@ public:
* @param key Key bits
* @param kbits Number of key bits: 128 or 256 (recommended)
* @param iv 64-bit initialization vector
- * @param rounds Number of rounds: 8, 12, or 20
*/
- void init(const void *key,unsigned int kbits,const void *iv,unsigned int rounds)
+ void init(const void *key,unsigned int kbits,const void *iv)
throw();
/**
- * Encrypt data
+ * Encrypt data using Salsa20/12
*
* @param in Input data
* @param out Output buffer
* @param bytes Length of data
*/
- void encrypt(const void *in,void *out,unsigned int bytes)
+ void encrypt12(const void *in,void *out,unsigned int bytes)
throw();
/**
+ * Encrypt data using Salsa20/20
+ *
+ * @param in Input data
+ * @param out Output buffer
+ * @param bytes Length of data
+ */
+ void encrypt20(const void *in,void *out,unsigned int bytes)
+ throw();
+
+ /**
+ * Decrypt data using Salsa20/12
+ *
+ * @param in Input data
+ * @param out Output buffer
+ * @param bytes Length of data
+ */
+ inline void decrypt12(const void *in,void *out,unsigned int bytes)
+ throw()
+ {
+ encrypt12(in,out,bytes);
+ }
+
+ /**
* Decrypt data using Salsa20/20
*
* @param in Input data
* @param out Output buffer
* @param bytes Length of data
*/
- inline void decrypt(const void *in,void *out,unsigned int bytes)
+ inline void decrypt20(const void *in,void *out,unsigned int bytes)
throw()
{
- encrypt(in,out,bytes);
+ encrypt20(in,out,bytes);
}
private:
@@ -84,7 +105,6 @@ private:
#endif // ZT_SALSA20_SSE
uint32_t i[16];
} _state;
- unsigned int _roundsDiv4;
};
} // namespace ZeroTier
diff --git a/selftest.cpp b/selftest.cpp
index 090839ee..4ba76c0b 100644
--- a/selftest.cpp
+++ b/selftest.cpp
@@ -162,27 +162,27 @@ static int testCrypto()
memset(buf2,0,sizeof(buf2));
memset(buf3,0,sizeof(buf3));
Salsa20 s20;
- s20.init("12345678123456781234567812345678",256,"12345678",20);
- s20.encrypt(buf1,buf2,sizeof(buf1));
- s20.init("12345678123456781234567812345678",256,"12345678",20);
- s20.decrypt(buf2,buf3,sizeof(buf2));
+ s20.init("12345678123456781234567812345678",256,"12345678");
+ s20.encrypt20(buf1,buf2,sizeof(buf1));
+ s20.init("12345678123456781234567812345678",256,"12345678");
+ s20.decrypt20(buf2,buf3,sizeof(buf2));
if (memcmp(buf1,buf3,sizeof(buf1))) {
std::cout << "FAIL (encrypt/decrypt test)" << std::endl;
return -1;
}
}
- Salsa20 s20(s20TV0Key,256,s20TV0Iv,20);
+ Salsa20 s20(s20TV0Key,256,s20TV0Iv);
memset(buf1,0,sizeof(buf1));
memset(buf2,0,sizeof(buf2));
- s20.encrypt(buf1,buf2,64);
+ s20.encrypt20(buf1,buf2,64);
if (memcmp(buf2,s20TV0Ks,64)) {
std::cout << "FAIL (test vector 0)" << std::endl;
return -1;
}
- s20.init(s2012TV0Key,256,s2012TV0Iv,12);
+ s20.init(s2012TV0Key,256,s2012TV0Iv);
memset(buf1,0,sizeof(buf1));
memset(buf2,0,sizeof(buf2));
- s20.encrypt(buf1,buf2,64);
+ s20.encrypt12(buf1,buf2,64);
if (memcmp(buf2,s2012TV0Ks,64)) {
std::cout << "FAIL (test vector 1)" << std::endl;
return -1;
@@ -195,34 +195,16 @@ static int testCrypto()
std::cout << "[crypto] Salsa20 SSE: DISABLED" << std::endl;
#endif
- std::cout << "[crypto] Benchmarking Salsa20/8... "; std::cout.flush();
- {
- unsigned char *bb = (unsigned char *)::malloc(1234567);
- for(unsigned int i=0;i<1234567;++i)
- bb[i] = (unsigned char)i;
- Salsa20 s20(s20TV0Key,256,s20TV0Iv,8);
- double bytes = 0.0;
- uint64_t start = OSUtils::now();
- for(unsigned int i=0;i<200;++i) {
- s20.encrypt(bb,bb,1234567);
- bytes += 1234567.0;
- }
- uint64_t end = OSUtils::now();
- SHA512::hash(buf1,bb,1234567);
- std::cout << ((bytes / 1048576.0) / ((double)(end - start) / 1000.0)) << " MiB/second (" << Utils::hex(buf1,16) << ')' << std::endl;
- ::free((void *)bb);
- }
-
std::cout << "[crypto] Benchmarking Salsa20/12... "; std::cout.flush();
{
unsigned char *bb = (unsigned char *)::malloc(1234567);
for(unsigned int i=0;i<1234567;++i)
bb[i] = (unsigned char)i;
- Salsa20 s20(s20TV0Key,256,s20TV0Iv,12);
+ Salsa20 s20(s20TV0Key,256,s20TV0Iv);
double bytes = 0.0;
uint64_t start = OSUtils::now();
for(unsigned int i=0;i<200;++i) {
- s20.encrypt(bb,bb,1234567);
+ s20.encrypt12(bb,bb,1234567);
bytes += 1234567.0;
}
uint64_t end = OSUtils::now();
@@ -236,11 +218,11 @@ static int testCrypto()
unsigned char *bb = (unsigned char *)::malloc(1234567);
for(unsigned int i=0;i<1234567;++i)
bb[i] = (unsigned char)i;
- Salsa20 s20(s20TV0Key,256,s20TV0Iv,20);
+ Salsa20 s20(s20TV0Key,256,s20TV0Iv);
double bytes = 0.0;
uint64_t start = OSUtils::now();
for(unsigned int i=0;i<200;++i) {
- s20.encrypt(bb,bb,1234567);
+ s20.encrypt20(bb,bb,1234567);
bytes += 1234567.0;
}
uint64_t end = OSUtils::now();