author    Adam Ierymenko <adam.ierymenko@gmail.com>    2015-10-08 17:42:53 -0700
committer Adam Ierymenko <adam.ierymenko@gmail.com>    2015-10-08 17:42:53 -0700
commit    160278c489b8ec2f11235f839836f0f014990fda (patch)
tree      b3a4041104ca3b1bc658dc8536f35e6cfe90a6c9 /node
parent    59da8b2a4b3e36605886944f3fa111870bbb8a2c (diff)
Little bit of reorg in Salsa20 which seems to speed things up very slightly.
Diffstat (limited to 'node')
-rw-r--r--    node/Salsa20.cpp    75
1 file changed, 26 insertions, 49 deletions
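
The change below is mostly mechanical: each pair of XOR statements in the SSE quarter-round is collapsed into a single nested expression, the temporary T is hoisted out of the loop body, and the final keystream store block loses its local float-pointer aliases. As a standalone sketch (not part of the commit; the test values and variable names here are made up for illustration), the two expression shapes are interchangeable because both compute X ^= rotl32(T, r) lane-wise and XOR is associative:

// Sketch: confirm the old two-statement form and the new nested form of the
// rotate-and-XOR step produce identical results for the r = 7 case.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
	const uint32_t xv[4] = { 1, 2, 3, 4 };
	const uint32_t tv[4] = { 0x12345678, 0x9abcdef0, 0x0f0f0f0f, 0xf0f0f0f0 };
	const __m128i X = _mm_loadu_si128(reinterpret_cast<const __m128i *>(xv));
	const __m128i T = _mm_loadu_si128(reinterpret_cast<const __m128i *>(tv));

	// Old shape: two dependent statements.
	__m128i a = _mm_xor_si128(X, _mm_slli_epi32(T, 7));
	a = _mm_xor_si128(a, _mm_srli_epi32(T, 25));

	// New shape: one nested expression; same value since XOR is associative.
	const __m128i b = _mm_xor_si128(_mm_xor_si128(X, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));

	uint32_t ra[4], rb[4];
	_mm_storeu_si128(reinterpret_cast<__m128i *>(ra), a);
	_mm_storeu_si128(reinterpret_cast<__m128i *>(rb), b);
	printf("forms agree: %s\n", memcmp(ra, rb, sizeof ra) == 0 ? "yes" : "no");
	return 0;
}

Since the arithmetic is unchanged, any gain comes from how the compiler schedules and allocates registers for the rewritten expressions, which matches the commit message's "seems to speed things up very slightly."
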
diff --git a/node/Salsa20.cpp b/node/Salsa20.cpp
index f8cf8591..dec14faf 100644
--- a/node/Salsa20.cpp
+++ b/node/Salsa20.cpp
@@ -175,41 +175,34 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
__m128i X1 = _mm_loadu_si128((const __m128i *)&(_state.v[1]));
__m128i X2 = _mm_loadu_si128((const __m128i *)&(_state.v[2]));
__m128i X3 = _mm_loadu_si128((const __m128i *)&(_state.v[3]));
+ __m128i T;
__m128i X0s = X0;
__m128i X1s = X1;
__m128i X2s = X2;
__m128i X3s = X3;
for (i=0;i<_roundsDiv4;++i) {
- __m128i T = _mm_add_epi32(X0, X3);
- X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
- X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
+ T = _mm_add_epi32(X0, X3);
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
- X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
- X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
- X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
- X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
- X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
- X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
- X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
- X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
- X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
- X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
- X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
- X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
- X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
- X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
@@ -218,34 +211,26 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
// --
T = _mm_add_epi32(X0, X3);
- X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
- X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X1, X0);
- X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
- X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X1);
- X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
- X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X3, X2);
- X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
- X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x93);
X2 = _mm_shuffle_epi32(X2, 0x4E);
X3 = _mm_shuffle_epi32(X3, 0x39);
T = _mm_add_epi32(X0, X1);
- X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
- X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
+ X3 = _mm_xor_si128(_mm_xor_si128(X3, _mm_slli_epi32(T, 7)), _mm_srli_epi32(T, 25));
T = _mm_add_epi32(X3, X0);
- X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
- X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
+ X2 = _mm_xor_si128(_mm_xor_si128(X2, _mm_slli_epi32(T, 9)), _mm_srli_epi32(T, 23));
T = _mm_add_epi32(X2, X3);
- X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
- X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
+ X1 = _mm_xor_si128(_mm_xor_si128(X1, _mm_slli_epi32(T, 13)), _mm_srli_epi32(T, 19));
T = _mm_add_epi32(X1, X2);
- X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
- X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));
+ X0 = _mm_xor_si128(_mm_xor_si128(X0, _mm_slli_epi32(T, 18)), _mm_srli_epi32(T, 14));
X1 = _mm_shuffle_epi32(X1, 0x39);
X2 = _mm_shuffle_epi32(X2, 0x4E);
@@ -257,22 +242,14 @@ void Salsa20::encrypt(const void *in,void *out,unsigned int bytes)
X2 = _mm_add_epi32(X2s,X2);
X3 = _mm_add_epi32(X3s,X3);
- {
- __m128i k02 = _mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32));
- k02 = _mm_shuffle_epi32(k02, _MM_SHUFFLE(0, 1, 2, 3));
- __m128i k13 = _mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32));
- k13 = _mm_shuffle_epi32(k13, _MM_SHUFFLE(0, 1, 2, 3));
- __m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
- __m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
-
- const float *const mv = (const float *)m;
- float *const cv = (float *)c;
-
- _mm_storeu_ps(cv,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_castps_si128(_mm_loadu_ps(mv)))));
- _mm_storeu_ps(cv + 4,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_castps_si128(_mm_loadu_ps(mv + 4)))));
- _mm_storeu_ps(cv + 8,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_castps_si128(_mm_loadu_ps(mv + 8)))));
- _mm_storeu_ps(cv + 12,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_castps_si128(_mm_loadu_ps(mv + 12)))));
- }
+ __m128i k02 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X0, 32), _mm_srli_epi64(X3, 32)), _MM_SHUFFLE(0, 1, 2, 3));
+ __m128i k13 = _mm_shuffle_epi32(_mm_or_si128(_mm_slli_epi64(X1, 32), _mm_srli_epi64(X0, 32)), _MM_SHUFFLE(0, 1, 2, 3));
+ __m128i k20 = _mm_or_si128(_mm_and_si128(X2, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X1, _S20SSECONSTANTS.maskHi32));
+ __m128i k31 = _mm_or_si128(_mm_and_si128(X3, _S20SSECONSTANTS.maskLo32), _mm_and_si128(X2, _S20SSECONSTANTS.maskHi32));
+ _mm_storeu_ps(reinterpret_cast<float *>(c),_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k02,k20),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m))))));
+ _mm_storeu_ps(reinterpret_cast<float *>(c) + 4,_mm_castsi128_ps(_mm_xor_si128(_mm_unpackhi_epi64(k13,k31),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 4)))));
+ _mm_storeu_ps(reinterpret_cast<float *>(c) + 8,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k20,k02),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 8)))));
+ _mm_storeu_ps(reinterpret_cast<float *>(c) + 12,_mm_castsi128_ps(_mm_xor_si128(_mm_unpacklo_epi64(k31,k13),_mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(m) + 12)))));
if (!(++_state.i[8])) {
++_state.i[5]; // state reordered for SSE
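
The last hunk keeps the same keystream reassembly (the k02/k13/k20/k31 vectors recombine halves of adjacent state registers to undo the SSE-friendly word ordering noted by the "state reordered for SSE" comment) but folds the follow-up shuffles into the k02/k13 initializers and replaces the block-scoped mv/cv float pointers with direct reinterpret_casts. The float load/store intrinsics serve purely as unaligned 128-bit moves; a minimal sketch of that XOR-into-output step (hypothetical names key/msg/out and helper xorBlock16, not code from the commit) looks like this:

// Sketch: XOR one 16-byte keystream block into 16 bytes of message using the
// same unaligned float load/store plus cast idiom as the stores in the diff.
#include <emmintrin.h>
#include <xmmintrin.h>

static inline void xorBlock16(const void *key, const void *msg, void *out)
{
	const __m128i k = _mm_loadu_si128(reinterpret_cast<const __m128i *>(key));
	const __m128i m = _mm_castps_si128(_mm_loadu_ps(reinterpret_cast<const float *>(msg)));
	_mm_storeu_ps(reinterpret_cast<float *>(out), _mm_castsi128_ps(_mm_xor_si128(k, m)));
}
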