summary | refs | log | tree | commit | diff
path: root/node
diff options
context:
space:
mode:
author: Adam Ierymenko <adam.ierymenko@gmail.com> 2017-05-03 07:43:23 -0700
committer: Adam Ierymenko <adam.ierymenko@gmail.com> 2017-05-03 07:43:23 -0700
commit: 41c187ba12fc05f6e9ccd5f8acbc248c2a3d16e1 (patch)
tree: 0a30d7afcdb07667f8bbe553fe4351291857336d /node
parent: d7c99728bc6a2f83f94fd936fb4855086265c691 (diff)
download: infinitytier-41c187ba12fc05f6e9ccd5f8acbc248c2a3d16e1.tar.gz
download: infinitytier-41c187ba12fc05f6e9ccd5f8acbc248c2a3d16e1.zip
Another very small crypto optimization.
Diffstat (limited to 'node')
-rw-r--r-- node/Salsa20.hpp 41
1 file changed, 40 insertions, 1 deletion
diff --git a/node/Salsa20.hpp b/node/Salsa20.hpp
index 52592602..bfb6d9d9 100644
--- a/node/Salsa20.hpp
+++ b/node/Salsa20.hpp
@@ -48,6 +48,43 @@ public:
static inline void memxor(uint8_t *d,const uint8_t *s,unsigned int len)
{
#ifdef ZT_SALSA20_SSE
+ while (len >= 128) {
+ __m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
+ __m128i s1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 16));
+ __m128i s2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 32));
+ __m128i s3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 48));
+ __m128i s4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 64));
+ __m128i s5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 80));
+ __m128i s6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 96));
+ __m128i s7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 112));
+ __m128i d0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d));
+ __m128i d1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 16));
+ __m128i d2 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 32));
+ __m128i d3 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 48));
+ __m128i d4 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 64));
+ __m128i d5 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 80));
+ __m128i d6 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 96));
+ __m128i d7 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 112));
+ d0 = _mm_xor_si128(d0,s0);
+ d1 = _mm_xor_si128(d1,s1);
+ d2 = _mm_xor_si128(d2,s2);
+ d3 = _mm_xor_si128(d3,s3);
+ d4 = _mm_xor_si128(d4,s4);
+ d5 = _mm_xor_si128(d5,s5);
+ d6 = _mm_xor_si128(d6,s6);
+ d7 = _mm_xor_si128(d7,s7);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(d),d0);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 16),d1);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 32),d2);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 48),d3);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 64),d4);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 80),d5);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 96),d6);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(d + 112),d7);
+ s += 128;
+ d += 128;
+ len -= 128;
+ }
while (len >= 16) {
_mm_storeu_si128(reinterpret_cast<__m128i *>(d),_mm_xor_si128(_mm_loadu_si128(reinterpret_cast<__m128i *>(d)),_mm_loadu_si128(reinterpret_cast<const __m128i *>(s))));
s += 16;
@@ -67,8 +104,10 @@ public:
}
#endif
#endif
- while (len--)
+ while (len) {
+ --len;
*(d++) ^= *(s++);
+ }
}
/**