diff options
author | Adam Ierymenko <adam.ierymenko@gmail.com> | 2017-05-03 07:43:23 -0700 |
---|---|---|
committer | Adam Ierymenko <adam.ierymenko@gmail.com> | 2017-05-03 07:43:23 -0700 |
commit | 41c187ba12fc05f6e9ccd5f8acbc248c2a3d16e1 (patch) | |
tree | 0a30d7afcdb07667f8bbe553fe4351291857336d /node | |
parent | d7c99728bc6a2f83f94fd936fb4855086265c691 (diff) | |
download | infinitytier-41c187ba12fc05f6e9ccd5f8acbc248c2a3d16e1.tar.gz infinitytier-41c187ba12fc05f6e9ccd5f8acbc248c2a3d16e1.zip |
Another very small crypto optimization.
Diffstat (limited to 'node')
-rw-r--r-- | node/Salsa20.hpp | 41 |
1 file changed, 40 insertions, 1 deletion
```diff
diff --git a/node/Salsa20.hpp b/node/Salsa20.hpp
index 52592602..bfb6d9d9 100644
--- a/node/Salsa20.hpp
+++ b/node/Salsa20.hpp
@@ -48,6 +48,43 @@ public:
 	static inline void memxor(uint8_t *d,const uint8_t *s,unsigned int len)
 	{
 #ifdef ZT_SALSA20_SSE
+		while (len >= 128) {
+			__m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
+			__m128i s1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 16));
+			__m128i s2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 32));
+			__m128i s3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 48));
+			__m128i s4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 64));
+			__m128i s5 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 80));
+			__m128i s6 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 96));
+			__m128i s7 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s + 112));
+			__m128i d0 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d));
+			__m128i d1 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 16));
+			__m128i d2 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 32));
+			__m128i d3 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 48));
+			__m128i d4 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 64));
+			__m128i d5 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 80));
+			__m128i d6 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 96));
+			__m128i d7 = _mm_loadu_si128(reinterpret_cast<__m128i *>(d + 112));
+			d0 = _mm_xor_si128(d0,s0);
+			d1 = _mm_xor_si128(d1,s1);
+			d2 = _mm_xor_si128(d2,s2);
+			d3 = _mm_xor_si128(d3,s3);
+			d4 = _mm_xor_si128(d4,s4);
+			d5 = _mm_xor_si128(d5,s5);
+			d6 = _mm_xor_si128(d6,s6);
+			d7 = _mm_xor_si128(d7,s7);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(d),d0);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 16),d1);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 32),d2);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 48),d3);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 64),d4);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 80),d5);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 96),d6);
+			_mm_storeu_si128(reinterpret_cast<__m128i *>(d + 112),d7);
+			s += 128;
+			d += 128;
+			len -= 128;
+		}
 		while (len >= 16) {
 			_mm_storeu_si128(reinterpret_cast<__m128i *>(d),_mm_xor_si128(_mm_loadu_si128(reinterpret_cast<__m128i *>(d)),_mm_loadu_si128(reinterpret_cast<const __m128i *>(s))));
 			s += 16;
@@ -67,8 +104,10 @@ public:
 		}
 #endif
 #endif
-		while (len--)
+		while (len) {
+			--len;
 			*(d++) ^= *(s++);
+		}
 	}
 
 	/**
```