author | Adam Ierymenko <adam.ierymenko@gmail.com> | 2015-07-02 09:00:00 -0700 |
---|---|---|
committer | Adam Ierymenko <adam.ierymenko@gmail.com> | 2015-07-02 09:00:00 -0700 |
commit | f19c3c51d3ca2bc886a9125aa9b187aa794b1676 (patch) | |
tree | 70c50fbf266cc7513b25ee007982e6d798933830 | |
parent | 7c9949eea3d4d40b1f1f2787ec774449ef3b8e3d (diff) | |
Revert slow non-SSE Salsa20 modification since it did not fix Android/ARM issue. Also update Salsa20 comments and clean up a bit.
-rw-r--r-- | node/Salsa20.cpp | 44
-rw-r--r-- | node/Salsa20.hpp | 2

2 files changed, 24 insertions, 22 deletions
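For context on what is being toggled: the reverted modification had swapped the non-SSE 32-bit load/store macros for a byte-by-byte form meant to avoid unaligned uint32_t access on some Android/ARM devices; this commit restores the direct 32-bit form and keeps the byte-wise one only as a comment. A minimal standalone sketch of the two variants (macro bodies taken from the diff below; the _BYTEWISE/_DIRECT suffixes and the main() harness are added here purely for comparison):

```cpp
#include <cinttypes>
#include <cstdint>
#include <cstdio>

// Byte-wise form (the reverted "slow" path, kept in the diff only as a comment):
// assembles the word one byte at a time, so it never performs an unaligned
// 32-bit access and is safe on strict-alignment CPUs.
#define U8TO32_LITTLE_BYTEWISE(p) \
	( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )

// Direct form (the fast path this commit restores): a single 32-bit load
// through a cast pointer, which assumes the platform tolerates unaligned
// access (and, in this branch, that it is little-endian).
#define U8TO32_LITTLE_DIRECT(p) (*((const uint32_t *)((const void *)(p))))

int main()
{
	uint8_t buf[8] = { 0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08 };

	// From an aligned pointer on a little-endian machine both forms read 0x04030201.
	std::printf("%08" PRIx32 " %08" PRIx32 "\n",U8TO32_LITTLE_BYTEWISE(buf),U8TO32_LITTLE_DIRECT(buf));

	// From an unaligned pointer only the byte-wise form is guaranteed safe; the
	// direct form may fault (or be slow) on CPUs that enforce 32-bit alignment.
	std::printf("%08" PRIx32 "\n",U8TO32_LITTLE_BYTEWISE(buf + 1));

	return 0;
}
```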
diff --git a/node/Salsa20.cpp b/node/Salsa20.cpp
index 2eb68381..ae8e1802 100644
--- a/node/Salsa20.cpp
+++ b/node/Salsa20.cpp
@@ -1,51 +1,53 @@
 /*
  * Based on public domain code available at: http://cr.yp.to/snuffle.html
  *
- * This therefore is public domain.
+ * Modifications and C-native SSE macro based SSE implementation by
+ * Adam Ierymenko <adam.ierymenko@zerotier.com>.
+ *
+ * Since the original was public domain, this is too.
  */
 
-#include "Salsa20.hpp"
 #include "Constants.hpp"
+#include "Salsa20.hpp"
 
 #define ROTATE(v,c) (((v) << (c)) | ((v) >> (32 - (c))))
 #define XOR(v,w) ((v) ^ (w))
 #define PLUS(v,w) ((uint32_t)((v) + (w)))
 
+// Set up load/store macros with appropriate endianness (we don't use these in SSE mode)
 #ifndef ZT_SALSA20_SSE
 
 #if __BYTE_ORDER == __LITTLE_ENDIAN
 
-/* We have a slower version of these macros for CPU/compiler combos that
- * do not allow unaligned access to a uint32_t. Another solution would be
- * to methodically require alignment across the code, but this is quicker
- * for now. The culprit appears to be some Android-based ARM devices. */
-#if 1
-#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )
-static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v)
-{
-	c[0] = (uint8_t)v;
-	c[1] = (uint8_t)(v >> 8);
-	c[2] = (uint8_t)(v >> 16);
-	c[3] = (uint8_t)(v >> 24);
-}
-#else
+// Slow version that does not use type punning
+//#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )
+//static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v) { c[0] = (uint8_t)v; c[1] = (uint8_t)(v >> 8); c[2] = (uint8_t)(v >> 16); c[3] = (uint8_t)(v >> 24); }
+
+// Fast version that just does 32-bit load/store
 #define U8TO32_LITTLE(p) (*((const uint32_t *)((const void *)(p))))
 #define U32TO8_LITTLE(c,v) *((uint32_t *)((void *)(c))) = (v)
 
-#endif
-#else // big endian
+#else // __BYTE_ORDER == __BIG_ENDIAN (we don't support anything else... does MIDDLE_ENDIAN even still exist?)
 
 #ifdef __GNUC__
+
+// Use GNUC builtin bswap macros on big-endian machines if available
 #define U8TO32_LITTLE(p) __builtin_bswap32(*((const uint32_t *)((const void *)(p))))
 #define U32TO8_LITTLE(c,v) *((uint32_t *)((void *)(c))) = __builtin_bswap32((v))
-#else // no bswap stuff... need to do it manually?
-error need be;
+
+#else // no __GNUC__
+
+// Otherwise do it the slow, manual way on BE machines
+#define U8TO32_LITTLE(p) ( ((uint32_t)(p)[0]) | ((uint32_t)(p)[1] << 8) | ((uint32_t)(p)[2] << 16) | ((uint32_t)(p)[3] << 24) )
+static inline void U32TO8_LITTLE(uint8_t *const c,const uint32_t v) { c[0] = (uint8_t)v; c[1] = (uint8_t)(v >> 8); c[2] = (uint8_t)(v >> 16); c[3] = (uint8_t)(v >> 24); }
+
 #endif // __GNUC__ or not
 
-#endif // little/big endian
+#endif // __BYTE_ORDER little or big?
 
 #endif // !ZT_SALSA20_SSE
 
+// Statically compute and define SSE constants
 #ifdef ZT_SALSA20_SSE
 class _s20sseconsts
 {
diff --git a/node/Salsa20.hpp b/node/Salsa20.hpp
index 9631a6db..3bb041ac 100644
--- a/node/Salsa20.hpp
+++ b/node/Salsa20.hpp
@@ -78,7 +78,7 @@ public:
 	}
 
 private:
-	volatile union {
+	union {
 #ifdef ZT_SALSA20_SSE
 		__m128i v[4];
 #endif // ZT_SALSA20_SSE
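Not part of this commit, but relevant to the design choice it reverts: a memcpy-based load/store is a common third option that stays well-defined for unaligned pointers and avoids the aliasing cast, while optimizing compilers usually reduce it to a single 32-bit access on targets that allow one. A minimal sketch, assuming only standard C++; the u8to32_le/u32to8_le names are illustrative and not from the ZeroTier sources:

```cpp
#include <cstdint>
#include <cstring>

// Alignment- and aliasing-safe 32-bit load/store built on memcpy. On a
// little-endian target an optimizing compiler generally reduces each memcpy
// to one mov/ldr; on a big-endian target a byte swap would still be needed,
// as the diff above does with __builtin_bswap32.
static inline uint32_t u8to32_le(const void *p)
{
	uint32_t v;
	std::memcpy(&v,p,sizeof(v)); // well-defined regardless of p's alignment
	return v;
}

static inline void u32to8_le(void *c,uint32_t v)
{
	std::memcpy(c,&v,sizeof(v));
}
```

The committed code instead keeps the cast-based macros as the fast path on little-endian and handles big-endian explicitly with __builtin_bswap32, as shown in the diff above.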