From 060870462606f82d3a9b0fe6aca87f8a5de8705e Mon Sep 17 00:00:00 2001 From: Adam Ierymenko Date: Fri, 15 Dec 2017 11:03:20 -0800 Subject: Some micro-optimizations, including a memcpy that is faster than Linux for most distro/compiler versions. --- node/Utils.hpp | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) (limited to 'node/Utils.hpp') diff --git a/node/Utils.hpp b/node/Utils.hpp index 87584fcf..20108e27 100644 --- a/node/Utils.hpp +++ b/node/Utils.hpp @@ -40,6 +40,44 @@ #include "Constants.hpp" +// So it's 2017 and this still helps on most Linux versions. It shouldn't but it does. Go figure. +#if defined(__LINUX__) && ((defined(_MSC_VER) || defined(__GNUC__)) && (defined(__amd64) || defined(__amd64__) || defined(__x86_64) || defined(__x86_64__) || defined(__AMD64) || defined(__AMD64__) || defined(_M_X64))) +#include +static inline void ZT_FAST_MEMCPY(void *a,const void *b,unsigned long k) +{ + char *aa = reinterpret_cast(a); + const char *bb = reinterpret_cast(b); + while (likely(k >= 128)) { + __m128i t1 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + __m128i t2 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + __m128i t3 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + __m128i t4 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t1); aa += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t2); aa += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t3); aa += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t4); aa += 16; + __m128i t5 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + __m128i t6 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + __m128i t7 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + __m128i t8 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t5); aa += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t6); aa += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t7); aa += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t8); aa += 16; + k -= 128; + } + while (likely(k >= 16)) { + __m128i t1 = _mm_loadu_si128(reinterpret_cast(bb)); bb += 16; + _mm_storeu_si128(reinterpret_cast<__m128i *>(aa),t1); aa += 16; + k -= 16; + } + for(unsigned long i=0;i