diff --git a/common/memcpySSE2.h b/common/memcpySSE2.h
index a43317dc..f3a95333 100644
--- a/common/memcpySSE2.h
+++ b/common/memcpySSE2.h
@@ -26,176 +26,112 @@ Place, Suite 330, Boston, MA 02111-1307 USA
 
 #include "debug.h"
 
-#if defined(__GNUC__) || defined(__GNUG__)
+#if defined(NATIVE_MEMCPY)
+  #define memcpySSE memcpy
+#elif defined(MSVC)
+  extern "C" void memcpySSE(void *dst, const void * src, size_t length);
+#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
+  inline static void memcpySSE(void *dst, const void * src, size_t length)
+  {
+    if (length == 0 || dst == src)
+      return;
 
-#define OP(...) #__VA_ARGS__ "\n\t"
+    // copies under 1MB are faster with the compiler's inlined memcpy,
+    // so warn the developer to use that instead
+    if (length < 1048576)
+    {
+      static bool smallBufferWarn = false;
+      if (!smallBufferWarn)
+      {
+        DEBUG_WARN("Do not use memcpySSE for copies under 1MB in size!");
+        smallBufferWarn = true;
+      }
+      memcpy(dst, src, length);
+      return;
+    }
 
-inline static void memcpySSE(void *dst, const void * src, size_t length)
-{
-#if !defined(NATIVE_MEMCPY) && (defined(__x86_64__) || defined(__i386__))
-  if (length == 0 || dst == src)
-    return;
+    const void * end = dst + (length & ~0x7F);
+    const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
 
-#ifdef __x86_64__
-  const void * end = dst + (length & ~0xFF);
-  size_t off = (15 - ((length & 0xFF) >> 4));
-  off = (off < 8) ? off * 16 : 7 * 16 + (off - 7) * 10;
+    __asm__ __volatile__ (
+      "cmp %[dst],%[end] \n\t"
+      "je Remain_%= \n\t"
+
+      // perform SIMD block copy
+      "loop_%=: \n\t"
+      "vmovaps 0x00(%[src]),%%xmm0 \n\t"
+      "vmovaps 0x10(%[src]),%%xmm1 \n\t"
+      "vmovaps 0x20(%[src]),%%xmm2 \n\t"
+      "vmovaps 0x30(%[src]),%%xmm3 \n\t"
+      "vmovaps 0x40(%[src]),%%xmm4 \n\t"
+      "vmovaps 0x50(%[src]),%%xmm5 \n\t"
+      "vmovaps 0x60(%[src]),%%xmm6 \n\t"
+      "vmovaps 0x70(%[src]),%%xmm7 \n\t"
+      "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
+      "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
+      "vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
+      "vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
+      "vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
+      "vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
+      "vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
+      "vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
+      "add $0x80,%[dst] \n\t"
+      "add $0x80,%[src] \n\t"
+      "cmp %[dst],%[end] \n\t"
+      "jne loop_%= \n\t"
+
+      "Remain_%=: \n\t"
+
+      // copy any remaining 16 byte blocks
+      "call GetPC_%=\n\t"
+      "Offset_%=:\n\t"
+      "add $(BlockTable_%= - Offset_%=), %%eax \n\t"
+      "add %[off],%%eax \n\t"
+      "jmp *%%eax \n\t"
+
+      "GetPC_%=:\n\t"
+      "mov (%%esp), %%eax \n\t"
+      "ret \n\t"
+
+      "BlockTable_%=:\n\t"
+      "vmovaps 0x60(%[src]),%%xmm6 \n\t"
+      "vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
+      "vmovaps 0x50(%[src]),%%xmm5 \n\t"
+      "vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
+      "vmovaps 0x40(%[src]),%%xmm4 \n\t"
+      "vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
+      "vmovaps 0x30(%[src]),%%xmm3 \n\t"
+      "vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
+      "vmovaps 0x20(%[src]),%%xmm2 \n\t"
+      "vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
+      "vmovaps 0x10(%[src]),%%xmm1 \n\t"
+      "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
+      "vmovaps 0x00(%[src]),%%xmm0 \n\t"
+      "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
+      "nop\n\t"
+      "nop\n\t"
+
+      : [dst]"+r" (dst),
+        [src]"+r" (src)
+      : [off]"r" (off),
+        [end]"r" (end)
+      : "eax",
+        "xmm0",
+        "xmm1",
+        "xmm2",
+        "xmm3",
+        "xmm4",
+        "xmm5",
+        "xmm6",
+        "xmm7",
+        "memory"
+    );
+
+    // copy any remaining bytes; the asm advanced dst and src past the
+    // 128 byte blocks, so index relative to the remaining length & 0x7F
+    for(size_t i = (length & 0xF); i; --i)
+      ((uint8_t *)dst)[(length & 0x7F) - i] =
+        ((uint8_t *)src)[(length & 0x7F) - i];
+  }
 #else
-  const void * end = dst + (length & ~0x7F);
-  const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
-#endif
-
-#ifdef __x86_64__
-  #define REG "rax"
-#else
-  #define REG "eax"
-#endif
-
-  __asm__ __volatile__ (
-    "cmp %[dst],%[end] \n\t"
-    "je Remain_%= \n\t"
-
-    // perform SIMD block copy
-    "loop_%=: \n\t"
-    "vmovaps 0x00(%[src]),%%xmm0 \n\t"
-    "vmovaps 0x10(%[src]),%%xmm1 \n\t"
-    "vmovaps 0x20(%[src]),%%xmm2 \n\t"
-    "vmovaps 0x30(%[src]),%%xmm3 \n\t"
-    "vmovaps 0x40(%[src]),%%xmm4 \n\t"
-    "vmovaps 0x50(%[src]),%%xmm5 \n\t"
-    "vmovaps 0x60(%[src]),%%xmm6 \n\t"
-    "vmovaps 0x70(%[src]),%%xmm7 \n\t"
-#ifdef __x86_64__
-    "vmovaps 0x80(%[src]),%%xmm8 \n\t"
-    "vmovaps 0x90(%[src]),%%xmm9 \n\t"
-    "vmovaps 0xA0(%[src]),%%xmm10 \n\t"
-    "vmovaps 0xB0(%[src]),%%xmm11 \n\t"
-    "vmovaps 0xC0(%[src]),%%xmm12 \n\t"
-    "vmovaps 0xD0(%[src]),%%xmm13 \n\t"
-    "vmovaps 0xE0(%[src]),%%xmm14 \n\t"
-    "vmovaps 0xF0(%[src]),%%xmm15 \n\t"
-#endif
-    "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
-    "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
-    "vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
-    "vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
-    "vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
-    "vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
-    "vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
-    "vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
-#ifdef __x86_64__
-    "vmovntdq %%xmm8 ,0x80(%[dst]) \n\t"
-    "vmovntdq %%xmm9 ,0x90(%[dst]) \n\t"
-    "vmovntdq %%xmm10,0xA0(%[dst]) \n\t"
-    "vmovntdq %%xmm11,0xB0(%[dst]) \n\t"
-    "vmovntdq %%xmm12,0xC0(%[dst]) \n\t"
-    "vmovntdq %%xmm13,0xD0(%[dst]) \n\t"
-    "vmovntdq %%xmm14,0xE0(%[dst]) \n\t"
-    "vmovntdq %%xmm15,0xF0(%[dst]) \n\t"
-
-    "add $0x100,%[dst] \n\t"
-    "add $0x100,%[src] \n\t"
-#else
-    "add $0x80,%[dst] \n\t"
-    "add $0x80,%[src] \n\t"
-#endif
-    "cmp %[dst],%[end] \n\t"
-    "jne loop_%= \n\t"
-
-    "Remain_%=: \n\t"
-
-    // copy any remaining 16 byte blocks
-#ifdef __x86_64__
-    "leaq (%%rip), %%rax\n\t"
-#else
-    "call GetPC_%=\n\t"
-#endif
-    "Offset_%=:\n\t"
-    "add $(BlockTable_%= - Offset_%=), %%" REG "\n\t"
-    "add %[off],%%" REG " \n\t"
-    "jmp *%%" REG " \n\t"
-
-#ifdef __i386__
-    "GetPC_%=:\n\t"
-    "mov (%%esp), %%eax \n\t"
-    "ret \n\t"
-#endif
-
-    "BlockTable_%=:\n\t"
-#ifdef __x86_64__
-    "vmovaps 0xE0(%[src]),%%xmm14 \n\t"
-    "vmovntdq %%xmm14,0xE0(%[dst]) \n\t"
-    "vmovaps 0xD0(%[src]),%%xmm13 \n\t"
-    "vmovntdq %%xmm13,0xD0(%[dst]) \n\t"
-    "vmovaps 0xC0(%[src]),%%xmm12 \n\t"
-    "vmovntdq %%xmm12,0xC0(%[dst]) \n\t"
-    "vmovaps 0xB0(%[src]),%%xmm11 \n\t"
-    "vmovntdq %%xmm11,0xB0(%[dst]) \n\t"
-    "vmovaps 0xA0(%[src]),%%xmm10 \n\t"
-    "vmovntdq %%xmm10,0xA0(%[dst]) \n\t"
-    "vmovaps 0x90(%[src]),%%xmm9 \n\t"
-    "vmovntdq %%xmm9 ,0x90(%[dst]) \n\t"
-    "vmovaps 0x80(%[src]),%%xmm8 \n\t"
-    "vmovntdq %%xmm8 ,0x80(%[dst]) \n\t"
-    "vmovaps 0x70(%[src]),%%xmm7 \n\t"
-    "vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
-#endif
-    "vmovaps 0x60(%[src]),%%xmm6 \n\t"
-    "vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
-    "vmovaps 0x50(%[src]),%%xmm5 \n\t"
-    "vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
-    "vmovaps 0x40(%[src]),%%xmm4 \n\t"
-    "vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
-    "vmovaps 0x30(%[src]),%%xmm3 \n\t"
-    "vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
-    "vmovaps 0x20(%[src]),%%xmm2 \n\t"
-    "vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
-    "vmovaps 0x10(%[src]),%%xmm1 \n\t"
-    "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
-    "vmovaps 0x00(%[src]),%%xmm0 \n\t"
-    "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
-    "nop\n\t"
-    "nop\n\t"
-
-    : [dst]"+r" (dst),
-      [src]"+r" (src)
-    : [off]"r" (off),
-      [end]"r" (end)
-    : REG,
-      "xmm0",
-      "xmm1",
-      "xmm2",
-      "xmm3",
-      "xmm4",
-      "xmm5",
-      "xmm6",
-      "xmm7",
-#ifdef __x86_64__
-      "xmm8",
-      "xmm9",
-      "xmm10",
-      "xmm11",
-      "xmm12",
-      "xmm13",
-      "xmm14",
-      "xmm15",
-      "rax",
-#else
-      "eax",
-#endif
-      "memory"
-  );
-
-#undef REG
-
-  //copy any remaining bytes
-  for(size_t i = (length & 0xF); i; --i)
-    ((uint8_t *)dst)[length - i] =
-      ((uint8_t *)src)[length - i];
-#else
-  memcpy(dst, src, length);
-#endif
-}
-#else
-extern "C" void memcpySSE(void *dst, const void * src, size_t length);
+  #define memcpySSE memcpy
 #endif
\ No newline at end of file
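
Two details of the new 32-bit path deserve a note. "BlockTable_%=" is a computed-jump remainder copy: each "vmovaps"/"vmovntdq" pair assembles to 10 bytes (the two trailing "nop"s pad the final pair, whose zero-displacement encoding is two bytes shorter), so off = (7 - ((length & 0x7F) >> 4)) * 10 lands far enough into the table that only the last N entries execute, copying exactly the N remaining 16-byte blocks. The "call GetPC_%=" / "mov (%%esp), %%eax" / "ret" sequence exists because i386, unlike x86-64 with its "leaq (%%rip)", has no direct way to read the instruction pointer.

For reference, below is the same strategy restated with SSE2 intrinsics. This is a sketch only, not part of the patch: memcpySSE_sketch is a hypothetical name, and like the asm it assumes both buffers are 16-byte aligned, since _mm_load_si128/_mm_stream_si128 (vmovaps/vmovntdq) fault on unaligned operands. It also adds the _mm_sfence() that weakly-ordered non-temporal stores need before the copied data can safely be consumed elsewhere; the asm version leaves that to the caller.

#include <emmintrin.h> // SSE2: __m128i, _mm_load_si128, _mm_stream_si128, _mm_sfence
#include <stdint.h>
#include <string.h>

// hypothetical illustration of the same algorithm; not part of the patch
inline static void memcpySSE_sketch(void *dst, const void *src, size_t length)
{
  uint8_t       *d = (uint8_t       *)dst;
  const uint8_t *s = (const uint8_t *)src;

  // main loop: eight 16-byte loads and non-temporal stores per 128-byte
  // chunk, the intrinsic form of loop_%= above
  for (size_t n = length >> 7; n; --n, d += 128, s += 128)
    for (int i = 0; i < 8; ++i)
      _mm_stream_si128((__m128i *)d + i,
                       _mm_load_si128((const __m128i *)s + i));

  // remaining whole 16-byte blocks; the asm handles these with the
  // computed jump into BlockTable_%=
  const size_t rem = (length & 0x7F) >> 4;
  for (size_t i = 0; i < rem; ++i)
    _mm_stream_si128((__m128i *)d + i,
                     _mm_load_si128((const __m128i *)s + i));

  _mm_sfence(); // drain the write-combining buffers of the streaming stores

  // final 1-15 tail bytes
  memcpy(d + rem * 16, s + rem * 16, length & 0xF);
}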
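
The non-temporal "vmovntdq" stores are why this beats a plain memcpy on huge buffers: a multi-megabyte copy would sweep the entire cache for lines that are never read back, while streaming stores bypass the cache and avoid the read-for-ownership traffic of ordinary writes. On small copies that advantage disappears, hence the 1MB floor and the DEBUG_WARN nudge toward the compiler's inlined memcpy.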