[common] new sse2 memcpy improvements

This commit is contained in:
Geoffrey McRae 2018-05-18 20:56:57 +10:00
parent 3c77c1eb2b
commit f63c8043af

View File

@ -26,98 +26,150 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#include "debug.h" #include "debug.h"
#if defined(__GNUC___) || defined(__GNUG__) #if defined(__GNUC__) || defined(__GNUG__)
#define OP(...) #__VA_ARGS__ "\n\t" #define OP(...) #__VA_ARGS__ "\n\t"
inline static void memcpySSE(void *dst, const void * src, size_t length) inline static void memcpySSE(void *dst, const void * src, size_t length)
{ {
#if defined(__x86_64__) || defined(__i386__) #if !defined(NATIVE_MEMCPY) && (defined(__x86_64__) || defined(__i386__))
if (length == 0 || dst == src)
return;
#ifdef __x86_64__
void * end = dst + (length & ~0xFF);
size_t off = (15 - ((length & 0xFF) >> 4));
off = (off < 8) ? off * 16 : 7 * 16 + (off - 7) * 10;
#else
void * end = dst + (length & ~0x7F); void * end = dst + (length & ~0x7F);
size_t rem = (7 - ((length & 0x7F) >> 4)) * 10; size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
#endif
__asm__ __volatile__ ( __asm__ __volatile__ (
// save the registers we intend to alter, failure to do so causes problems "cmp %[dst],%[end] \n\t"
// when gcc -O3 is used "je Remain_%= \n\t"
OP(push %[dst])
OP(push %[src])
OP(push %[end])
// perform 128 byte SIMD block copy // perform SIMD block copy
OP(cmp %[dst],%[end]) "loop_%=: \n\t"
OP(je ramain_%=) "vmovaps 0x00(%[src]),%%xmm0 \n\t"
OP(loop_%=:) "vmovaps 0x10(%[src]),%%xmm1 \n\t"
OP(vmovaps 0x00(%[src]),%%xmm0) "vmovaps 0x20(%[src]),%%xmm2 \n\t"
OP(vmovaps 0x10(%[src]),%%xmm1) "vmovaps 0x30(%[src]),%%xmm3 \n\t"
OP(vmovaps 0x20(%[src]),%%xmm2) "vmovaps 0x40(%[src]),%%xmm4 \n\t"
OP(vmovaps 0x30(%[src]),%%xmm3) "vmovaps 0x50(%[src]),%%xmm5 \n\t"
OP(vmovaps 0x40(%[src]),%%xmm4) "vmovaps 0x60(%[src]),%%xmm6 \n\t"
OP(vmovaps 0x50(%[src]),%%xmm5) "vmovaps 0x70(%[src]),%%xmm7 \n\t"
OP(vmovaps 0x60(%[src]),%%xmm6)
OP(vmovaps 0x70(%[src]),%%xmm7)
OP(vmovntdq %%xmm0,0x00(%[dst]))
OP(vmovntdq %%xmm1,0x10(%[dst]))
OP(vmovntdq %%xmm2,0x20(%[dst]))
OP(vmovntdq %%xmm3,0x30(%[dst]))
OP(vmovntdq %%xmm4,0x40(%[dst]))
OP(vmovntdq %%xmm5,0x50(%[dst]))
OP(vmovntdq %%xmm6,0x60(%[dst]))
OP(vmovntdq %%xmm7,0x70(%[dst]))
OP(add $0x80,%[dst])
OP(add $0x80,%[src])
OP(cmp %[dst],%[end])
OP(jne loop_%=)
// copy any remaining 16 byte blocks
OP(remain_%=:)
#ifdef __x86_64__ #ifdef __x86_64__
OP(leaq (%%rip), %[end]) "vmovaps 0x80(%[src]),%%xmm8 \n\t"
OP(add $10,%[end]) "vmovaps 0x90(%[src]),%%xmm9 \n\t"
#else "vmovaps 0xA0(%[src]),%%xmm10 \n\t"
OP(call .+5) "vmovaps 0xB0(%[src]),%%xmm11 \n\t"
OP(pop %[end]) "vmovaps 0xC0(%[src]),%%xmm12 \n\t"
OP(add $8,%[end]) "vmovaps 0xD0(%[src]),%%xmm13 \n\t"
"vmovaps 0xE0(%[src]),%%xmm14 \n\t"
"vmovaps 0xF0(%[src]),%%xmm15 \n\t"
#endif #endif
OP(add %[rem],%[end]) "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
OP(jmp *%[end]) "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
#ifdef __x86_64__
"vmovntdq %%xmm8 ,0x80(%[dst]) \n\t"
"vmovntdq %%xmm9 ,0x90(%[dst]) \n\t"
"vmovntdq %%xmm10,0xA0(%[dst]) \n\t"
"vmovntdq %%xmm11,0xB0(%[dst]) \n\t"
"vmovntdq %%xmm12,0xC0(%[dst]) \n\t"
"vmovntdq %%xmm13,0xD0(%[dst]) \n\t"
"vmovntdq %%xmm14,0xE0(%[dst]) \n\t"
"vmovntdq %%xmm15,0xF0(%[dst]) \n\t"
// jump table "add $0x100,%[dst] \n\t"
OP(vmovaps 0x60(%[src]),%%xmm0) "add $0x100,%[src] \n\t"
OP(vmovntdq %%xmm0,0x60(%[dst])) #else
OP(vmovaps 0x50(%[src]),%%xmm1) "add $0x80,%[dst] \n\t"
OP(vmovntdq %%xmm1,0x50(%[dst])) "add $0x80,%[src] \n\t"
OP(vmovaps 0x40(%[src]),%%xmm2) #endif
OP(vmovntdq %%xmm2,0x40(%[dst])) "cmp %[dst],%[end] \n\t"
OP(vmovaps 0x30(%[src]),%%xmm3) "jne loop_%= \n\t"
OP(vmovntdq %%xmm3,0x30(%[dst]))
OP(vmovaps 0x20(%[src]),%%xmm4)
OP(vmovntdq %%xmm4,0x20(%[dst]))
OP(vmovaps 0x10(%[src]),%%xmm5)
OP(vmovntdq %%xmm5,0x10(%[dst]))
OP(vmovaps 0x00(%[src]),%%xmm6)
OP(vmovntdq %%xmm6,0x00(%[dst]))
// alignment as the previous two instructions are only 4 bytes "Remain_%=: \n\t"
OP(nop)
OP(nop)
// restore the registers // copy any remaining 16 byte blocks
OP(pop %[end]) #ifdef __x86_64__
OP(pop %[src]) "leaq (%%rip), %[end]\n\t"
OP(pop %[dst]) "Offset_%=:\n\t"
: #else
: [dst]"r" (dst), "call .+5 \n\t"
[src]"r" (src), "Offset_%=:\n\t"
[end]"c" (end), "pop %[end] \n\t"
[rem]"d" (rem) #endif
: "xmm0", "add $(BlockTable_%= - Offset_%=), %[end]\n\t"
"xmm1", "add %[off],%[end] \n\t"
"xmm2", "jmp *%[end] \n\t"
"xmm3",
"xmm4", "BlockTable_%=:\n\t"
"xmm5", #ifdef __x86_64__
"xmm6", "vmovaps 0xE0(%[src]),%%xmm14 \n\t"
"xmm7", "vmovntdq %%xmm14,0xE0(%[dst]) \n\t"
"memory" "vmovaps 0xD0(%[src]),%%xmm13 \n\t"
"vmovntdq %%xmm13,0xD0(%[dst]) \n\t"
"vmovaps 0xC0(%[src]),%%xmm12 \n\t"
"vmovntdq %%xmm12,0xC0(%[dst]) \n\t"
"vmovaps 0xB0(%[src]),%%xmm11 \n\t"
"vmovntdq %%xmm11,0xB0(%[dst]) \n\t"
"vmovaps 0xA0(%[src]),%%xmm10 \n\t"
"vmovntdq %%xmm10,0xA0(%[dst]) \n\t"
"vmovaps 0x90(%[src]),%%xmm9 \n\t"
"vmovntdq %%xmm9 ,0x90(%[dst]) \n\t"
"vmovaps 0x80(%[src]),%%xmm8 \n\t"
"vmovntdq %%xmm8 ,0x80(%[dst]) \n\t"
"vmovaps 0x70(%[src]),%%xmm7 \n\t"
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
#endif
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
"nop\n\t"
"nop\n\t"
: [dst]"+r" (dst),
[src]"+r" (src),
[end]"+r" (end)
: [off]"r" (off)
: "xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
#ifdef __x86_64__
"xmm8",
"xmm9",
"xmm10",
"xmm11",
"xmm12",
"xmm13",
"xmm14",
"xmm15",
#endif
"memory"
); );
//copy any remaining bytes //copy any remaining bytes
@ -129,5 +181,5 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
#endif #endif
} }
#else #else
extern "C" void __fastcall memcpySSE(void *dst, const void * src, size_t length); extern "C" void memcpySSE(void *dst, const void * src, size_t length);
#endif #endif