mirror of
https://github.com/gnif/LookingGlass.git
synced 2024-11-10 00:28:20 +00:00
[common] more SSE improvements
* 32bit inlined is slow for only large copies, warn if memcpySSE is used when it shouldn't be. * Removed 64bit memcpySSE as native inlined is faster See: https://stackoverflow.com/questions/50422510/why-is-i386-memcpy-slow-on-x86-64
This commit is contained in:
parent
778af24d82
commit
56f0a8525b
@ -26,176 +26,112 @@ Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#if defined(__GNUC__) || defined(__GNUG__)
|
||||
#if defined(NATIVE_MEMCPY)
|
||||
#define memcpySSE memcpy
|
||||
#elif defined(MSVC)
|
||||
extern "C" void memcpySSE(void *dst, const void * src, size_t length);
|
||||
#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
|
||||
inline static void memcpySSE(void *dst, const void * src, size_t length)
|
||||
{
|
||||
if (length == 0 || dst == src)
|
||||
return;
|
||||
|
||||
#define OP(...) #__VA_ARGS__ "\n\t"
|
||||
// copies under 1MB are faster with the inlined memcpy
|
||||
// tell the dev to use that instead
|
||||
if (length < 1048576)
|
||||
{
|
||||
static bool smallBufferWarn = false;
|
||||
if (!smallBufferWarn)
|
||||
{
|
||||
DEBUG_WARN("Do not use memcpySSE for copies under 1MB in size!");
|
||||
smallBufferWarn = true;
|
||||
}
|
||||
memcpy(dst, src, length);
|
||||
return;
|
||||
}
|
||||
|
||||
inline static void memcpySSE(void *dst, const void * src, size_t length)
|
||||
{
|
||||
#if !defined(NATIVE_MEMCPY) && (defined(__x86_64__) || defined(__i386__))
|
||||
if (length == 0 || dst == src)
|
||||
return;
|
||||
const void * end = dst + (length & ~0x7F);
|
||||
const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
|
||||
|
||||
#ifdef __x86_64__
|
||||
const void * end = dst + (length & ~0xFF);
|
||||
size_t off = (15 - ((length & 0xFF) >> 4));
|
||||
off = (off < 8) ? off * 16 : 7 * 16 + (off - 7) * 10;
|
||||
__asm__ __volatile__ (
|
||||
"cmp %[dst],%[end] \n\t"
|
||||
"je Remain_%= \n\t"
|
||||
|
||||
// perform SIMD block copy
|
||||
"loop_%=: \n\t"
|
||||
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
|
||||
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
|
||||
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
|
||||
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
|
||||
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
|
||||
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
|
||||
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
|
||||
"vmovaps 0x70(%[src]),%%xmm7 \n\t"
|
||||
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
|
||||
"add $0x80,%[dst] \n\t"
|
||||
"add $0x80,%[src] \n\t"
|
||||
"cmp %[dst],%[end] \n\t"
|
||||
"jne loop_%= \n\t"
|
||||
|
||||
"Remain_%=: \n\t"
|
||||
|
||||
// copy any remaining 16 byte blocks
|
||||
"call GetPC_%=\n\t"
|
||||
"Offset_%=:\n\t"
|
||||
"add $(BlockTable_%= - Offset_%=), %%eax \n\t"
|
||||
"add %[off],%%eax \n\t"
|
||||
"jmp *%%eax \n\t"
|
||||
|
||||
"GetPC_%=:\n\t"
|
||||
"mov (%%esp), %%eax \n\t"
|
||||
"ret \n\t"
|
||||
|
||||
"BlockTable_%=:\n\t"
|
||||
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
|
||||
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
|
||||
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
|
||||
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
|
||||
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
|
||||
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
|
||||
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
|
||||
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
|
||||
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
|
||||
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
|
||||
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
|
||||
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
|
||||
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
|
||||
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
|
||||
"nop\n\t"
|
||||
"nop\n\t"
|
||||
|
||||
: [dst]"+r" (dst),
|
||||
[src]"+r" (src)
|
||||
: [off]"r" (off),
|
||||
[end]"r" (end)
|
||||
: "eax",
|
||||
"xmm0",
|
||||
"xmm1",
|
||||
"xmm2",
|
||||
"xmm3",
|
||||
"xmm4",
|
||||
"xmm5",
|
||||
"xmm6",
|
||||
"xmm7",
|
||||
"memory"
|
||||
);
|
||||
|
||||
//copy any remaining bytes
|
||||
for(size_t i = (length & 0xF); i; --i)
|
||||
((uint8_t *)dst)[length - i] =
|
||||
((uint8_t *)src)[length - i];
|
||||
}
|
||||
#else
|
||||
const void * end = dst + (length & ~0x7F);
|
||||
const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
|
||||
#endif
|
||||
|
||||
#ifdef __x86_64__
|
||||
#define REG "rax"
|
||||
#else
|
||||
#define REG "eax"
|
||||
#endif
|
||||
|
||||
__asm__ __volatile__ (
|
||||
"cmp %[dst],%[end] \n\t"
|
||||
"je Remain_%= \n\t"
|
||||
|
||||
// perform SIMD block copy
|
||||
"loop_%=: \n\t"
|
||||
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
|
||||
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
|
||||
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
|
||||
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
|
||||
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
|
||||
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
|
||||
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
|
||||
"vmovaps 0x70(%[src]),%%xmm7 \n\t"
|
||||
#ifdef __x86_64__
|
||||
"vmovaps 0x80(%[src]),%%xmm8 \n\t"
|
||||
"vmovaps 0x90(%[src]),%%xmm9 \n\t"
|
||||
"vmovaps 0xA0(%[src]),%%xmm10 \n\t"
|
||||
"vmovaps 0xB0(%[src]),%%xmm11 \n\t"
|
||||
"vmovaps 0xC0(%[src]),%%xmm12 \n\t"
|
||||
"vmovaps 0xD0(%[src]),%%xmm13 \n\t"
|
||||
"vmovaps 0xE0(%[src]),%%xmm14 \n\t"
|
||||
"vmovaps 0xF0(%[src]),%%xmm15 \n\t"
|
||||
#endif
|
||||
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
|
||||
#ifdef __x86_64__
|
||||
"vmovntdq %%xmm8 ,0x80(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm9 ,0x90(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm10,0xA0(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm11,0xB0(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm12,0xC0(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm13,0xD0(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm14,0xE0(%[dst]) \n\t"
|
||||
"vmovntdq %%xmm15,0xF0(%[dst]) \n\t"
|
||||
|
||||
"add $0x100,%[dst] \n\t"
|
||||
"add $0x100,%[src] \n\t"
|
||||
#else
|
||||
"add $0x80,%[dst] \n\t"
|
||||
"add $0x80,%[src] \n\t"
|
||||
#endif
|
||||
"cmp %[dst],%[end] \n\t"
|
||||
"jne loop_%= \n\t"
|
||||
|
||||
"Remain_%=: \n\t"
|
||||
|
||||
// copy any remaining 16 byte blocks
|
||||
#ifdef __x86_64__
|
||||
"leaq (%%rip), %%rax\n\t"
|
||||
#else
|
||||
"call GetPC_%=\n\t"
|
||||
#endif
|
||||
"Offset_%=:\n\t"
|
||||
"add $(BlockTable_%= - Offset_%=), %%" REG "\n\t"
|
||||
"add %[off],%%" REG " \n\t"
|
||||
"jmp *%%" REG " \n\t"
|
||||
|
||||
#ifdef __i386__
|
||||
"GetPC_%=:\n\t"
|
||||
"mov (%%esp), %%eax \n\t"
|
||||
"ret \n\t"
|
||||
#endif
|
||||
|
||||
"BlockTable_%=:\n\t"
|
||||
#ifdef __x86_64__
|
||||
"vmovaps 0xE0(%[src]),%%xmm14 \n\t"
|
||||
"vmovntdq %%xmm14,0xE0(%[dst]) \n\t"
|
||||
"vmovaps 0xD0(%[src]),%%xmm13 \n\t"
|
||||
"vmovntdq %%xmm13,0xD0(%[dst]) \n\t"
|
||||
"vmovaps 0xC0(%[src]),%%xmm12 \n\t"
|
||||
"vmovntdq %%xmm12,0xC0(%[dst]) \n\t"
|
||||
"vmovaps 0xB0(%[src]),%%xmm11 \n\t"
|
||||
"vmovntdq %%xmm11,0xB0(%[dst]) \n\t"
|
||||
"vmovaps 0xA0(%[src]),%%xmm10 \n\t"
|
||||
"vmovntdq %%xmm10,0xA0(%[dst]) \n\t"
|
||||
"vmovaps 0x90(%[src]),%%xmm9 \n\t"
|
||||
"vmovntdq %%xmm9 ,0x90(%[dst]) \n\t"
|
||||
"vmovaps 0x80(%[src]),%%xmm8 \n\t"
|
||||
"vmovntdq %%xmm8 ,0x80(%[dst]) \n\t"
|
||||
"vmovaps 0x70(%[src]),%%xmm7 \n\t"
|
||||
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
|
||||
#endif
|
||||
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
|
||||
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
|
||||
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
|
||||
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
|
||||
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
|
||||
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
|
||||
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
|
||||
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
|
||||
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
|
||||
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
|
||||
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
|
||||
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
|
||||
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
|
||||
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
|
||||
"nop\n\t"
|
||||
"nop\n\t"
|
||||
|
||||
: [dst]"+r" (dst),
|
||||
[src]"+r" (src)
|
||||
: [off]"r" (off),
|
||||
[end]"r" (end)
|
||||
: REG,
|
||||
"xmm0",
|
||||
"xmm1",
|
||||
"xmm2",
|
||||
"xmm3",
|
||||
"xmm4",
|
||||
"xmm5",
|
||||
"xmm6",
|
||||
"xmm7",
|
||||
#ifdef __x86_64__
|
||||
"xmm8",
|
||||
"xmm9",
|
||||
"xmm10",
|
||||
"xmm11",
|
||||
"xmm12",
|
||||
"xmm13",
|
||||
"xmm14",
|
||||
"xmm15",
|
||||
"rax",
|
||||
#else
|
||||
"eax",
|
||||
#endif
|
||||
"memory"
|
||||
);
|
||||
|
||||
#undef REG
|
||||
|
||||
//copy any remaining bytes
|
||||
for(size_t i = (length & 0xF); i; --i)
|
||||
((uint8_t *)dst)[length - i] =
|
||||
((uint8_t *)src)[length - i];
|
||||
#else
|
||||
memcpy(dst, src, length);
|
||||
#endif
|
||||
}
|
||||
#else
|
||||
extern "C" void memcpySSE(void *dst, const void * src, size_t length);
|
||||
#define memcpySSE memcpy
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user