mirror of
https://github.com/gnif/LookingGlass.git
synced 2024-11-10 08:38:20 +00:00
[host] use the new memcpySSE implementation
This commit is contained in:
parent
6f141fe393
commit
15a337fee8
@ -4,8 +4,11 @@ memcpySSE proc
|
|||||||
; src = rdx
|
; src = rdx
|
||||||
; len = r8
|
; len = r8
|
||||||
|
|
||||||
|
mov rax, rcx
|
||||||
|
|
||||||
test r8, r8
|
test r8, r8
|
||||||
jz @Exit
|
jz @Exit
|
||||||
|
|
||||||
cmp rcx, rdx
|
cmp rcx, rdx
|
||||||
je @Exit
|
je @Exit
|
||||||
|
|
||||||
@ -16,7 +19,7 @@ memcpySSE proc
|
|||||||
; void * end = dst + (length & ~0x7F);
|
; void * end = dst + (length & ~0x7F);
|
||||||
; end = r10
|
; end = r10
|
||||||
mov r9 , r8
|
mov r9 , r8
|
||||||
and r9 , -07Fh
|
and r9 , 0FFFFFFFFFFFFFF80h
|
||||||
jz @RemainingBlocks
|
jz @RemainingBlocks
|
||||||
mov r10, rcx
|
mov r10, rcx
|
||||||
add r10, r9
|
add r10, r9
|
||||||
@ -50,6 +53,7 @@ memcpySSE proc
|
|||||||
and r11, 07Fh
|
and r11, 07Fh
|
||||||
jz @RestoreExit
|
jz @RestoreExit
|
||||||
shr r11, 4
|
shr r11, 4
|
||||||
|
jz @FinalBytes
|
||||||
|
|
||||||
mov r10, 7
|
mov r10, 7
|
||||||
sub r10, r11
|
sub r10, r11
|
||||||
@ -59,12 +63,11 @@ memcpySSE proc
|
|||||||
jmp r9
|
jmp r9
|
||||||
|
|
||||||
@RestoreExit:
|
@RestoreExit:
|
||||||
movdqa xmm6 , oword ptr [rsp + 4*8 + 00 ]
|
movdqa xmm6 , oword ptr [rsp + 4*8 + 00]
|
||||||
movdqa xmm7 , oword ptr [rsp + 4*8 + 16 ]
|
movdqa xmm7 , oword ptr [rsp + 4*8 + 16]
|
||||||
add rsp, 8 + 2*16 + 4*8
|
add rsp, 8 + 2*16 + 4*8
|
||||||
|
|
||||||
@Exit:
|
@Exit:
|
||||||
sfence
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
@FinalBlocks:
|
@FinalBlocks:
|
||||||
@ -82,135 +85,132 @@ memcpySSE proc
|
|||||||
vmovntdq xmmword ptr [rcx + 010h], xmm1
|
vmovntdq xmmword ptr [rcx + 010h], xmm1
|
||||||
vmovaps xmm0 , xmmword ptr [rdx + 000h]
|
vmovaps xmm0 , xmmword ptr [rdx + 000h]
|
||||||
vmovntdq xmmword ptr [rcx + 000h], xmm0
|
vmovntdq xmmword ptr [rcx + 000h], xmm0
|
||||||
nop
|
|
||||||
nop
|
|
||||||
|
|
||||||
imul r9, 16
|
movdqa xmm6 , oword ptr [rsp + 4*8 + 00]
|
||||||
add rdx, r9
|
movdqa xmm7 , oword ptr [rsp + 4*8 + 16]
|
||||||
add rcx, r9
|
add rsp, 8 + 2*16 + 4*8
|
||||||
|
sfence
|
||||||
|
|
||||||
@EndBlocks:
|
shl r11, 4
|
||||||
|
add rdx, r11
|
||||||
|
add rcx, r11
|
||||||
|
|
||||||
|
@FinalBytes:
|
||||||
and r8, 0Fh
|
and r8, 0Fh
|
||||||
test r8, r8
|
jz @Exit
|
||||||
je @RestoreExit
|
imul r8, 5
|
||||||
|
lea r9, @FinalBytesTable
|
||||||
|
add r9, r8
|
||||||
|
jmp r9
|
||||||
|
|
||||||
cmp r8, 2
|
@FinalBytesTable:
|
||||||
je @Copy2
|
jmp @Copy1
|
||||||
cmp r8, 3
|
jmp @Copy2
|
||||||
je @Copy3
|
jmp @Copy3
|
||||||
cmp r8, 4
|
jmp @Copy4
|
||||||
je @Copy4
|
jmp @Copy5
|
||||||
cmp r8, 5
|
jmp @Copy6
|
||||||
je @Copy5
|
jmp @Copy7
|
||||||
cmp r8, 6
|
jmp @Copy8
|
||||||
je @Copy6
|
jmp @Copy9
|
||||||
cmp r8, 7
|
jmp @Copy10
|
||||||
je @Copy7
|
jmp @Copy11
|
||||||
cmp r8, 8
|
jmp @Copy12
|
||||||
je @Copy8
|
jmp @Copy13
|
||||||
cmp r8, 9
|
jmp @Copy14
|
||||||
je @Copy9
|
jmp @Copy15
|
||||||
cmp r8, 10
|
|
||||||
je @Copy10
|
db 128 DUP(0CCh)
|
||||||
cmp r8, 11
|
|
||||||
je @Copy11
|
|
||||||
cmp r8, 12
|
|
||||||
je @Copy12
|
|
||||||
cmp r8, 13
|
|
||||||
je @Copy13
|
|
||||||
cmp r8, 14
|
|
||||||
je @Copy14
|
|
||||||
cmp r8, 15
|
|
||||||
je @Copy15
|
|
||||||
|
|
||||||
; fall through - 1 byte
|
; fall through - 1 byte
|
||||||
|
@Copy1:
|
||||||
mov al, byte ptr [rdx]
|
mov al, byte ptr [rdx]
|
||||||
mov byte ptr [rcx], al
|
mov byte ptr [rcx], al
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy2:
|
@Copy2:
|
||||||
mov r10w, word ptr [rdx]
|
mov r10w, word ptr [rdx]
|
||||||
mov word ptr [rcx], r10w
|
mov word ptr [rcx], r10w
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy3:
|
@Copy3:
|
||||||
mov r10w, word ptr [rdx]
|
mov r10w, word ptr [rdx]
|
||||||
mov word ptr [rcx], r10w
|
mov word ptr [rcx], r10w
|
||||||
mov al, byte ptr [rdx + 02h]
|
mov r11b, byte ptr [rdx + 02h]
|
||||||
mov byte ptr [rcx + 02h], al
|
mov byte ptr [rcx + 02h], r11b
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy4:
|
@Copy4:
|
||||||
mov r9d, dword ptr [rdx]
|
mov r9d, dword ptr [rdx]
|
||||||
mov dword ptr [rcx], r9d
|
mov dword ptr [rcx], r9d
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy5:
|
@Copy5:
|
||||||
mov r9d, dword ptr [rdx ]
|
mov r9d, dword ptr [rdx ]
|
||||||
mov al , byte ptr [rdx + 04h]
|
mov r11b , byte ptr [rdx + 04h]
|
||||||
mov dword ptr [rcx ], r9d
|
mov dword ptr [rcx ], r9d
|
||||||
mov byte ptr [rcx + 04h], al
|
mov byte ptr [rcx + 04h], r11b
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy6:
|
@Copy6:
|
||||||
mov r9d , dword ptr [rdx ]
|
mov r9d , dword ptr [rdx ]
|
||||||
mov r10w, word ptr [rdx + 04h]
|
mov r10w, word ptr [rdx + 04h]
|
||||||
mov dword ptr [rcx ], r9d
|
mov dword ptr [rcx ], r9d
|
||||||
mov word ptr [rcx + 04h], r10w
|
mov word ptr [rcx + 04h], r10w
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy7:
|
@Copy7:
|
||||||
mov r9d , dword ptr [rdx ]
|
mov r9d , dword ptr [rdx ]
|
||||||
mov r10w, word ptr [rdx + 04h]
|
mov r10w, word ptr [rdx + 04h]
|
||||||
mov al , byte ptr [rdx + 06h]
|
mov r11b, byte ptr [rdx + 06h]
|
||||||
mov dword ptr [rcx ], r9d
|
mov dword ptr [rcx ], r9d
|
||||||
mov word ptr [rcx + 04h], r10w
|
mov word ptr [rcx + 04h], r10w
|
||||||
mov byte ptr [rcx + 06h], al
|
mov byte ptr [rcx + 06h], r11b
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy8:
|
@Copy8:
|
||||||
mov r8, qword ptr [rdx]
|
mov r8, qword ptr [rdx]
|
||||||
mov qword ptr [rcx], r8
|
mov qword ptr [rcx], r8
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy9:
|
@Copy9:
|
||||||
mov r8, qword ptr [rdx ]
|
mov r8 , qword ptr [rdx ]
|
||||||
mov al, byte ptr [rdx + 08h]
|
mov r11b, byte ptr [rdx + 08h]
|
||||||
mov qword ptr [rcx ], r8
|
mov qword ptr [rcx ], r8
|
||||||
mov byte ptr [rcx + 08h], al
|
mov byte ptr [rcx + 08h], r11b
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy10:
|
@Copy10:
|
||||||
mov r8 , qword ptr [rdx ]
|
mov r8 , qword ptr [rdx ]
|
||||||
mov r10w, word ptr [rdx + 08h]
|
mov r10w, word ptr [rdx + 08h]
|
||||||
mov qword ptr [rcx ], r8
|
mov qword ptr [rcx ], r8
|
||||||
mov word ptr [rcx + 08h], r10w
|
mov word ptr [rcx + 08h], r10w
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy11:
|
@Copy11:
|
||||||
mov r8 , qword ptr [rdx ]
|
mov r8 , qword ptr [rdx ]
|
||||||
mov r10w, word ptr [rdx + 08h]
|
mov r10w, word ptr [rdx + 08h]
|
||||||
mov al , byte ptr [rdx + 0Ah]
|
mov r11b, byte ptr [rdx + 0Ah]
|
||||||
mov qword ptr [rcx ], r8
|
mov qword ptr [rcx ], r8
|
||||||
mov word ptr [rcx + 08h], r10w
|
mov word ptr [rcx + 08h], r10w
|
||||||
mov byte ptr [rcx + 0Ah], al
|
mov byte ptr [rcx + 0Ah], r11b
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy12:
|
@Copy12:
|
||||||
mov r8 , qword ptr [rdx ]
|
mov r8 , qword ptr [rdx ]
|
||||||
mov r9d, dword ptr [rdx + 08h]
|
mov r9d, dword ptr [rdx + 08h]
|
||||||
mov qword ptr [rcx ], r8
|
mov qword ptr [rcx ], r8
|
||||||
mov dword ptr [rcx + 08h], r9d
|
mov dword ptr [rcx + 08h], r9d
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy13:
|
@Copy13:
|
||||||
mov r8 , qword ptr [rdx ]
|
mov r8 , qword ptr [rdx ]
|
||||||
mov r9d, dword ptr [rdx + 08h]
|
mov r9d , dword ptr [rdx + 08h]
|
||||||
mov al , byte ptr [rdx + 0Ch]
|
mov r11b, byte ptr [rdx + 0Ch]
|
||||||
mov qword ptr [rcx ], r8
|
mov qword ptr [rcx ], r8
|
||||||
mov dword ptr [rcx + 08h], r9d
|
mov dword ptr [rcx + 08h], r9d
|
||||||
mov byte ptr [rcx + 0Ch], al
|
mov byte ptr [rcx + 0Ch], r11b
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
@Copy14:
|
@Copy14:
|
||||||
mov r8 , qword ptr [rdx ]
|
mov r8 , qword ptr [rdx ]
|
||||||
@ -219,19 +219,19 @@ memcpySSE proc
|
|||||||
mov qword ptr [rcx ], r8
|
mov qword ptr [rcx ], r8
|
||||||
mov dword ptr [rcx + 08h], r9d
|
mov dword ptr [rcx + 08h], r9d
|
||||||
mov word ptr [rcx + 0Ch], r10w
|
mov word ptr [rcx + 0Ch], r10w
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
; copy 15
|
; copy 15
|
||||||
@Copy15:
|
@Copy15:
|
||||||
mov r8 , qword ptr [rdx + 00h]
|
mov r8 , qword ptr [rdx + 00h]
|
||||||
mov r9d , dword ptr [rdx + 08h]
|
mov r9d , dword ptr [rdx + 08h]
|
||||||
mov r10w, word ptr [rdx + 0Ch]
|
mov r10w, word ptr [rdx + 0Ch]
|
||||||
mov al , byte ptr [rdx + 0Eh]
|
mov r11b, byte ptr [rdx + 0Eh]
|
||||||
mov qword ptr [rcx + 00h], r8
|
mov qword ptr [rcx + 00h], r8
|
||||||
mov dword ptr [rcx + 08h], r9d
|
mov dword ptr [rcx + 08h], r9d
|
||||||
mov word ptr [rcx + 0Ch], r10w
|
mov word ptr [rcx + 0Ch], r10w
|
||||||
mov byte ptr [rcx + 0Eh], al
|
mov byte ptr [rcx + 0Eh], r11b
|
||||||
jmp @RestoreExit
|
ret
|
||||||
|
|
||||||
memcpySSE endp
|
memcpySSE endp
|
||||||
end
|
end
|
@ -26,108 +26,110 @@ Place, Suite 330, Boston, MA 02111-1307 USA
|
|||||||
|
|
||||||
#include "debug.h"
|
#include "debug.h"
|
||||||
|
|
||||||
static inline void memcpySSE(void * dst, const void * src, size_t length)
|
#if defined(NATIVE_MEMCPY)
|
||||||
{
|
#define memcpySSE memcpy
|
||||||
// check if we can't perform an aligned copy
|
#elif defined(_MSC_VER)
|
||||||
if (((uintptr_t)src & 0xF) != ((uintptr_t)dst & 0xF))
|
extern "C" void * memcpySSE(void *dst, const void * src, size_t length);
|
||||||
|
#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
|
||||||
|
inline static void * memcpySSE(void *dst, const void * src, size_t length)
|
||||||
{
|
{
|
||||||
|
if (length == 0 || dst == src)
|
||||||
|
return;
|
||||||
|
|
||||||
static bool unalignedDstWarn = false;
|
// copies under 1MB are faster with the inlined memcpy
|
||||||
if (!unalignedDstWarn)
|
// tell the dev to use that instead
|
||||||
|
if (length < 1048576)
|
||||||
{
|
{
|
||||||
DEBUG_WARN("Memcpy64 unable to perform aligned copy, performance will suffer");
|
static bool smallBufferWarn = false;
|
||||||
unalignedDstWarn = true;
|
if (!smallBufferWarn)
|
||||||
|
{
|
||||||
|
DEBUG_WARN("Do not use memcpySSE for copies under 1MB in size!");
|
||||||
|
smallBufferWarn = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// fallback to system memcpy
|
|
||||||
memcpy(dst, src, length);
|
memcpy(dst, src, length);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if the source needs alignment
|
const void * end = dst + (length & ~0x7F);
|
||||||
{
|
const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
|
||||||
uint8_t * _src = (uint8_t *)src;
|
|
||||||
unsigned int count = (16 - ((uintptr_t)src & 0xF)) & 0xF;
|
|
||||||
|
|
||||||
static bool unalignedSrcWarn = false;
|
__asm__ __volatile__ (
|
||||||
if (count > 0)
|
"cmp %[dst],%[end] \n\t"
|
||||||
{
|
"je Remain_%= \n\t"
|
||||||
if (!unalignedSrcWarn)
|
|
||||||
{
|
// perform SIMD block copy
|
||||||
DEBUG_WARN("Memcpy64 unaligned source, performance will suffer");
|
"loop_%=: \n\t"
|
||||||
unalignedSrcWarn = true;
|
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
|
||||||
|
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
|
||||||
|
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
|
||||||
|
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
|
||||||
|
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
|
||||||
|
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
|
||||||
|
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
|
||||||
|
"vmovaps 0x70(%[src]),%%xmm7 \n\t"
|
||||||
|
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
|
||||||
|
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
|
||||||
|
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
|
||||||
|
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
|
||||||
|
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
|
||||||
|
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
|
||||||
|
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
|
||||||
|
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
|
||||||
|
"add $0x80,%[dst] \n\t"
|
||||||
|
"add $0x80,%[src] \n\t"
|
||||||
|
"cmp %[dst],%[end] \n\t"
|
||||||
|
"jne loop_%= \n\t"
|
||||||
|
|
||||||
|
"Remain_%=: \n\t"
|
||||||
|
|
||||||
|
// copy any remaining 16 byte blocks
|
||||||
|
"call GetPC_%=\n\t"
|
||||||
|
"Offset_%=:\n\t"
|
||||||
|
"add $(BlockTable_%= - Offset_%=), %%eax \n\t"
|
||||||
|
"add %[off],%%eax \n\t"
|
||||||
|
"jmp *%%eax \n\t"
|
||||||
|
|
||||||
|
"GetPC_%=:\n\t"
|
||||||
|
"mov (%%esp), %%eax \n\t"
|
||||||
|
"ret \n\t"
|
||||||
|
|
||||||
|
"BlockTable_%=:\n\t"
|
||||||
|
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
|
||||||
|
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
|
||||||
|
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
|
||||||
|
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
|
||||||
|
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
|
||||||
|
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
|
||||||
|
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
|
||||||
|
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
|
||||||
|
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
|
||||||
|
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
|
||||||
|
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
|
||||||
|
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
|
||||||
|
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
|
||||||
|
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
|
||||||
|
"nop\n\t"
|
||||||
|
"nop\n\t"
|
||||||
|
|
||||||
|
: [dst]"+r" (dst),
|
||||||
|
[src]"+r" (src)
|
||||||
|
: [off]"r" (off),
|
||||||
|
[end]"r" (end)
|
||||||
|
: "eax",
|
||||||
|
"xmm0",
|
||||||
|
"xmm1",
|
||||||
|
"xmm2",
|
||||||
|
"xmm3",
|
||||||
|
"xmm4",
|
||||||
|
"xmm5",
|
||||||
|
"xmm6",
|
||||||
|
"xmm7",
|
||||||
|
"memory"
|
||||||
|
);
|
||||||
|
|
||||||
|
//copy any remaining bytes
|
||||||
|
memcpy(dst, src, length & 0xF);
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
uint8_t * _dst = (uint8_t *)dst;
|
#define memcpySSE memcpy
|
||||||
for (unsigned int i = count; i > 0; --i)
|
#endif
|
||||||
*_dst++ = *_src++;
|
|
||||||
src = _src;
|
|
||||||
dst = _dst;
|
|
||||||
length -= count;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
__m128i * _src = (__m128i *)src;
|
|
||||||
__m128i * _dst = (__m128i *)dst;
|
|
||||||
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
|
|
||||||
|
|
||||||
const size_t sselen = length & ~0x7F;
|
|
||||||
const __m128i * _end = (__m128i *)((uintptr_t)src + sselen);
|
|
||||||
for (; _src != _end; _src += 8, _dst += 8)
|
|
||||||
{
|
|
||||||
_mm_prefetch(((char *)(_src + 8 )), _MM_HINT_NTA);
|
|
||||||
_mm_prefetch(((char *)(_src + 9 )), _MM_HINT_NTA);
|
|
||||||
_mm_prefetch(((char *)(_src + 10)), _MM_HINT_NTA);
|
|
||||||
_mm_prefetch(((char *)(_src + 11)), _MM_HINT_NTA);
|
|
||||||
|
|
||||||
v0 = _mm_load_si128(_src + 0);
|
|
||||||
v1 = _mm_load_si128(_src + 1);
|
|
||||||
v2 = _mm_load_si128(_src + 2);
|
|
||||||
v3 = _mm_load_si128(_src + 3);
|
|
||||||
v4 = _mm_load_si128(_src + 4);
|
|
||||||
v5 = _mm_load_si128(_src + 5);
|
|
||||||
v6 = _mm_load_si128(_src + 6);
|
|
||||||
v7 = _mm_load_si128(_src + 7);
|
|
||||||
|
|
||||||
_mm_stream_si128(_dst + 0, v0);
|
|
||||||
_mm_stream_si128(_dst + 1, v1);
|
|
||||||
_mm_stream_si128(_dst + 2, v2);
|
|
||||||
_mm_stream_si128(_dst + 3, v3);
|
|
||||||
_mm_stream_si128(_dst + 4, v4);
|
|
||||||
_mm_stream_si128(_dst + 5, v5);
|
|
||||||
_mm_stream_si128(_dst + 6, v6);
|
|
||||||
_mm_stream_si128(_dst + 7, v7);
|
|
||||||
}
|
|
||||||
|
|
||||||
const size_t remain = length - sselen;
|
|
||||||
switch (remain & ~0xF)
|
|
||||||
{
|
|
||||||
case 112: v0 = _mm_load_si128(_src++);
|
|
||||||
case 96: v1 = _mm_load_si128(_src++);
|
|
||||||
case 80: v2 = _mm_load_si128(_src++);
|
|
||||||
case 64: v3 = _mm_load_si128(_src++);
|
|
||||||
case 48: v4 = _mm_load_si128(_src++);
|
|
||||||
case 32: v5 = _mm_load_si128(_src++);
|
|
||||||
case 16: v6 = _mm_load_si128(_src++);
|
|
||||||
}
|
|
||||||
|
|
||||||
switch (remain & ~0xF)
|
|
||||||
{
|
|
||||||
case 112: _mm_stream_si128(_dst++, v0);
|
|
||||||
case 96: _mm_stream_si128(_dst++, v1);
|
|
||||||
case 80: _mm_stream_si128(_dst++, v2);
|
|
||||||
case 64: _mm_stream_si128(_dst++, v3);
|
|
||||||
case 48: _mm_stream_si128(_dst++, v4);
|
|
||||||
case 32: _mm_stream_si128(_dst++, v5);
|
|
||||||
case 16: _mm_stream_si128(_dst++, v6);
|
|
||||||
}
|
|
||||||
|
|
||||||
// copy any remaining data
|
|
||||||
if (remain & 0xF)
|
|
||||||
{
|
|
||||||
uint8_t * rsrc = (uint8_t *)_src;
|
|
||||||
uint8_t * rdst = (uint8_t *)_dst;
|
|
||||||
for (size_t i = remain & 0xF; i > 0; --i)
|
|
||||||
*rdst++ = *rsrc++;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,135 +0,0 @@
|
|||||||
/*
|
|
||||||
KVMGFX Client - A KVM Client for VGA Passthrough
|
|
||||||
Copyright (C) 2017 Geoffrey McRae <geoff@hostfission.com>
|
|
||||||
https://looking-glass.hostfission.com
|
|
||||||
|
|
||||||
This program is free software; you can redistribute it and/or modify it under
|
|
||||||
the terms of the GNU General Public License as published by the Free Software
|
|
||||||
Foundation; either version 2 of the License, or (at your option) any later
|
|
||||||
version.
|
|
||||||
|
|
||||||
This program is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
||||||
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
|
||||||
PARTICULAR PURPOSE. See the GNU General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU General Public License along with
|
|
||||||
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
|
|
||||||
Place, Suite 330, Boston, MA 02111-1307 USA
|
|
||||||
*/
|
|
||||||
|
|
||||||
#pragma once
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <tmmintrin.h>
|
|
||||||
#include <immintrin.h>
|
|
||||||
|
|
||||||
#include "debug.h"
|
|
||||||
|
|
||||||
#if defined(NATIVE_MEMCPY)
|
|
||||||
#define memcpySSE memcpy
|
|
||||||
#elif defined(_MSC_VER)
|
|
||||||
extern "C" void memcpySSE(void *dst, const void * src, size_t length);
|
|
||||||
#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
|
|
||||||
inline static void memcpySSE(void *dst, const void * src, size_t length)
|
|
||||||
{
|
|
||||||
if (length == 0 || dst == src)
|
|
||||||
return;
|
|
||||||
|
|
||||||
// copies under 1MB are faster with the inlined memcpy
|
|
||||||
// tell the dev to use that instead
|
|
||||||
if (length < 1048576)
|
|
||||||
{
|
|
||||||
static bool smallBufferWarn = false;
|
|
||||||
if (!smallBufferWarn)
|
|
||||||
{
|
|
||||||
DEBUG_WARN("Do not use memcpySSE for copies under 1MB in size!");
|
|
||||||
smallBufferWarn = true;
|
|
||||||
}
|
|
||||||
memcpy(dst, src, length);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const void * end = dst + (length & ~0x7F);
|
|
||||||
const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
|
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
|
||||||
"cmp %[dst],%[end] \n\t"
|
|
||||||
"je Remain_%= \n\t"
|
|
||||||
|
|
||||||
// perform SIMD block copy
|
|
||||||
"loop_%=: \n\t"
|
|
||||||
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
|
|
||||||
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
|
|
||||||
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
|
|
||||||
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
|
|
||||||
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
|
|
||||||
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
|
|
||||||
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
|
|
||||||
"vmovaps 0x70(%[src]),%%xmm7 \n\t"
|
|
||||||
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
|
|
||||||
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
|
|
||||||
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
|
|
||||||
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
|
|
||||||
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
|
|
||||||
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
|
|
||||||
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
|
|
||||||
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
|
|
||||||
"add $0x80,%[dst] \n\t"
|
|
||||||
"add $0x80,%[src] \n\t"
|
|
||||||
"cmp %[dst],%[end] \n\t"
|
|
||||||
"jne loop_%= \n\t"
|
|
||||||
|
|
||||||
"Remain_%=: \n\t"
|
|
||||||
|
|
||||||
// copy any remaining 16 byte blocks
|
|
||||||
"call GetPC_%=\n\t"
|
|
||||||
"Offset_%=:\n\t"
|
|
||||||
"add $(BlockTable_%= - Offset_%=), %%eax \n\t"
|
|
||||||
"add %[off],%%eax \n\t"
|
|
||||||
"jmp *%%eax \n\t"
|
|
||||||
|
|
||||||
"GetPC_%=:\n\t"
|
|
||||||
"mov (%%esp), %%eax \n\t"
|
|
||||||
"ret \n\t"
|
|
||||||
|
|
||||||
"BlockTable_%=:\n\t"
|
|
||||||
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
|
|
||||||
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
|
|
||||||
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
|
|
||||||
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
|
|
||||||
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
|
|
||||||
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
|
|
||||||
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
|
|
||||||
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
|
|
||||||
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
|
|
||||||
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
|
|
||||||
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
|
|
||||||
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
|
|
||||||
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
|
|
||||||
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
|
|
||||||
"nop\n\t"
|
|
||||||
"nop\n\t"
|
|
||||||
|
|
||||||
: [dst]"+r" (dst),
|
|
||||||
[src]"+r" (src)
|
|
||||||
: [off]"r" (off),
|
|
||||||
[end]"r" (end)
|
|
||||||
: "eax",
|
|
||||||
"xmm0",
|
|
||||||
"xmm1",
|
|
||||||
"xmm2",
|
|
||||||
"xmm3",
|
|
||||||
"xmm4",
|
|
||||||
"xmm5",
|
|
||||||
"xmm6",
|
|
||||||
"xmm7",
|
|
||||||
"memory"
|
|
||||||
);
|
|
||||||
|
|
||||||
//copy any remaining bytes
|
|
||||||
memcpy(dst, src, length & 0xF);
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
#define memcpySSE memcpy
|
|
||||||
#endif
|
|
@ -761,7 +761,9 @@ GrabStatus Capture::DXGI::GrabFrameRaw(FrameInfo & frame)
|
|||||||
|
|
||||||
while(true)
|
while(true)
|
||||||
{
|
{
|
||||||
|
TRACE_START("GrabFrame");
|
||||||
result = GrabFrameTexture(frame, src, timeout);
|
result = GrabFrameTexture(frame, src, timeout);
|
||||||
|
TRACE_END;
|
||||||
if (result != GRAB_STATUS_OK)
|
if (result != GRAB_STATUS_OK)
|
||||||
return result;
|
return result;
|
||||||
|
|
||||||
@ -773,7 +775,7 @@ GrabStatus Capture::DXGI::GrabFrameRaw(FrameInfo & frame)
|
|||||||
|
|
||||||
// send the last frame again if we timeout to prevent the client stalling on restart
|
// send the last frame again if we timeout to prevent the client stalling on restart
|
||||||
frame.pitch = m_mapping.RowPitch;
|
frame.pitch = m_mapping.RowPitch;
|
||||||
frame.stride = m_mapping.RowPitch / 4;
|
frame.stride = m_mapping.RowPitch >> 2;
|
||||||
|
|
||||||
unsigned int size = m_height * m_mapping.RowPitch;
|
unsigned int size = m_height * m_mapping.RowPitch;
|
||||||
m_memcpy.Copy(frame.buffer, m_mapping.pData, LG_MIN(size, frame.bufferSize));
|
m_memcpy.Copy(frame.buffer, m_mapping.pData, LG_MIN(size, frame.bufferSize));
|
||||||
|
@ -92,6 +92,7 @@
|
|||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
|
||||||
<ImportGroup Label="ExtensionSettings">
|
<ImportGroup Label="ExtensionSettings">
|
||||||
|
<Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
<ImportGroup Label="Shared">
|
<ImportGroup Label="Shared">
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
@ -351,7 +352,11 @@
|
|||||||
<ClInclude Include="TraceUtil.h" />
|
<ClInclude Include="TraceUtil.h" />
|
||||||
<ClInclude Include="Util.h" />
|
<ClInclude Include="Util.h" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<MASM Include="..\common\memcpySSE.asm" />
|
||||||
|
</ItemGroup>
|
||||||
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
|
||||||
<ImportGroup Label="ExtensionTargets">
|
<ImportGroup Label="ExtensionTargets">
|
||||||
|
<Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
|
||||||
</ImportGroup>
|
</ImportGroup>
|
||||||
</Project>
|
</Project>
|
@ -81,4 +81,9 @@
|
|||||||
<Filter>Header Files</Filter>
|
<Filter>Header Files</Filter>
|
||||||
</ClInclude>
|
</ClInclude>
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
<ItemGroup>
|
||||||
|
<MASM Include="..\common\memcpySSE.asm">
|
||||||
|
<Filter>Source Files</Filter>
|
||||||
|
</MASM>
|
||||||
|
</ItemGroup>
|
||||||
</Project>
|
</Project>
|
Loading…
Reference in New Issue
Block a user