mirror of
https://github.com/gnif/LookingGlass.git
synced 2024-11-25 06:47:19 +00:00
[common] tune windows memcpySSE asm implementation
This commit is contained in:
parent
e9d77e6c52
commit
e8b1b8fbdf
@ -1,44 +1,27 @@
|
||||
.code
|
||||
|
||||
memcpySSE proc
|
||||
; dst = rcx
|
||||
; src = rdx
|
||||
; len = r8
|
||||
|
||||
test r8 , r8
|
||||
jne OK
|
||||
ret
|
||||
test r8, r8
|
||||
jz @Exit
|
||||
cmp rcx, rdx
|
||||
je @Exit
|
||||
|
||||
OK:
|
||||
; void * end = dst + (length & ~0xFF);
|
||||
sub rsp, 8 + 2*16 + 4*8
|
||||
movdqa oword ptr [rsp + 4*8 + 00 ], xmm6
|
||||
movdqa oword ptr [rsp + 4*8 + 16 ], xmm7
|
||||
|
||||
; void * end = dst + (length & ~0x7F);  NOTE(review): ~0x7F is -080h; the -07Fh mask applied below also keeps bit 0 - confirm
|
||||
; end = r10
|
||||
mov r9 , r8
|
||||
and r9 , -0100h
|
||||
and r9 , -07Fh
|
||||
jz @RemainingBlocks
|
||||
mov r10, rcx
|
||||
add r10, r9
|
||||
|
||||
; size_t rem = (length & 0xFF) >> 4;
|
||||
; rem = r11
|
||||
mov r11, r8
|
||||
and r11, 0FFh
|
||||
shr r11, 4
|
||||
|
||||
sub rsp, 8 + 10*16 + 4*8
|
||||
movdqa oword ptr [rsp + 4*8 + 00 ], xmm6
|
||||
movdqa oword ptr [rsp + 4*8 + 16 ], xmm7
|
||||
movdqa oword ptr [rsp + 4*8 + 32 ], xmm8
|
||||
movdqa oword ptr [rsp + 4*8 + 48 ], xmm9
|
||||
movdqa oword ptr [rsp + 4*8 + 64 ], xmm10
|
||||
movdqa oword ptr [rsp + 4*8 + 80 ], xmm11
|
||||
movdqa oword ptr [rsp + 4*8 + 96 ], xmm12
|
||||
movdqa oword ptr [rsp + 4*8 + 112], xmm13
|
||||
movdqa oword ptr [rsp + 4*8 + 128], xmm14
|
||||
movdqa oword ptr [rsp + 4*8 + 144], xmm15
|
||||
|
||||
cmp rcx, r10
|
||||
je RemainingBlocks
|
||||
|
||||
FullLoop:
|
||||
@FullLoop:
|
||||
vmovaps xmm0 , xmmword ptr [rdx + 000h]
|
||||
vmovaps xmm1 , xmmword ptr [rdx + 010h]
|
||||
vmovaps xmm2 , xmmword ptr [rdx + 020h]
|
||||
@ -47,14 +30,6 @@ memcpySSE proc
|
||||
vmovaps xmm5 , xmmword ptr [rdx + 050h]
|
||||
vmovaps xmm6 , xmmword ptr [rdx + 060h]
|
||||
vmovaps xmm7 , xmmword ptr [rdx + 070h]
|
||||
vmovaps xmm8 , xmmword ptr [rdx + 080h]
|
||||
vmovaps xmm9 , xmmword ptr [rdx + 090h]
|
||||
vmovaps xmm10, xmmword ptr [rdx + 0A0h]
|
||||
vmovaps xmm11, xmmword ptr [rdx + 0B0h]
|
||||
vmovaps xmm12, xmmword ptr [rdx + 0C0h]
|
||||
vmovaps xmm13, xmmword ptr [rdx + 0D0h]
|
||||
vmovaps xmm14, xmmword ptr [rdx + 0E0h]
|
||||
vmovaps xmm15, xmmword ptr [rdx + 0F0h]
|
||||
vmovntdq xmmword ptr [rcx + 000h], xmm0
|
||||
vmovntdq xmmword ptr [rcx + 010h], xmm1
|
||||
vmovntdq xmmword ptr [rcx + 020h], xmm2
|
||||
@ -63,140 +38,191 @@ memcpySSE proc
|
||||
vmovntdq xmmword ptr [rcx + 050h], xmm5
|
||||
vmovntdq xmmword ptr [rcx + 060h], xmm6
|
||||
vmovntdq xmmword ptr [rcx + 070h], xmm7
|
||||
vmovntdq xmmword ptr [rcx + 080h], xmm8
|
||||
vmovntdq xmmword ptr [rcx + 090h], xmm9
|
||||
vmovntdq xmmword ptr [rcx + 0A0h], xmm10
|
||||
vmovntdq xmmword ptr [rcx + 0B0h], xmm11
|
||||
vmovntdq xmmword ptr [rcx + 0C0h], xmm12
|
||||
vmovntdq xmmword ptr [rcx + 0D0h], xmm13
|
||||
vmovntdq xmmword ptr [rcx + 0E0h], xmm14
|
||||
vmovntdq xmmword ptr [rcx + 0F0h], xmm15
|
||||
add rdx, 0100h
|
||||
add rcx, 0100h
|
||||
add rdx, 080h
|
||||
add rcx, 080h
|
||||
cmp rcx, r10
|
||||
jne FullLoop
|
||||
jne @FullLoop
|
||||
|
||||
RemainingBlocks:
|
||||
lea r9 , JumpTable
|
||||
mov r10, 15
|
||||
@RemainingBlocks:
|
||||
; size_t rem = (length & 0x7F) >> 4;
|
||||
; rem = r11
|
||||
mov r11, r8
|
||||
and r11, 07Fh
|
||||
jz @RestoreExit
|
||||
shr r11, 4
|
||||
|
||||
mov r10, 7
|
||||
sub r10, r11
|
||||
imul r10, 5
|
||||
imul r10, 10
|
||||
lea r9 , @FinalBlocks
|
||||
add r9 , r10
|
||||
jmp r9
|
||||
|
||||
JumpTable:
|
||||
jmp Block15
|
||||
jmp Block14
|
||||
jmp Block13
|
||||
jmp Block12
|
||||
jmp Block11
|
||||
jmp Block10
|
||||
jmp Block9
|
||||
jmp Block8
|
||||
jmp Block7
|
||||
jmp Block6
|
||||
jmp Block5
|
||||
jmp Block4
|
||||
jmp Block3
|
||||
jmp Block2
|
||||
jmp Block1
|
||||
jmp Block0
|
||||
|
||||
; ensure we generate near jumps
|
||||
padding1 db 127 dup(090h)
|
||||
|
||||
Block15:
|
||||
vmovaps xmm14, xmmword ptr [rdx + 0E0h]
|
||||
vmovntdq xmmword ptr [rcx + 0E0h], xmm14
|
||||
Block14:
|
||||
vmovaps xmm13, xmmword ptr [rdx + 0D0h]
|
||||
vmovntdq xmmword ptr [rcx + 0D0h], xmm13
|
||||
Block13:
|
||||
vmovaps xmm12, xmmword ptr [rdx + 0C0h]
|
||||
vmovntdq xmmword ptr [rcx + 0C0h], xmm12
|
||||
Block12:
|
||||
vmovaps xmm11, xmmword ptr [rdx + 0B0h]
|
||||
vmovntdq xmmword ptr [rcx + 0B0h], xmm11
|
||||
Block11:
|
||||
vmovaps xmm10, xmmword ptr [rdx + 0A0h]
|
||||
vmovntdq xmmword ptr [rcx + 0A0h], xmm10
|
||||
Block10:
|
||||
vmovaps xmm9 , xmmword ptr [rdx + 090h]
|
||||
vmovntdq xmmword ptr [rcx + 090h], xmm9
|
||||
Block9:
|
||||
vmovaps xmm8 , xmmword ptr [rdx + 080h]
|
||||
vmovntdq xmmword ptr [rcx + 080h], xmm8
|
||||
Block8:
|
||||
vmovaps xmm7 , xmmword ptr [rdx + 070h]
|
||||
vmovntdq xmmword ptr [rcx + 070h], xmm7
|
||||
Block7:
|
||||
vmovaps xmm6 , xmmword ptr [rdx + 060h]
|
||||
vmovntdq xmmword ptr [rcx + 060h], xmm6
|
||||
Block6:
|
||||
vmovaps xmm5 , xmmword ptr [rdx + 050h]
|
||||
vmovntdq xmmword ptr [rcx + 050h], xmm5
|
||||
Block5:
|
||||
vmovaps xmm4 , xmmword ptr [rdx + 040h]
|
||||
vmovntdq xmmword ptr [rcx + 040h], xmm4
|
||||
Block4:
|
||||
vmovaps xmm3 , xmmword ptr [rdx + 030h]
|
||||
vmovntdq xmmword ptr [rcx + 030h], xmm3
|
||||
Block3:
|
||||
vmovaps xmm2 , xmmword ptr [rdx + 020h]
|
||||
vmovntdq xmmword ptr [rcx + 020h], xmm2
|
||||
Block2:
|
||||
vmovaps xmm1 , xmmword ptr [rdx + 010h]
|
||||
vmovntdq xmmword ptr [rcx + 010h], xmm1
|
||||
Block1:
|
||||
vmovaps xmm0 , xmmword ptr [rdx + 000h]
|
||||
vmovntdq xmmword ptr [rcx + 000h], xmm0
|
||||
|
||||
imul r11, 16
|
||||
add rdx, r11
|
||||
add rcx, r11
|
||||
|
||||
Block0:
|
||||
@RestoreExit:
|
||||
movdqa xmm6 , oword ptr [rsp + 4*8 + 00 ]
|
||||
movdqa xmm7 , oword ptr [rsp + 4*8 + 16 ]
|
||||
movdqa xmm8 , oword ptr [rsp + 4*8 + 32 ]
|
||||
movdqa xmm9 , oword ptr [rsp + 4*8 + 48 ]
|
||||
movdqa xmm10, oword ptr [rsp + 4*8 + 64 ]
|
||||
movdqa xmm11, oword ptr [rsp + 4*8 + 80 ]
|
||||
movdqa xmm12, oword ptr [rsp + 4*8 + 96 ]
|
||||
movdqa xmm13, oword ptr [rsp + 4*8 + 112]
|
||||
movdqa xmm14, oword ptr [rsp + 4*8 + 128]
|
||||
movdqa xmm15, oword ptr [rsp + 4*8 + 144]
|
||||
add rsp, 8 + 10*16 + 4*8
|
||||
add rsp, 8 + 2*16 + 4*8
|
||||
|
||||
and r8, 0Fh
|
||||
imul r8, 5
|
||||
lea r9, CopyTable
|
||||
add r9, r8
|
||||
jmp r9
|
||||
|
||||
CopyTable:
|
||||
@Exit:
|
||||
sfence
|
||||
ret
|
||||
nop
|
||||
nop
|
||||
|
||||
@FinalBlocks:
|
||||
vmovaps xmm6 , xmmword ptr [rdx + 060h]
|
||||
vmovntdq xmmword ptr [rcx + 060h], xmm6
|
||||
vmovaps xmm5 , xmmword ptr [rdx + 050h]
|
||||
vmovntdq xmmword ptr [rcx + 050h], xmm5
|
||||
vmovaps xmm4 , xmmword ptr [rdx + 040h]
|
||||
vmovntdq xmmword ptr [rcx + 040h], xmm4
|
||||
vmovaps xmm3 , xmmword ptr [rdx + 030h]
|
||||
vmovntdq xmmword ptr [rcx + 030h], xmm3
|
||||
vmovaps xmm2 , xmmword ptr [rdx + 020h]
|
||||
vmovntdq xmmword ptr [rcx + 020h], xmm2
|
||||
vmovaps xmm1 , xmmword ptr [rdx + 010h]
|
||||
vmovntdq xmmword ptr [rcx + 010h], xmm1
|
||||
vmovaps xmm0 , xmmword ptr [rdx + 000h]
|
||||
vmovntdq xmmword ptr [rcx + 000h], xmm0
|
||||
nop
|
||||
nop
|
||||
|
||||
jmp Copy1
|
||||
jmp Copy2
|
||||
jmp Copy3
|
||||
jmp Copy4
|
||||
jmp Copy5
|
||||
jmp Copy6
|
||||
jmp Copy7
|
||||
jmp Copy8
|
||||
jmp Copy9
|
||||
jmp Copy10
|
||||
jmp Copy11
|
||||
jmp Copy12
|
||||
jmp Copy13
|
||||
jmp Copy14
|
||||
imul r9, 16
|
||||
add rdx, r9
|
||||
add rcx, r9
|
||||
|
||||
@EndBlocks:
|
||||
and r8, 0Fh
|
||||
test r8, r8
|
||||
je @RestoreExit
|
||||
|
||||
cmp r8, 2
|
||||
je @Copy2
|
||||
cmp r8, 3
|
||||
je @Copy3
|
||||
cmp r8, 4
|
||||
je @Copy4
|
||||
cmp r8, 5
|
||||
je @Copy5
|
||||
cmp r8, 6
|
||||
je @Copy6
|
||||
cmp r8, 7
|
||||
je @Copy7
|
||||
cmp r8, 8
|
||||
je @Copy8
|
||||
cmp r8, 9
|
||||
je @Copy9
|
||||
cmp r8, 10
|
||||
je @Copy10
|
||||
cmp r8, 11
|
||||
je @Copy11
|
||||
cmp r8, 12
|
||||
je @Copy12
|
||||
cmp r8, 13
|
||||
je @Copy13
|
||||
cmp r8, 14
|
||||
je @Copy14
|
||||
cmp r8, 15
|
||||
je @Copy15
|
||||
|
||||
; fall through - 1 byte
|
||||
mov al, byte ptr [rdx]
|
||||
mov byte ptr [rcx], al
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy2:
|
||||
mov r10w, word ptr [rdx]
|
||||
mov word ptr [rcx], r10w
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy3:
|
||||
mov r10w, word ptr [rdx]
|
||||
mov word ptr [rcx], r10w
|
||||
mov al, byte ptr [rdx + 02h]
|
||||
mov byte ptr [rcx + 02h], al
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy4:
|
||||
mov r9d, dword ptr [rdx]
|
||||
mov dword ptr [rcx], r9d
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy5:
|
||||
mov r9d, dword ptr [rdx ]
|
||||
mov al , byte ptr [rdx + 04h]
|
||||
mov dword ptr [rcx ], r9d
|
||||
mov byte ptr [rcx + 04h], al
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy6:
|
||||
mov r9d , dword ptr [rdx ]
|
||||
mov r10w, word ptr [rdx + 04h]
|
||||
mov dword ptr [rcx ], r9d
|
||||
mov word ptr [rcx + 04h], r10w
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy7:
|
||||
mov r9d , dword ptr [rdx ]
|
||||
mov r10w, word ptr [rdx + 04h]
|
||||
mov al , byte ptr [rdx + 06h]
|
||||
mov dword ptr [rcx ], r9d
|
||||
mov word ptr [rcx + 04h], r10w
|
||||
mov byte ptr [rcx + 06h], al
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy8:
|
||||
mov r8, qword ptr [rdx]
|
||||
mov qword ptr [rcx], r8
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy9:
|
||||
mov r8, qword ptr [rdx ]
|
||||
mov al, byte ptr [rdx + 08h]
|
||||
mov qword ptr [rcx ], r8
|
||||
mov byte ptr [rcx + 08h], al
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy10:
|
||||
mov r8 , qword ptr [rdx ]
|
||||
mov r10w, word ptr [rdx + 08h]
|
||||
mov qword ptr [rcx ], r8
|
||||
mov word ptr [rcx + 08h], r10w
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy11:
|
||||
mov r8 , qword ptr [rdx ]
|
||||
mov r10w, word ptr [rdx + 08h]
|
||||
mov al , byte ptr [rdx + 0Ah]
|
||||
mov qword ptr [rcx ], r8
|
||||
mov word ptr [rcx + 08h], r10w
|
||||
mov byte ptr [rcx + 0Ah], al
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy12:
|
||||
mov r8 , qword ptr [rdx ]
|
||||
mov r9d, dword ptr [rdx + 08h]
|
||||
mov qword ptr [rcx ], r8
|
||||
mov dword ptr [rcx + 08h], r9d
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy13:
|
||||
mov r8 , qword ptr [rdx ]
|
||||
mov r9d, dword ptr [rdx + 08h]
|
||||
mov al , byte ptr [rdx + 0Ch]
|
||||
mov qword ptr [rcx ], r8
|
||||
mov dword ptr [rcx + 08h], r9d
|
||||
mov byte ptr [rcx + 0Ch], al
|
||||
jmp @RestoreExit
|
||||
|
||||
@Copy14:
|
||||
mov r8 , qword ptr [rdx ]
|
||||
mov r9d , dword ptr [rdx + 08h]
|
||||
mov r10w, word ptr [rdx + 0Ch]
|
||||
mov qword ptr [rcx ], r8
|
||||
mov dword ptr [rcx + 08h], r9d
|
||||
mov word ptr [rcx + 0Ch], r10w
|
||||
jmp @RestoreExit
|
||||
|
||||
; copy 15
|
||||
@Copy15:
|
||||
mov r8 , qword ptr [rdx + 00h]
|
||||
mov r9d , dword ptr [rdx + 08h]
|
||||
mov r10w, word ptr [rdx + 0Ch]
|
||||
@ -205,108 +231,7 @@ memcpySSE proc
|
||||
mov dword ptr [rcx + 08h], r9d
|
||||
mov word ptr [rcx + 0Ch], r10w
|
||||
mov byte ptr [rcx + 0Eh], al
|
||||
ret
|
||||
|
||||
; ensure we generate near jumps
|
||||
padding2 db 127 dup(090h)
|
||||
|
||||
Copy1:
|
||||
mov al, byte ptr [rdx]
|
||||
mov byte ptr [rcx], al
|
||||
ret
|
||||
|
||||
Copy2:
|
||||
mov r10w, word ptr [rdx]
|
||||
mov word ptr [rcx], r10w
|
||||
ret
|
||||
|
||||
Copy3:
|
||||
mov r10w, word ptr [rdx]
|
||||
mov word ptr [rcx], r10w
|
||||
mov al, byte ptr [rdx + 02h]
|
||||
mov byte ptr [rcx + 02h], al
|
||||
ret
|
||||
|
||||
Copy4:
|
||||
mov r9d , dword ptr [rdx]
|
||||
mov dword ptr [rcx], r9d
|
||||
ret
|
||||
|
||||
Copy5:
|
||||
mov r9d , dword ptr [rdx]
|
||||
mov dword ptr [rcx], r9d
|
||||
mov al, byte ptr [rdx + 04h]
|
||||
mov byte ptr [rcx + 04h], al
|
||||
ret
|
||||
|
||||
Copy6:
|
||||
mov r9d , dword ptr [rdx]
|
||||
mov dword ptr [rcx], r9d
|
||||
mov r10w, word ptr [rdx + 04h]
|
||||
mov word ptr [rcx + 04h], r10w
|
||||
ret
|
||||
|
||||
Copy7:
|
||||
mov r9d , dword ptr [rdx]
|
||||
mov dword ptr [rcx], r9d
|
||||
mov r10w, word ptr [rdx + 04h]
|
||||
mov word ptr [rcx + 04h], r10w
|
||||
mov al, byte ptr [rdx + 06h]
|
||||
mov byte ptr [rcx + 06h], al
|
||||
ret
|
||||
|
||||
Copy8:
|
||||
mov r8, qword ptr [rdx]
|
||||
mov qword ptr [rcx], r8
|
||||
ret
|
||||
|
||||
Copy9:
|
||||
mov r8, qword ptr [rdx]
|
||||
mov qword ptr [rcx], r8
|
||||
mov al, byte ptr [rdx + 08h]
|
||||
mov byte ptr [rcx + 08h], al
|
||||
ret
|
||||
|
||||
Copy10:
|
||||
mov r8, qword ptr [rdx]
|
||||
mov qword ptr [rcx], r8
|
||||
mov r10w, word ptr [rdx + 08h]
|
||||
mov word ptr [rcx + 08h], r10w
|
||||
ret
|
||||
|
||||
Copy11:
|
||||
mov r8, qword ptr [rdx]
|
||||
mov qword ptr [rcx], r8
|
||||
mov r10w, word ptr [rdx + 08h]
|
||||
mov word ptr [rcx + 08h], r10w
|
||||
mov al, byte ptr [rdx + 0Ah]
|
||||
mov byte ptr [rcx + 0Ah], al
|
||||
ret
|
||||
|
||||
Copy12:
|
||||
mov r8, qword ptr [rdx]
|
||||
mov qword ptr [rcx], r8
|
||||
mov r9d , dword ptr [rdx + 08h]
|
||||
mov dword ptr [rcx + 08h], r9d
|
||||
ret
|
||||
|
||||
Copy13:
|
||||
mov r8, qword ptr [rdx]
|
||||
mov qword ptr [rcx], r8
|
||||
mov r9d , dword ptr [rdx + 08h]
|
||||
mov dword ptr [rcx + 08h], r9d
|
||||
mov al, byte ptr [rdx + 0Ch]
|
||||
mov byte ptr [rcx + 0Ch], al
|
||||
ret
|
||||
|
||||
Copy14:
|
||||
mov r8 , qword ptr [rdx ]
|
||||
mov r9d , dword ptr [rdx + 08h]
|
||||
mov r10w, word ptr [rdx + 0Ch]
|
||||
mov qword ptr [rcx ], r8
|
||||
mov dword ptr [rcx + 08h], r9d
|
||||
mov word ptr [rcx + 0Ch], r10w
|
||||
ret
|
||||
jmp @RestoreExit
|
||||
|
||||
memcpySSE endp
|
||||
end
|
@ -28,7 +28,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
#if defined(NATIVE_MEMCPY)
|
||||
#define memcpySSE memcpy
|
||||
#elif defined(MSVC)
|
||||
#elif defined(_MSC_VER)
|
||||
extern "C" void memcpySSE(void *dst, const void * src, size_t length);
|
||||
#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
|
||||
inline static void memcpySSE(void *dst, const void * src, size_t length)
|
||||
|
Loading…
Reference in New Issue
Block a user