[common] tune windows memcpySSE asm implementation

This commit is contained in:
Geoffrey McRae 2018-05-19 21:40:13 +10:00
parent e9d77e6c52
commit e8b1b8fbdf
2 changed files with 184 additions and 259 deletions

View File

@ -1,44 +1,27 @@
.code
memcpySSE proc
; dst = rcx
; src = rdx
; len = r8
test r8, r8
jne OK
ret
jz @Exit
cmp rcx, rdx
je @Exit
OK:
; void * end = dst + (length & ~0x100);
sub rsp, 8 + 2*16 + 4*8
movdqa oword ptr [rsp + 4*8 + 00 ], xmm6
movdqa oword ptr [rsp + 4*8 + 16 ], xmm7
; void * end = dst + (length & ~0x7F);
; NOTE(review): ~0x7F is -080h; the mask -07Fh is ~0x7E and keeps bit 0 of length - confirm
; end = r10
mov r9 , r8
and r9 , -0100h
and r9 , -07Fh
jz @RemainingBlocks
mov r10, rcx
add r10, r9
; size_t rem = (length & 0xFF) >> 4);
; rem = r11
mov r11, r8
and r11, 0FFh
shr r11, 4
sub rsp, 8 + 10*16 + 4*8
movdqa oword ptr [rsp + 4*8 + 00 ], xmm6
movdqa oword ptr [rsp + 4*8 + 16 ], xmm7
movdqa oword ptr [rsp + 4*8 + 32 ], xmm8
movdqa oword ptr [rsp + 4*8 + 48 ], xmm9
movdqa oword ptr [rsp + 4*8 + 64 ], xmm10
movdqa oword ptr [rsp + 4*8 + 80 ], xmm11
movdqa oword ptr [rsp + 4*8 + 96 ], xmm12
movdqa oword ptr [rsp + 4*8 + 112], xmm13
movdqa oword ptr [rsp + 4*8 + 128], xmm14
movdqa oword ptr [rsp + 4*8 + 144], xmm15
cmp rcx, r10
je RemainingBlocks
FullLoop:
@FullLoop:
vmovaps xmm0 , xmmword ptr [rdx + 000h]
vmovaps xmm1 , xmmword ptr [rdx + 010h]
vmovaps xmm2 , xmmword ptr [rdx + 020h]
@ -47,14 +30,6 @@ memcpySSE proc
vmovaps xmm5 , xmmword ptr [rdx + 050h]
vmovaps xmm6 , xmmword ptr [rdx + 060h]
vmovaps xmm7 , xmmword ptr [rdx + 070h]
vmovaps xmm8 , xmmword ptr [rdx + 080h]
vmovaps xmm9 , xmmword ptr [rdx + 090h]
vmovaps xmm10, xmmword ptr [rdx + 0A0h]
vmovaps xmm11, xmmword ptr [rdx + 0B0h]
vmovaps xmm12, xmmword ptr [rdx + 0C0h]
vmovaps xmm13, xmmword ptr [rdx + 0D0h]
vmovaps xmm14, xmmword ptr [rdx + 0E0h]
vmovaps xmm15, xmmword ptr [rdx + 0F0h]
vmovntdq xmmword ptr [rcx + 000h], xmm0
vmovntdq xmmword ptr [rcx + 010h], xmm1
vmovntdq xmmword ptr [rcx + 020h], xmm2
@ -63,140 +38,191 @@ memcpySSE proc
vmovntdq xmmword ptr [rcx + 050h], xmm5
vmovntdq xmmword ptr [rcx + 060h], xmm6
vmovntdq xmmword ptr [rcx + 070h], xmm7
vmovntdq xmmword ptr [rcx + 080h], xmm8
vmovntdq xmmword ptr [rcx + 090h], xmm9
vmovntdq xmmword ptr [rcx + 0A0h], xmm10
vmovntdq xmmword ptr [rcx + 0B0h], xmm11
vmovntdq xmmword ptr [rcx + 0C0h], xmm12
vmovntdq xmmword ptr [rcx + 0D0h], xmm13
vmovntdq xmmword ptr [rcx + 0E0h], xmm14
vmovntdq xmmword ptr [rcx + 0F0h], xmm15
add rdx, 0100h
add rcx, 0100h
add rdx, 080h
add rcx, 080h
cmp rcx, r10
jne FullLoop
jne @FullLoop
RemainingBlocks:
lea r9 , JumpTable
mov r10, 15
@RemainingBlocks:
; size_t rem = (length & 0x7F) >> 4;
; rem = r11
mov r11, r8
and r11, 07Fh
jz @RestoreExit
shr r11, 4
mov r10, 7
sub r10, r11
imul r10, 5
imul r10, 10
lea r9 , @FinalBlocks
add r9 , r10
jmp r9
JumpTable:
jmp Block15
jmp Block14
jmp Block13
jmp Block12
jmp Block11
jmp Block10
jmp Block9
jmp Block8
jmp Block7
jmp Block6
jmp Block5
jmp Block4
jmp Block3
jmp Block2
jmp Block1
jmp Block0
; ensure we generate near jumps
padding1 db 127 dup(090h)
Block15:
vmovaps xmm14, xmmword ptr [rdx + 0E0h]
vmovntdq xmmword ptr [rcx + 0E0h], xmm14
Block14:
vmovaps xmm13, xmmword ptr [rdx + 0D0h]
vmovntdq xmmword ptr [rcx + 0D0h], xmm13
Block13:
vmovaps xmm12, xmmword ptr [rdx + 0C0h]
vmovntdq xmmword ptr [rcx + 0C0h], xmm12
Block12:
vmovaps xmm11, xmmword ptr [rdx + 0B0h]
vmovntdq xmmword ptr [rcx + 0B0h], xmm11
Block11:
vmovaps xmm10, xmmword ptr [rdx + 0A0h]
vmovntdq xmmword ptr [rcx + 0A0h], xmm10
Block10:
vmovaps xmm9 , xmmword ptr [rdx + 090h]
vmovntdq xmmword ptr [rcx + 090h], xmm9
Block9:
vmovaps xmm8 , xmmword ptr [rdx + 080h]
vmovntdq xmmword ptr [rcx + 080h], xmm8
Block8:
vmovaps xmm7 , xmmword ptr [rdx + 070h]
vmovntdq xmmword ptr [rcx + 070h], xmm7
Block7:
vmovaps xmm6 , xmmword ptr [rdx + 060h]
vmovntdq xmmword ptr [rcx + 060h], xmm6
Block6:
vmovaps xmm5 , xmmword ptr [rdx + 050h]
vmovntdq xmmword ptr [rcx + 050h], xmm5
Block5:
vmovaps xmm4 , xmmword ptr [rdx + 040h]
vmovntdq xmmword ptr [rcx + 040h], xmm4
Block4:
vmovaps xmm3 , xmmword ptr [rdx + 030h]
vmovntdq xmmword ptr [rcx + 030h], xmm3
Block3:
vmovaps xmm2 , xmmword ptr [rdx + 020h]
vmovntdq xmmword ptr [rcx + 020h], xmm2
Block2:
vmovaps xmm1 , xmmword ptr [rdx + 010h]
vmovntdq xmmword ptr [rcx + 010h], xmm1
Block1:
vmovaps xmm0 , xmmword ptr [rdx + 000h]
vmovntdq xmmword ptr [rcx + 000h], xmm0
imul r11, 16
add rdx, r11
add rcx, r11
Block0:
@RestoreExit:
movdqa xmm6 , oword ptr [rsp + 4*8 + 00 ]
movdqa xmm7 , oword ptr [rsp + 4*8 + 16 ]
movdqa xmm8 , oword ptr [rsp + 4*8 + 32 ]
movdqa xmm9 , oword ptr [rsp + 4*8 + 48 ]
movdqa xmm10, oword ptr [rsp + 4*8 + 64 ]
movdqa xmm11, oword ptr [rsp + 4*8 + 80 ]
movdqa xmm12, oword ptr [rsp + 4*8 + 96 ]
movdqa xmm13, oword ptr [rsp + 4*8 + 112]
movdqa xmm14, oword ptr [rsp + 4*8 + 128]
movdqa xmm15, oword ptr [rsp + 4*8 + 144]
add rsp, 8 + 10*16 + 4*8
add rsp, 8 + 2*16 + 4*8
and r8, 0Fh
imul r8, 5
lea r9, CopyTable
add r9, r8
jmp r9
CopyTable:
@Exit:
sfence
ret
nop
nop
@FinalBlocks:
vmovaps xmm6 , xmmword ptr [rdx + 060h]
vmovntdq xmmword ptr [rcx + 060h], xmm6
vmovaps xmm5 , xmmword ptr [rdx + 050h]
vmovntdq xmmword ptr [rcx + 050h], xmm5
vmovaps xmm4 , xmmword ptr [rdx + 040h]
vmovntdq xmmword ptr [rcx + 040h], xmm4
vmovaps xmm3 , xmmword ptr [rdx + 030h]
vmovntdq xmmword ptr [rcx + 030h], xmm3
vmovaps xmm2 , xmmword ptr [rdx + 020h]
vmovntdq xmmword ptr [rcx + 020h], xmm2
vmovaps xmm1 , xmmword ptr [rdx + 010h]
vmovntdq xmmword ptr [rcx + 010h], xmm1
vmovaps xmm0 , xmmword ptr [rdx + 000h]
vmovntdq xmmword ptr [rcx + 000h], xmm0
nop
nop
jmp Copy1
jmp Copy2
jmp Copy3
jmp Copy4
jmp Copy5
jmp Copy6
jmp Copy7
jmp Copy8
jmp Copy9
jmp Copy10
jmp Copy11
jmp Copy12
jmp Copy13
jmp Copy14
imul r9, 16
add rdx, r9
add rcx, r9
@EndBlocks:
and r8, 0Fh
test r8, r8
je @RestoreExit
cmp r8, 2
je @Copy2
cmp r8, 3
je @Copy3
cmp r8, 4
je @Copy4
cmp r8, 5
je @Copy5
cmp r8, 6
je @Copy6
cmp r8, 7
je @Copy7
cmp r8, 8
je @Copy8
cmp r8, 9
je @Copy9
cmp r8, 10
je @Copy10
cmp r8, 11
je @Copy11
cmp r8, 12
je @Copy12
cmp r8, 13
je @Copy13
cmp r8, 14
je @Copy14
cmp r8, 15
je @Copy15
; fall through - 1 byte
mov al, byte ptr [rdx]
mov byte ptr [rcx], al
jmp @RestoreExit
@Copy2:
mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w
jmp @RestoreExit
@Copy3:
mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w
mov al, byte ptr [rdx + 02h]
mov byte ptr [rcx + 02h], al
jmp @RestoreExit
@Copy4:
mov r9d, dword ptr [rdx]
mov dword ptr [rcx], r9d
jmp @RestoreExit
@Copy5:
mov r9d, dword ptr [rdx ]
mov al , byte ptr [rdx + 04h]
mov dword ptr [rcx ], r9d
mov byte ptr [rcx + 04h], al
jmp @RestoreExit
@Copy6:
mov r9d , dword ptr [rdx ]
mov r10w, word ptr [rdx + 04h]
mov dword ptr [rcx ], r9d
mov word ptr [rcx + 04h], r10w
jmp @RestoreExit
@Copy7:
mov r9d , dword ptr [rdx ]
mov r10w, word ptr [rdx + 04h]
mov al , byte ptr [rdx + 06h]
mov dword ptr [rcx ], r9d
mov word ptr [rcx + 04h], r10w
mov byte ptr [rcx + 06h], al
jmp @RestoreExit
@Copy8:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
jmp @RestoreExit
@Copy9:
mov r8, qword ptr [rdx ]
mov al, byte ptr [rdx + 08h]
mov qword ptr [rcx ], r8
mov byte ptr [rcx + 08h], al
jmp @RestoreExit
@Copy10:
mov r8 , qword ptr [rdx ]
mov r10w, word ptr [rdx + 08h]
mov qword ptr [rcx ], r8
mov word ptr [rcx + 08h], r10w
jmp @RestoreExit
@Copy11:
mov r8 , qword ptr [rdx ]
mov r10w, word ptr [rdx + 08h]
mov al , byte ptr [rdx + 0Ah]
mov qword ptr [rcx ], r8
mov word ptr [rcx + 08h], r10w
mov byte ptr [rcx + 0Ah], al
jmp @RestoreExit
@Copy12:
mov r8 , qword ptr [rdx ]
mov r9d, dword ptr [rdx + 08h]
mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d
jmp @RestoreExit
@Copy13:
mov r8 , qword ptr [rdx ]
mov r9d, dword ptr [rdx + 08h]
mov al , byte ptr [rdx + 0Ch]
mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d
mov byte ptr [rcx + 0Ch], al
jmp @RestoreExit
@Copy14:
mov r8 , qword ptr [rdx ]
mov r9d , dword ptr [rdx + 08h]
mov r10w, word ptr [rdx + 0Ch]
mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w
jmp @RestoreExit
; copy 15
@Copy15:
mov r8 , qword ptr [rdx + 00h]
mov r9d , dword ptr [rdx + 08h]
mov r10w, word ptr [rdx + 0Ch]
@ -205,108 +231,7 @@ memcpySSE proc
mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w
mov byte ptr [rcx + 0Eh], al
ret
; ensure we generate near jumps
padding2 db 127 dup(090h)
Copy1:
mov al, byte ptr [rdx]
mov byte ptr [rcx], al
ret
Copy2:
mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w
ret
Copy3:
mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w
mov al, byte ptr [rdx + 02h]
mov byte ptr [rcx + 02h], al
ret
Copy4:
mov r9d , dword ptr [rdx]
mov dword ptr [rcx], r9d
ret
Copy5:
mov r9d , dword ptr [rdx]
mov dword ptr [rcx], r9d
mov al, byte ptr [rdx + 04h]
mov byte ptr [rcx + 04h], al
ret
Copy6:
mov r9d , dword ptr [rdx]
mov dword ptr [rcx], r9d
mov r10w, word ptr [rdx + 04h]
mov word ptr [rcx + 04h], r10w
ret
Copy7:
mov r9d , dword ptr [rdx]
mov dword ptr [rcx], r9d
mov r10w, word ptr [rdx + 04h]
mov word ptr [rcx + 04h], r10w
mov al, byte ptr [rdx + 06h]
mov byte ptr [rcx + 06h], al
ret
Copy8:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
ret
Copy9:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov al, byte ptr [rdx + 08h]
mov byte ptr [rcx + 08h], al
ret
Copy10:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov r10w, word ptr [rdx + 08h]
mov word ptr [rcx + 08h], r10w
ret
Copy11:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov r10w, word ptr [rdx + 08h]
mov word ptr [rcx + 08h], r10w
mov al, byte ptr [rdx + 0Ah]
mov byte ptr [rcx + 0Ah], al
ret
Copy12:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov r9d , dword ptr [rdx + 08h]
mov dword ptr [rcx + 08h], r9d
ret
Copy13:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov r9d , dword ptr [rdx + 08h]
mov dword ptr [rcx + 08h], r9d
mov al, byte ptr [rdx + 0Ch]
mov byte ptr [rcx + 0Ch], al
ret
Copy14:
mov r8 , qword ptr [rdx ]
mov r9d , dword ptr [rdx + 08h]
mov r10w, word ptr [rdx + 0Ch]
mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w
ret
jmp @RestoreExit
memcpySSE endp
end

View File

@ -28,7 +28,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#if defined(NATIVE_MEMCPY)
#define memcpySSE memcpy
#elif defined(MSVC)
#elif defined(_MSC_VER)
extern "C" void memcpySSE(void *dst, const void * src, size_t length);
#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
inline static void memcpySSE(void *dst, const void * src, size_t length)