From e8b1b8fbdfea1945b3989593633cc2c7161a5747 Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Sat, 19 May 2018 21:40:13 +1000 Subject: [PATCH] [common] tune windows memcpySSE asm implementation --- common/memcpySSE.asm | 441 ++++++++++++++++++------------------------- common/memcpySSE2.h | 2 +- 2 files changed, 184 insertions(+), 259 deletions(-) diff --git a/common/memcpySSE.asm b/common/memcpySSE.asm index 498ca483..7a0c78e0 100644 --- a/common/memcpySSE.asm +++ b/common/memcpySSE.asm @@ -1,44 +1,27 @@ .code - memcpySSE proc ; dst = rcx ; src = rdx ; len = r8 - test r8 , r8 - jne OK - ret + test r8, r8 + jz @Exit + cmp rcx, rdx + je @Exit - OK: - ; void * end = dst + (length & ~0x100); + sub rsp, 8 + 2*16 + 4*8 + movdqa oword ptr [rsp + 4*8 + 00 ], xmm6 + movdqa oword ptr [rsp + 4*8 + 16 ], xmm7 + + ; void * end = dst + (length & ~0x7F); ; end = r10 mov r9 , r8 - and r9 , -0100h + and r9 , -080h + jz @RemainingBlocks mov r10, rcx add r10, r9 - ; size_t rem = (length & 0xFF) >> 4); - ; rem = r11 - mov r11, r8 - and r11, 0FFh - shr r11, 4 - - sub rsp, 8 + 10*16 + 4*8 - movdqa oword ptr [rsp + 4*8 + 00 ], xmm6 - movdqa oword ptr [rsp + 4*8 + 16 ], xmm7 - movdqa oword ptr [rsp + 4*8 + 32 ], xmm8 - movdqa oword ptr [rsp + 4*8 + 48 ], xmm9 - movdqa oword ptr [rsp + 4*8 + 64 ], xmm10 - movdqa oword ptr [rsp + 4*8 + 80 ], xmm11 - movdqa oword ptr [rsp + 4*8 + 96 ], xmm12 - movdqa oword ptr [rsp + 4*8 + 112], xmm13 - movdqa oword ptr [rsp + 4*8 + 128], xmm14 - movdqa oword ptr [rsp + 4*8 + 144], xmm15 - - cmp rcx, r10 - je RemainingBlocks - - FullLoop: + @FullLoop: vmovaps xmm0 , xmmword ptr [rdx + 000h] vmovaps xmm1 , xmmword ptr [rdx + 010h] vmovaps xmm2 , xmmword ptr [rdx + 020h] @@ -47,14 +30,6 @@ memcpySSE proc vmovaps xmm5 , xmmword ptr [rdx + 050h] vmovaps xmm6 , xmmword ptr [rdx + 060h] vmovaps xmm7 , xmmword ptr [rdx + 070h] - vmovaps xmm8 , xmmword ptr [rdx + 080h] - vmovaps xmm9 , xmmword ptr [rdx + 090h] - vmovaps xmm10, xmmword ptr [rdx + 0A0h] - 
vmovaps xmm11, xmmword ptr [rdx + 0B0h] - vmovaps xmm12, xmmword ptr [rdx + 0C0h] - vmovaps xmm13, xmmword ptr [rdx + 0D0h] - vmovaps xmm14, xmmword ptr [rdx + 0E0h] - vmovaps xmm15, xmmword ptr [rdx + 0F0h] vmovntdq xmmword ptr [rcx + 000h], xmm0 vmovntdq xmmword ptr [rcx + 010h], xmm1 vmovntdq xmmword ptr [rcx + 020h], xmm2 @@ -63,140 +38,191 @@ memcpySSE proc vmovntdq xmmword ptr [rcx + 050h], xmm5 vmovntdq xmmword ptr [rcx + 060h], xmm6 vmovntdq xmmword ptr [rcx + 070h], xmm7 - vmovntdq xmmword ptr [rcx + 080h], xmm8 - vmovntdq xmmword ptr [rcx + 090h], xmm9 - vmovntdq xmmword ptr [rcx + 0A0h], xmm10 - vmovntdq xmmword ptr [rcx + 0B0h], xmm11 - vmovntdq xmmword ptr [rcx + 0C0h], xmm12 - vmovntdq xmmword ptr [rcx + 0D0h], xmm13 - vmovntdq xmmword ptr [rcx + 0E0h], xmm14 - vmovntdq xmmword ptr [rcx + 0F0h], xmm15 - add rdx, 0100h - add rcx, 0100h + add rdx, 080h + add rcx, 080h cmp rcx, r10 - jne FullLoop + jne @FullLoop - RemainingBlocks: - lea r9 , JumpTable - mov r10, 15 + @RemainingBlocks: + ; size_t rem = ((length & 0x7F) >> 4); + ; rem = r11 + mov r11, r8 + and r11, 07Fh + jz @RestoreExit + shr r11, 4 + + mov r10, 7 sub r10, r11 - imul r10, 5 + imul r10, 10 + lea r9 , @FinalBlocks add r9 , r10 jmp r9 - JumpTable: - jmp Block15 - jmp Block14 - jmp Block13 - jmp Block12 - jmp Block11 - jmp Block10 - jmp Block9 - jmp Block8 - jmp Block7 - jmp Block6 - jmp Block5 - jmp Block4 - jmp Block3 - jmp Block2 - jmp Block1 - jmp Block0 - - ; ensure we generate near jumps - padding1 db 127 dup(090h) - - Block15: - vmovaps xmm14, xmmword ptr [rdx + 0E0h] - vmovntdq xmmword ptr [rcx + 0E0h], xmm14 - Block14: - vmovaps xmm13, xmmword ptr [rdx + 0D0h] - vmovntdq xmmword ptr [rcx + 0D0h], xmm13 - Block13: - vmovaps xmm12, xmmword ptr [rdx + 0C0h] - vmovntdq xmmword ptr [rcx + 0C0h], xmm12 - Block12: - vmovaps xmm11, xmmword ptr [rdx + 0B0h] - vmovntdq xmmword ptr [rcx + 0B0h], xmm11 - Block11: - vmovaps xmm10, xmmword ptr [rdx + 0A0h] - vmovntdq xmmword ptr [rcx + 0A0h], xmm10 
- Block10: - vmovaps xmm9 , xmmword ptr [rdx + 090h] - vmovntdq xmmword ptr [rcx + 090h], xmm9 - Block9: - vmovaps xmm8 , xmmword ptr [rdx + 080h] - vmovntdq xmmword ptr [rcx + 080h], xmm8 - Block8: - vmovaps xmm7 , xmmword ptr [rdx + 070h] - vmovntdq xmmword ptr [rcx + 070h], xmm7 - Block7: - vmovaps xmm6 , xmmword ptr [rdx + 060h] - vmovntdq xmmword ptr [rcx + 060h], xmm6 - Block6: - vmovaps xmm5 , xmmword ptr [rdx + 050h] - vmovntdq xmmword ptr [rcx + 050h], xmm5 - Block5: - vmovaps xmm4 , xmmword ptr [rdx + 040h] - vmovntdq xmmword ptr [rcx + 040h], xmm4 - Block4: - vmovaps xmm3 , xmmword ptr [rdx + 030h] - vmovntdq xmmword ptr [rcx + 030h], xmm3 - Block3: - vmovaps xmm2 , xmmword ptr [rdx + 020h] - vmovntdq xmmword ptr [rcx + 020h], xmm2 - Block2: - vmovaps xmm1 , xmmword ptr [rdx + 010h] - vmovntdq xmmword ptr [rcx + 010h], xmm1 - Block1: - vmovaps xmm0 , xmmword ptr [rdx + 000h] - vmovntdq xmmword ptr [rcx + 000h], xmm0 - - imul r11, 16 - add rdx, r11 - add rcx, r11 - - Block0: + @RestoreExit: movdqa xmm6 , oword ptr [rsp + 4*8 + 00 ] movdqa xmm7 , oword ptr [rsp + 4*8 + 16 ] - movdqa xmm8 , oword ptr [rsp + 4*8 + 32 ] - movdqa xmm9 , oword ptr [rsp + 4*8 + 48 ] - movdqa xmm10, oword ptr [rsp + 4*8 + 64 ] - movdqa xmm11, oword ptr [rsp + 4*8 + 80 ] - movdqa xmm12, oword ptr [rsp + 4*8 + 96 ] - movdqa xmm13, oword ptr [rsp + 4*8 + 112] - movdqa xmm14, oword ptr [rsp + 4*8 + 128] - movdqa xmm15, oword ptr [rsp + 4*8 + 144] - add rsp, 8 + 10*16 + 4*8 + add rsp, 8 + 2*16 + 4*8 - and r8, 0Fh - imul r8, 5 - lea r9, CopyTable - add r9, r8 - jmp r9 - - CopyTable: + @Exit: + sfence ret - nop - nop + + @FinalBlocks: + vmovaps xmm6 , xmmword ptr [rdx + 060h] + vmovntdq xmmword ptr [rcx + 060h], xmm6 + vmovaps xmm5 , xmmword ptr [rdx + 050h] + vmovntdq xmmword ptr [rcx + 050h], xmm5 + vmovaps xmm4 , xmmword ptr [rdx + 040h] + vmovntdq xmmword ptr [rcx + 040h], xmm4 + vmovaps xmm3 , xmmword ptr [rdx + 030h] + vmovntdq xmmword ptr [rcx + 030h], xmm3 + vmovaps xmm2 , 
xmmword ptr [rdx + 020h] + vmovntdq xmmword ptr [rcx + 020h], xmm2 + vmovaps xmm1 , xmmword ptr [rdx + 010h] + vmovntdq xmmword ptr [rcx + 010h], xmm1 + vmovaps xmm0 , xmmword ptr [rdx + 000h] + vmovntdq xmmword ptr [rcx + 000h], xmm0 nop nop - jmp Copy1 - jmp Copy2 - jmp Copy3 - jmp Copy4 - jmp Copy5 - jmp Copy6 - jmp Copy7 - jmp Copy8 - jmp Copy9 - jmp Copy10 - jmp Copy11 - jmp Copy12 - jmp Copy13 - jmp Copy14 + imul r11, 16 + add rdx, r11 + add rcx, r11 + + @EndBlocks: + and r8, 0Fh + test r8, r8 + je @RestoreExit + + cmp r8, 2 + je @Copy2 + cmp r8, 3 + je @Copy3 + cmp r8, 4 + je @Copy4 + cmp r8, 5 + je @Copy5 + cmp r8, 6 + je @Copy6 + cmp r8, 7 + je @Copy7 + cmp r8, 8 + je @Copy8 + cmp r8, 9 + je @Copy9 + cmp r8, 10 + je @Copy10 + cmp r8, 11 + je @Copy11 + cmp r8, 12 + je @Copy12 + cmp r8, 13 + je @Copy13 + cmp r8, 14 + je @Copy14 + cmp r8, 15 + je @Copy15 + + ; fall through - 1 byte + mov al, byte ptr [rdx] + mov byte ptr [rcx], al + jmp @RestoreExit + + @Copy2: + mov r10w, word ptr [rdx] + mov word ptr [rcx], r10w + jmp @RestoreExit + + @Copy3: + mov r10w, word ptr [rdx] + mov word ptr [rcx], r10w + mov al, byte ptr [rdx + 02h] + mov byte ptr [rcx + 02h], al + jmp @RestoreExit + + @Copy4: + mov r9d, dword ptr [rdx] + mov dword ptr [rcx], r9d + jmp @RestoreExit + + @Copy5: + mov r9d, dword ptr [rdx ] + mov al , byte ptr [rdx + 04h] + mov dword ptr [rcx ], r9d + mov byte ptr [rcx + 04h], al + jmp @RestoreExit + + @Copy6: + mov r9d , dword ptr [rdx ] + mov r10w, word ptr [rdx + 04h] + mov dword ptr [rcx ], r9d + mov word ptr [rcx + 04h], r10w + jmp @RestoreExit + + @Copy7: + mov r9d , dword ptr [rdx ] + mov r10w, word ptr [rdx + 04h] + mov al , byte ptr [rdx + 06h] + mov dword ptr [rcx ], r9d + mov word ptr [rcx + 04h], r10w + mov byte ptr [rcx + 06h], al + jmp @RestoreExit + + @Copy8: + mov r8, qword ptr [rdx] + mov qword ptr [rcx], r8 + jmp @RestoreExit + + @Copy9: + mov r8, qword ptr [rdx ] + mov al, byte ptr [rdx + 08h] + mov qword ptr [rcx ], r8 + mov byte ptr 
[rcx + 08h], al + jmp @RestoreExit + + @Copy10: + mov r8 , qword ptr [rdx ] + mov r10w, word ptr [rdx + 08h] + mov qword ptr [rcx ], r8 + mov word ptr [rcx + 08h], r10w + jmp @RestoreExit + + @Copy11: + mov r8 , qword ptr [rdx ] + mov r10w, word ptr [rdx + 08h] + mov al , byte ptr [rdx + 0Ah] + mov qword ptr [rcx ], r8 + mov word ptr [rcx + 08h], r10w + mov byte ptr [rcx + 0Ah], al + jmp @RestoreExit + + @Copy12: + mov r8 , qword ptr [rdx ] + mov r9d, dword ptr [rdx + 08h] + mov qword ptr [rcx ], r8 + mov dword ptr [rcx + 08h], r9d + jmp @RestoreExit + + @Copy13: + mov r8 , qword ptr [rdx ] + mov r9d, dword ptr [rdx + 08h] + mov al , byte ptr [rdx + 0Ch] + mov qword ptr [rcx ], r8 + mov dword ptr [rcx + 08h], r9d + mov byte ptr [rcx + 0Ch], al + jmp @RestoreExit + + @Copy14: + mov r8 , qword ptr [rdx ] + mov r9d , dword ptr [rdx + 08h] + mov r10w, word ptr [rdx + 0Ch] + mov qword ptr [rcx ], r8 + mov dword ptr [rcx + 08h], r9d + mov word ptr [rcx + 0Ch], r10w + jmp @RestoreExit ; copy 15 + @Copy15: mov r8 , qword ptr [rdx + 00h] mov r9d , dword ptr [rdx + 08h] mov r10w, word ptr [rdx + 0Ch] @@ -205,108 +231,7 @@ memcpySSE proc mov dword ptr [rcx + 08h], r9d mov word ptr [rcx + 0Ch], r10w mov byte ptr [rcx + 0Eh], al - ret - - ; ensure we generate near jumps - padding2 db 127 dup(090h) - - Copy1: - mov al, byte ptr [rdx] - mov byte ptr [rcx], al - ret - - Copy2: - mov r10w, word ptr [rdx] - mov word ptr [rcx], r10w - ret - - Copy3: - mov r10w, word ptr [rdx] - mov word ptr [rcx], r10w - mov al, byte ptr [rdx + 02h] - mov byte ptr [rcx + 02h], al - ret - - Copy4: - mov r9d , dword ptr [rdx] - mov dword ptr [rcx], r9d - ret - - Copy5: - mov r9d , dword ptr [rdx] - mov dword ptr [rcx], r9d - mov al, byte ptr [rdx + 04h] - mov byte ptr [rcx + 04h], al - ret - - Copy6: - mov r9d , dword ptr [rdx] - mov dword ptr [rcx], r9d - mov r10w, word ptr [rdx + 04h] - mov word ptr [rcx + 04h], r10w - ret - - Copy7: - mov r9d , dword ptr [rdx] - mov dword ptr [rcx], r9d - mov r10w, 
word ptr [rdx + 04h] - mov word ptr [rcx + 04h], r10w - mov al, byte ptr [rdx + 06h] - mov byte ptr [rcx + 06h], al - ret - - Copy8: - mov r8, qword ptr [rdx] - mov qword ptr [rcx], r8 - ret - - Copy9: - mov r8, qword ptr [rdx] - mov qword ptr [rcx], r8 - mov al, byte ptr [rdx + 08h] - mov byte ptr [rcx + 08h], al - ret - - Copy10: - mov r8, qword ptr [rdx] - mov qword ptr [rcx], r8 - mov r10w, word ptr [rdx + 08h] - mov word ptr [rcx + 08h], r10w - ret - - Copy11: - mov r8, qword ptr [rdx] - mov qword ptr [rcx], r8 - mov r10w, word ptr [rdx + 08h] - mov word ptr [rcx + 08h], r10w - mov al, byte ptr [rdx + 0Ah] - mov byte ptr [rcx + 0Ah], al - ret - - Copy12: - mov r8, qword ptr [rdx] - mov qword ptr [rcx], r8 - mov r9d , dword ptr [rdx + 08h] - mov dword ptr [rcx + 08h], r9d - ret - - Copy13: - mov r8, qword ptr [rdx] - mov qword ptr [rcx], r8 - mov r9d , dword ptr [rdx + 08h] - mov dword ptr [rcx + 08h], r9d - mov al, byte ptr [rdx + 0Ch] - mov byte ptr [rcx + 0Ch], al - ret - - Copy14: - mov r8 , qword ptr [rdx ] - mov r9d , dword ptr [rdx + 08h] - mov r10w, word ptr [rdx + 0Ch] - mov qword ptr [rcx ], r8 - mov dword ptr [rcx + 08h], r9d - mov word ptr [rcx + 0Ch], r10w - ret + jmp @RestoreExit memcpySSE endp end \ No newline at end of file diff --git a/common/memcpySSE2.h b/common/memcpySSE2.h index b0b83ce9..e4bcc89f 100644 --- a/common/memcpySSE2.h +++ b/common/memcpySSE2.h @@ -28,7 +28,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #if defined(NATIVE_MEMCPY) #define memcpySSE memcpy -#elif defined(MSVC) +#elif defined(_MSC_VER) extern "C" void memcpySSE(void *dst, const void * src, size_t length); #elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__) inline static void memcpySSE(void *dst, const void * src, size_t length)