MASM version of an SSE2 memcpy

This commit is contained in:
Geoffrey McRae 2018-05-18 18:50:07 +10:00
parent ffec6c2014
commit 3c77c1eb2b
2 changed files with 334 additions and 18 deletions

312
common/memcpySSE.asm Normal file
View File

@ -0,0 +1,312 @@
.code
;-----------------------------------------------------------------------
; void memcpySSE(void *dst, const void *src, size_t length)
;
; Copies `length` bytes from src to dst in three phases:
;   1. whole 256-byte blocks, 16 XMM registers at a time (FullLoop)
;   2. the remaining whole 16-byte chunks (0..15 of them) via JumpTable
;   3. the final 0..15 bytes with scalar moves via CopyTable
;
; In:    rcx = dst, rdx = src, r8 = len
; Out:   nothing is returned in a register by design (rax is scratch)
; Clobb: rax (al), rcx, rdx, r8, r9, r10, r11, xmm0-xmm5, flags
;        xmm6-xmm15 are used but saved/restored on the stack
; Stack: 8 + 10*16 + 4*8 = 200 bytes
;
; Phases 1 and 2 use vmovaps / vmovntdq, i.e. VEX-encoded (AVX)
; aligned loads and aligned non-temporal stores, so both src and dst
; must be 16-byte aligned whenever len >= 16.
; NOTE(review): nothing in this routine checks that alignment - confirm
; that every caller guarantees it.
; NOTE(review): no store fence is issued after the non-temporal stores
; in this routine - confirm callers order them if another agent reads
; the destination.
; NOTE(review): rsp is adjusted but the proc is not declared with FRAME
; (no unwind data is emitted) - confirm this is acceptable.
;-----------------------------------------------------------------------
memcpySSE proc
; dst = rcx
; src = rdx
; len = r8

; nothing to do for a zero length copy
test r8 , r8
jne OK
ret
OK:
; void * end = dst + (length & ~0xFF);
; end = r10, the dst address at which the 256-byte block loop stops
mov r9 , r8
and r9 , -0100h
mov r10, rcx
add r10, r9
; size_t rem = (length & 0xFF) >> 4;
; rem = r11, the number of whole 16-byte chunks left after the full blocks
mov r11, r8
and r11, 0FFh
shr r11, 4
; Reserve 200 bytes and save xmm6-xmm15 (callee-saved in the Microsoft
; x64 convention) at [rsp + 32 .. rsp + 191]. With an ABI-conformant
; entry (rsp % 16 == 8) the 200 byte adjustment leaves rsp 16-byte
; aligned, which the movdqa stores below require. The lowest 4*8 bytes
; are left unused (presumably mirroring shadow-space size - TODO confirm).
sub rsp, 8 + 10*16 + 4*8
movdqa oword ptr [rsp + 4*8 + 00 ], xmm6
movdqa oword ptr [rsp + 4*8 + 16 ], xmm7
movdqa oword ptr [rsp + 4*8 + 32 ], xmm8
movdqa oword ptr [rsp + 4*8 + 48 ], xmm9
movdqa oword ptr [rsp + 4*8 + 64 ], xmm10
movdqa oword ptr [rsp + 4*8 + 80 ], xmm11
movdqa oword ptr [rsp + 4*8 + 96 ], xmm12
movdqa oword ptr [rsp + 4*8 + 112], xmm13
movdqa oword ptr [rsp + 4*8 + 128], xmm14
movdqa oword ptr [rsp + 4*8 + 144], xmm15
; skip the block loop entirely when len < 256 (dst == end)
cmp rcx, r10
je RemainingBlocks
; Phase 1: each iteration loads 256 bytes into xmm0-xmm15 and then
; writes them out with non-temporal stores; runs until dst reaches end.
FullLoop:
vmovaps xmm0 , xmmword ptr [rdx + 000h]
vmovaps xmm1 , xmmword ptr [rdx + 010h]
vmovaps xmm2 , xmmword ptr [rdx + 020h]
vmovaps xmm3 , xmmword ptr [rdx + 030h]
vmovaps xmm4 , xmmword ptr [rdx + 040h]
vmovaps xmm5 , xmmword ptr [rdx + 050h]
vmovaps xmm6 , xmmword ptr [rdx + 060h]
vmovaps xmm7 , xmmword ptr [rdx + 070h]
vmovaps xmm8 , xmmword ptr [rdx + 080h]
vmovaps xmm9 , xmmword ptr [rdx + 090h]
vmovaps xmm10, xmmword ptr [rdx + 0A0h]
vmovaps xmm11, xmmword ptr [rdx + 0B0h]
vmovaps xmm12, xmmword ptr [rdx + 0C0h]
vmovaps xmm13, xmmword ptr [rdx + 0D0h]
vmovaps xmm14, xmmword ptr [rdx + 0E0h]
vmovaps xmm15, xmmword ptr [rdx + 0F0h]
vmovntdq xmmword ptr [rcx + 000h], xmm0
vmovntdq xmmword ptr [rcx + 010h], xmm1
vmovntdq xmmword ptr [rcx + 020h], xmm2
vmovntdq xmmword ptr [rcx + 030h], xmm3
vmovntdq xmmword ptr [rcx + 040h], xmm4
vmovntdq xmmword ptr [rcx + 050h], xmm5
vmovntdq xmmword ptr [rcx + 060h], xmm6
vmovntdq xmmword ptr [rcx + 070h], xmm7
vmovntdq xmmword ptr [rcx + 080h], xmm8
vmovntdq xmmword ptr [rcx + 090h], xmm9
vmovntdq xmmword ptr [rcx + 0A0h], xmm10
vmovntdq xmmword ptr [rcx + 0B0h], xmm11
vmovntdq xmmword ptr [rcx + 0C0h], xmm12
vmovntdq xmmword ptr [rcx + 0D0h], xmm13
vmovntdq xmmword ptr [rcx + 0E0h], xmm14
vmovntdq xmmword ptr [rcx + 0F0h], xmm15
add rdx, 0100h
add rcx, 0100h
cmp rcx, r10
jne FullLoop
; Phase 2: computed jump to JumpTable + (15 - rem) * 5. The scale of 5
; relies on every table entry assembling to a 5-byte near jmp, so that
; entry k lands on Block(15 - k).
RemainingBlocks:
lea r9 , JumpTable
mov r10, 15
sub r10, r11
imul r10, 5
add r9 , r10
jmp r9
JumpTable:
jmp Block15
jmp Block14
jmp Block13
jmp Block12
jmp Block11
jmp Block10
jmp Block9
jmp Block8
jmp Block7
jmp Block6
jmp Block5
jmp Block4
jmp Block3
jmp Block2
jmp Block1
jmp Block0
; ensure we generate near jumps: the 127 filler bytes push every BlockN
; target out of short-jump range. They are never executed because each
; table entry above is an unconditional jmp.
padding1 db 127 dup(090h)
; BlockN copies the 16-byte chunk at offset (N-1)*16 and then falls
; through to Block(N-1), so entering at BlockN copies exactly N chunks.
; xmm6-xmm14 may be overwritten here; their saved copies are still on
; the stack and are restored at Block0.
Block15:
vmovaps xmm14, xmmword ptr [rdx + 0E0h]
vmovntdq xmmword ptr [rcx + 0E0h], xmm14
Block14:
vmovaps xmm13, xmmword ptr [rdx + 0D0h]
vmovntdq xmmword ptr [rcx + 0D0h], xmm13
Block13:
vmovaps xmm12, xmmword ptr [rdx + 0C0h]
vmovntdq xmmword ptr [rcx + 0C0h], xmm12
Block12:
vmovaps xmm11, xmmword ptr [rdx + 0B0h]
vmovntdq xmmword ptr [rcx + 0B0h], xmm11
Block11:
vmovaps xmm10, xmmword ptr [rdx + 0A0h]
vmovntdq xmmword ptr [rcx + 0A0h], xmm10
Block10:
vmovaps xmm9 , xmmword ptr [rdx + 090h]
vmovntdq xmmword ptr [rcx + 090h], xmm9
Block9:
vmovaps xmm8 , xmmword ptr [rdx + 080h]
vmovntdq xmmword ptr [rcx + 080h], xmm8
Block8:
vmovaps xmm7 , xmmword ptr [rdx + 070h]
vmovntdq xmmword ptr [rcx + 070h], xmm7
Block7:
vmovaps xmm6 , xmmword ptr [rdx + 060h]
vmovntdq xmmword ptr [rcx + 060h], xmm6
Block6:
vmovaps xmm5 , xmmword ptr [rdx + 050h]
vmovntdq xmmword ptr [rcx + 050h], xmm5
Block5:
vmovaps xmm4 , xmmword ptr [rdx + 040h]
vmovntdq xmmword ptr [rcx + 040h], xmm4
Block4:
vmovaps xmm3 , xmmword ptr [rdx + 030h]
vmovntdq xmmword ptr [rcx + 030h], xmm3
Block3:
vmovaps xmm2 , xmmword ptr [rdx + 020h]
vmovntdq xmmword ptr [rcx + 020h], xmm2
Block2:
vmovaps xmm1 , xmmword ptr [rdx + 010h]
vmovntdq xmmword ptr [rcx + 010h], xmm1
Block1:
vmovaps xmm0 , xmmword ptr [rdx + 000h]
vmovntdq xmmword ptr [rcx + 000h], xmm0
; advance src and dst past the rem * 16 bytes just copied; this is
; skipped when rem == 0 because that case jumps straight to Block0
imul r11, 16
add rdx, r11
add rcx, r11
; Common exit of the vector phases: restore xmm6-xmm15 and release the
; stack frame before the scalar tail copy.
Block0:
movdqa xmm6 , oword ptr [rsp + 4*8 + 00 ]
movdqa xmm7 , oword ptr [rsp + 4*8 + 16 ]
movdqa xmm8 , oword ptr [rsp + 4*8 + 32 ]
movdqa xmm9 , oword ptr [rsp + 4*8 + 48 ]
movdqa xmm10, oword ptr [rsp + 4*8 + 64 ]
movdqa xmm11, oword ptr [rsp + 4*8 + 80 ]
movdqa xmm12, oword ptr [rsp + 4*8 + 96 ]
movdqa xmm13, oword ptr [rsp + 4*8 + 112]
movdqa xmm14, oword ptr [rsp + 4*8 + 128]
movdqa xmm15, oword ptr [rsp + 4*8 + 144]
add rsp, 8 + 10*16 + 4*8
; Phase 3: computed jump to CopyTable + (len & 0Fh) * 5. As with
; JumpTable, every entry must occupy exactly 5 bytes.
and r8, 0Fh
imul r8, 5
lea r9, CopyTable
add r9, r8
jmp r9
CopyTable:
; entry 0: no tail bytes; ret plus four nops pads the entry to 5 bytes
ret
nop
nop
nop
nop
jmp Copy1
jmp Copy2
jmp Copy3
jmp Copy4
jmp Copy5
jmp Copy6
jmp Copy7
jmp Copy8
jmp Copy9
jmp Copy10
jmp Copy11
jmp Copy12
jmp Copy13
jmp Copy14
; copy 15: entry 15 sits at table offset 75, so its code is placed
; inline here instead of behind a jmp (8 + 4 + 2 + 1 bytes)
mov r8 , qword ptr [rdx + 00h]
mov r9d , dword ptr [rdx + 08h]
mov r10w, word ptr [rdx + 0Ch]
mov al , byte ptr [rdx + 0Eh]
mov qword ptr [rcx + 00h], r8
mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w
mov byte ptr [rcx + 0Eh], al
ret
; ensure we generate near jumps: filler that keeps every CopyN target
; out of short-jump range; never executed because it follows a ret
padding2 db 127 dup(090h)
; CopyN moves the last N bytes using the widest scalar pieces that sum
; to N (8, 4, 2 and 1 byte moves) and then returns to the caller.
Copy1:
mov al, byte ptr [rdx]
mov byte ptr [rcx], al
ret
Copy2:
mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w
ret
; 2 + 1
Copy3:
mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w
mov al, byte ptr [rdx + 02h]
mov byte ptr [rcx + 02h], al
ret
Copy4:
mov r9d , dword ptr [rdx]
mov dword ptr [rcx], r9d
ret
; 4 + 1
Copy5:
mov r9d , dword ptr [rdx]
mov dword ptr [rcx], r9d
mov al, byte ptr [rdx + 04h]
mov byte ptr [rcx + 04h], al
ret
; 4 + 2
Copy6:
mov r9d , dword ptr [rdx]
mov dword ptr [rcx], r9d
mov r10w, word ptr [rdx + 04h]
mov word ptr [rcx + 04h], r10w
ret
; 4 + 2 + 1
Copy7:
mov r9d , dword ptr [rdx]
mov dword ptr [rcx], r9d
mov r10w, word ptr [rdx + 04h]
mov word ptr [rcx + 04h], r10w
mov al, byte ptr [rdx + 06h]
mov byte ptr [rcx + 06h], al
ret
Copy8:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
ret
; 8 + 1
Copy9:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov al, byte ptr [rdx + 08h]
mov byte ptr [rcx + 08h], al
ret
; 8 + 2
Copy10:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov r10w, word ptr [rdx + 08h]
mov word ptr [rcx + 08h], r10w
ret
; 8 + 2 + 1
Copy11:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov r10w, word ptr [rdx + 08h]
mov word ptr [rcx + 08h], r10w
mov al, byte ptr [rdx + 0Ah]
mov byte ptr [rcx + 0Ah], al
ret
; 8 + 4
Copy12:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov r9d , dword ptr [rdx + 08h]
mov dword ptr [rcx + 08h], r9d
ret
; 8 + 4 + 1
Copy13:
mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8
mov r9d , dword ptr [rdx + 08h]
mov dword ptr [rcx + 08h], r9d
mov al, byte ptr [rdx + 0Ch]
mov byte ptr [rcx + 0Ch], al
ret
; 8 + 4 + 2
Copy14:
mov r8 , qword ptr [rdx ]
mov r9d , dword ptr [rdx + 08h]
mov r10w, word ptr [rdx + 0Ch]
mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w
ret
memcpySSE endp
end

View File

@ -26,6 +26,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#include "debug.h"
#if defined(__GNUC___) || defined(__GNUG__)
#define OP(...) #__VA_ARGS__ "\n\t"
inline static void memcpySSE(void *dst, const void * src, size_t length)
@ -79,7 +80,7 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
OP(add %[rem],%[end])
OP(jmp *%[end])
// jump table
// jump table
OP(vmovaps 0x60(%[src]),%%xmm0)
OP(vmovntdq %%xmm0,0x60(%[dst]))
OP(vmovaps 0x50(%[src]),%%xmm1)
@ -95,28 +96,28 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
OP(vmovaps 0x00(%[src]),%%xmm6)
OP(vmovntdq %%xmm6,0x00(%[dst]))
// alignment as the previous two instructions are only 4 bytes
// alignment as the previous two instructions are only 4 bytes
OP(nop)
OP(nop)
// restore the registers
// restore the registers
OP(pop %[end])
OP(pop %[src])
OP(pop %[dst])
:
: [dst]"r" (dst),
[src]"r" (src),
[end]"c" (end),
[rem]"d" (rem)
: "xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"memory"
:
: [dst]"r" (dst),
[src]"r" (src),
[end]"c" (end),
[rem]"d" (rem)
: "xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"memory"
);
//copy any remaining bytes
@ -126,4 +127,7 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
#else
memcpy(dst, src, length);
#endif
}
}
#else
extern "C" void __fastcall memcpySSE(void *dst, const void * src, size_t length);
#endif