mirror of
https://github.com/gnif/LookingGlass.git
synced 2024-11-10 00:28:20 +00:00
NASM version of a SSE2 memcpy
This commit is contained in:
parent
ffec6c2014
commit
3c77c1eb2b
312
common/memcpySSE.asm
Normal file
312
common/memcpySSE.asm
Normal file
@ -0,0 +1,312 @@
|
||||
.code

; ----------------------------------------------------------------------------
; void memcpySSE(void *dst, const void *src, size_t length)
;
; Syntax : MASM style (proc/endp, "ptr" operand sizes, dup)
; ABI    : Microsoft x64
; In     : rcx = dst, rdx = src, r8 = length in bytes
; Out    : nothing
; Clobber: rax, rcx, rdx, r8-r11, xmm0-xmm5, flags
;          (xmm6-xmm15 are used as well, but are saved and restored here)
;
; The copy is performed in three stages:
;   1. whole 256 byte chunks, 16 XMM registers at a time, written with
;      non-temporal stores (FullLoop)
;   2. 0-15 remaining 16 byte blocks, by jumping into an unrolled ladder
;      (JumpTable -> Block15..Block0)
;   3. 0-15 trailing bytes using scalar moves (CopyTable -> Copy1..Copy14 and
;      the inline 15 byte case)
;
; Requirements:
;   - vmovaps and vmovntdq fault on memory operands that are not 16 byte
;     aligned, so dst and src must both be 16 byte aligned when length >= 16.
;   - the vector moves are VEX encoded, so the CPU must support AVX.
;   - both jump tables are indexed in steps of JMP_SIZE bytes; every entry
;     must assemble to exactly that size (see padding1/padding2).
;
; NOTE(review): this procedure moves rsp and saves non-volatile XMM registers
; but declares no unwind data (PROC FRAME/.allocstack/.savexmm128); a fault
; raised inside the copy may not unwind correctly - confirm whether that
; matters for the callers.
; ----------------------------------------------------------------------------

JMP_SIZE   equ 5                     ; size of a near "jmp rel32"
XMM_SAVE   equ 4*8                   ; offset of the xmm6-xmm15 save area
FRAME_SIZE equ 8 + 10*16 + XMM_SAVE  ; 8 bytes re-align rsp to 16 (rsp is
                                     ; 16n+8 on entry), 10 XMM slots, plus the
                                     ; 32 bytes below the save area

memcpySSE proc
  ; dst = rcx
  ; src = rdx
  ; len = r8

  ; nothing to copy, return before touching the stack
  test r8 , r8
  jne  OK
  ret

OK:
  ; void * end = dst + (length & ~0xFF);
  ; end = r10, the address where the 256 byte chunk loop stops
  mov  r9 , r8
  and  r9 , -0100h
  mov  r10, rcx
  add  r10, r9

  ; size_t rem = (length & 0xFF) >> 4;
  ; rem = r11, the number of whole 16 byte blocks after the last chunk
  mov  r11, r8
  and  r11, 0FFh
  shr  r11, 4

  ; xmm6-xmm15 are callee-saved in the Microsoft x64 ABI
  sub    rsp, FRAME_SIZE
  movdqa oword ptr [rsp + XMM_SAVE + 00 ], xmm6
  movdqa oword ptr [rsp + XMM_SAVE + 16 ], xmm7
  movdqa oword ptr [rsp + XMM_SAVE + 32 ], xmm8
  movdqa oword ptr [rsp + XMM_SAVE + 48 ], xmm9
  movdqa oword ptr [rsp + XMM_SAVE + 64 ], xmm10
  movdqa oword ptr [rsp + XMM_SAVE + 80 ], xmm11
  movdqa oword ptr [rsp + XMM_SAVE + 96 ], xmm12
  movdqa oword ptr [rsp + XMM_SAVE + 112], xmm13
  movdqa oword ptr [rsp + XMM_SAVE + 128], xmm14
  movdqa oword ptr [rsp + XMM_SAVE + 144], xmm15

  ; length < 256, there are no full chunks to copy
  cmp  rcx, r10
  je   RemainingBlocks

FullLoop:
  ; load a whole 256 byte chunk before storing any of it
  vmovaps  xmm0 , xmmword ptr [rdx + 000h]
  vmovaps  xmm1 , xmmword ptr [rdx + 010h]
  vmovaps  xmm2 , xmmword ptr [rdx + 020h]
  vmovaps  xmm3 , xmmword ptr [rdx + 030h]
  vmovaps  xmm4 , xmmword ptr [rdx + 040h]
  vmovaps  xmm5 , xmmword ptr [rdx + 050h]
  vmovaps  xmm6 , xmmword ptr [rdx + 060h]
  vmovaps  xmm7 , xmmword ptr [rdx + 070h]
  vmovaps  xmm8 , xmmword ptr [rdx + 080h]
  vmovaps  xmm9 , xmmword ptr [rdx + 090h]
  vmovaps  xmm10, xmmword ptr [rdx + 0A0h]
  vmovaps  xmm11, xmmword ptr [rdx + 0B0h]
  vmovaps  xmm12, xmmword ptr [rdx + 0C0h]
  vmovaps  xmm13, xmmword ptr [rdx + 0D0h]
  vmovaps  xmm14, xmmword ptr [rdx + 0E0h]
  vmovaps  xmm15, xmmword ptr [rdx + 0F0h]
  vmovntdq xmmword ptr [rcx + 000h], xmm0
  vmovntdq xmmword ptr [rcx + 010h], xmm1
  vmovntdq xmmword ptr [rcx + 020h], xmm2
  vmovntdq xmmword ptr [rcx + 030h], xmm3
  vmovntdq xmmword ptr [rcx + 040h], xmm4
  vmovntdq xmmword ptr [rcx + 050h], xmm5
  vmovntdq xmmword ptr [rcx + 060h], xmm6
  vmovntdq xmmword ptr [rcx + 070h], xmm7
  vmovntdq xmmword ptr [rcx + 080h], xmm8
  vmovntdq xmmword ptr [rcx + 090h], xmm9
  vmovntdq xmmword ptr [rcx + 0A0h], xmm10
  vmovntdq xmmword ptr [rcx + 0B0h], xmm11
  vmovntdq xmmword ptr [rcx + 0C0h], xmm12
  vmovntdq xmmword ptr [rcx + 0D0h], xmm13
  vmovntdq xmmword ptr [rcx + 0E0h], xmm14
  vmovntdq xmmword ptr [rcx + 0F0h], xmm15
  add  rdx, 0100h
  add  rcx, 0100h
  cmp  rcx, r10
  jne  FullLoop

RemainingBlocks:
  ; jump to JumpTable[15 - rem]; the table is ordered Block15..Block0 so the
  ; selected entry is Block<rem>, which falls through the rem lowest blocks
  lea  r9 , JumpTable
  mov  r10, 15
  sub  r10, r11
  imul r10, JMP_SIZE
  add  r9 , r10
  jmp  r9

JumpTable:
  jmp Block15
  jmp Block14
  jmp Block13
  jmp Block12
  jmp Block11
  jmp Block10
  jmp Block9
  jmp Block8
  jmp Block7
  jmp Block6
  jmp Block5
  jmp Block4
  jmp Block3
  jmp Block2
  jmp Block1
  jmp Block0

  ; ensure we generate near jumps: with 127 bytes between the table and its
  ; first target no entry can be encoded as a 2 byte short jump
  padding1 db 127 dup(090h)

Block15:
  vmovaps  xmm14, xmmword ptr [rdx + 0E0h]
  vmovntdq xmmword ptr [rcx + 0E0h], xmm14
Block14:
  vmovaps  xmm13, xmmword ptr [rdx + 0D0h]
  vmovntdq xmmword ptr [rcx + 0D0h], xmm13
Block13:
  vmovaps  xmm12, xmmword ptr [rdx + 0C0h]
  vmovntdq xmmword ptr [rcx + 0C0h], xmm12
Block12:
  vmovaps  xmm11, xmmword ptr [rdx + 0B0h]
  vmovntdq xmmword ptr [rcx + 0B0h], xmm11
Block11:
  vmovaps  xmm10, xmmword ptr [rdx + 0A0h]
  vmovntdq xmmword ptr [rcx + 0A0h], xmm10
Block10:
  vmovaps  xmm9 , xmmword ptr [rdx + 090h]
  vmovntdq xmmword ptr [rcx + 090h], xmm9
Block9:
  vmovaps  xmm8 , xmmword ptr [rdx + 080h]
  vmovntdq xmmword ptr [rcx + 080h], xmm8
Block8:
  vmovaps  xmm7 , xmmword ptr [rdx + 070h]
  vmovntdq xmmword ptr [rcx + 070h], xmm7
Block7:
  vmovaps  xmm6 , xmmword ptr [rdx + 060h]
  vmovntdq xmmword ptr [rcx + 060h], xmm6
Block6:
  vmovaps  xmm5 , xmmword ptr [rdx + 050h]
  vmovntdq xmmword ptr [rcx + 050h], xmm5
Block5:
  vmovaps  xmm4 , xmmword ptr [rdx + 040h]
  vmovntdq xmmword ptr [rcx + 040h], xmm4
Block4:
  vmovaps  xmm3 , xmmword ptr [rdx + 030h]
  vmovntdq xmmword ptr [rcx + 030h], xmm3
Block3:
  vmovaps  xmm2 , xmmword ptr [rdx + 020h]
  vmovntdq xmmword ptr [rcx + 020h], xmm2
Block2:
  vmovaps  xmm1 , xmmword ptr [rdx + 010h]
  vmovntdq xmmword ptr [rcx + 010h], xmm1
Block1:
  vmovaps  xmm0 , xmmword ptr [rdx + 000h]
  vmovntdq xmmword ptr [rcx + 000h], xmm0

  ; src += rem * 16; dst += rem * 16;
  shl  r11, 4
  add  rdx, r11
  add  rcx, r11

Block0:
  ; non-temporal stores are weakly ordered; fence them so the copied data is
  ; globally visible before the ordinary stores below and before anything
  ; the caller writes after this function returns
  sfence

  movdqa xmm6 , oword ptr [rsp + XMM_SAVE + 00 ]
  movdqa xmm7 , oword ptr [rsp + XMM_SAVE + 16 ]
  movdqa xmm8 , oword ptr [rsp + XMM_SAVE + 32 ]
  movdqa xmm9 , oword ptr [rsp + XMM_SAVE + 48 ]
  movdqa xmm10, oword ptr [rsp + XMM_SAVE + 64 ]
  movdqa xmm11, oword ptr [rsp + XMM_SAVE + 80 ]
  movdqa xmm12, oword ptr [rsp + XMM_SAVE + 96 ]
  movdqa xmm13, oword ptr [rsp + XMM_SAVE + 112]
  movdqa xmm14, oword ptr [rsp + XMM_SAVE + 128]
  movdqa xmm15, oword ptr [rsp + XMM_SAVE + 144]
  add    rsp, FRAME_SIZE

  ; jump to CopyTable[length & 0xF] to copy the trailing bytes
  and  r8, 0Fh
  imul r8, JMP_SIZE
  lea  r9, CopyTable
  add  r9, r8
  jmp  r9

CopyTable:
  ; entry 0: no trailing bytes, padded with nops to JMP_SIZE bytes
  ret
  nop
  nop
  nop
  nop

  jmp Copy1
  jmp Copy2
  jmp Copy3
  jmp Copy4
  jmp Copy5
  jmp Copy6
  jmp Copy7
  jmp Copy8
  jmp Copy9
  jmp Copy10
  jmp Copy11
  jmp Copy12
  jmp Copy13
  jmp Copy14

  ; copy 15, entry 15 is the code itself as nothing follows it in the table
  mov  r8  , qword ptr [rdx + 00h]
  mov  r9d , dword ptr [rdx + 08h]
  mov  r10w, word  ptr [rdx + 0Ch]
  mov  al  , byte  ptr [rdx + 0Eh]
  mov  qword ptr [rcx + 00h], r8
  mov  dword ptr [rcx + 08h], r9d
  mov  word  ptr [rcx + 0Ch], r10w
  mov  byte  ptr [rcx + 0Eh], al
  ret

  ; ensure we generate near jumps (see padding1)
  padding2 db 127 dup(090h)

Copy1:
  mov  al, byte ptr [rdx]
  mov  byte ptr [rcx], al
  ret

Copy2:
  mov  r10w, word ptr [rdx]
  mov  word ptr [rcx], r10w
  ret

Copy3:
  mov  r10w, word ptr [rdx]
  mov  word ptr [rcx], r10w
  mov  al, byte ptr [rdx + 02h]
  mov  byte ptr [rcx + 02h], al
  ret

Copy4:
  mov  r9d, dword ptr [rdx]
  mov  dword ptr [rcx], r9d
  ret

Copy5:
  mov  r9d, dword ptr [rdx]
  mov  dword ptr [rcx], r9d
  mov  al, byte ptr [rdx + 04h]
  mov  byte ptr [rcx + 04h], al
  ret

Copy6:
  mov  r9d, dword ptr [rdx]
  mov  dword ptr [rcx], r9d
  mov  r10w, word ptr [rdx + 04h]
  mov  word ptr [rcx + 04h], r10w
  ret

Copy7:
  mov  r9d, dword ptr [rdx]
  mov  dword ptr [rcx], r9d
  mov  r10w, word ptr [rdx + 04h]
  mov  word ptr [rcx + 04h], r10w
  mov  al, byte ptr [rdx + 06h]
  mov  byte ptr [rcx + 06h], al
  ret

Copy8:
  mov  r8, qword ptr [rdx]
  mov  qword ptr [rcx], r8
  ret

Copy9:
  mov  r8, qword ptr [rdx]
  mov  qword ptr [rcx], r8
  mov  al, byte ptr [rdx + 08h]
  mov  byte ptr [rcx + 08h], al
  ret

Copy10:
  mov  r8, qword ptr [rdx]
  mov  qword ptr [rcx], r8
  mov  r10w, word ptr [rdx + 08h]
  mov  word ptr [rcx + 08h], r10w
  ret

Copy11:
  mov  r8, qword ptr [rdx]
  mov  qword ptr [rcx], r8
  mov  r10w, word ptr [rdx + 08h]
  mov  word ptr [rcx + 08h], r10w
  mov  al, byte ptr [rdx + 0Ah]
  mov  byte ptr [rcx + 0Ah], al
  ret

Copy12:
  mov  r8, qword ptr [rdx]
  mov  qword ptr [rcx], r8
  mov  r9d, dword ptr [rdx + 08h]
  mov  dword ptr [rcx + 08h], r9d
  ret

Copy13:
  mov  r8, qword ptr [rdx]
  mov  qword ptr [rcx], r8
  mov  r9d, dword ptr [rdx + 08h]
  mov  dword ptr [rcx + 08h], r9d
  mov  al, byte ptr [rdx + 0Ch]
  mov  byte ptr [rcx + 0Ch], al
  ret

Copy14:
  mov  r8  , qword ptr [rdx      ]
  mov  r9d , dword ptr [rdx + 08h]
  mov  r10w, word  ptr [rdx + 0Ch]
  mov  qword ptr [rcx      ], r8
  mov  dword ptr [rcx + 08h], r9d
  mov  word  ptr [rcx + 0Ch], r10w
  ret

memcpySSE endp
end
|
@ -26,6 +26,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
#include "debug.h"
|
||||
|
||||
#if defined(__GNUC__) || defined(__GNUG__)
|
||||
#define OP(...) #__VA_ARGS__ "\n\t"
|
||||
|
||||
inline static void memcpySSE(void *dst, const void * src, size_t length)
|
||||
@ -79,7 +80,7 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
|
||||
OP(add %[rem],%[end])
|
||||
OP(jmp *%[end])
|
||||
|
||||
// jump table
|
||||
// jump table
|
||||
OP(vmovaps 0x60(%[src]),%%xmm0)
|
||||
OP(vmovntdq %%xmm0,0x60(%[dst]))
|
||||
OP(vmovaps 0x50(%[src]),%%xmm1)
|
||||
@ -95,28 +96,28 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
|
||||
OP(vmovaps 0x00(%[src]),%%xmm6)
|
||||
OP(vmovntdq %%xmm6,0x00(%[dst]))
|
||||
|
||||
// alignment as the previous two instructions are only 4 bytes
|
||||
// alignment as the previous two instructions are only 4 bytes
|
||||
OP(nop)
|
||||
OP(nop)
|
||||
|
||||
// restore the registers
|
||||
// restore the registers
|
||||
OP(pop %[end])
|
||||
OP(pop %[src])
|
||||
OP(pop %[dst])
|
||||
:
|
||||
: [dst]"r" (dst),
|
||||
[src]"r" (src),
|
||||
[end]"c" (end),
|
||||
[rem]"d" (rem)
|
||||
: "xmm0",
|
||||
"xmm1",
|
||||
"xmm2",
|
||||
"xmm3",
|
||||
"xmm4",
|
||||
"xmm5",
|
||||
"xmm6",
|
||||
"xmm7",
|
||||
"memory"
|
||||
:
|
||||
: [dst]"r" (dst),
|
||||
[src]"r" (src),
|
||||
[end]"c" (end),
|
||||
[rem]"d" (rem)
|
||||
: "xmm0",
|
||||
"xmm1",
|
||||
"xmm2",
|
||||
"xmm3",
|
||||
"xmm4",
|
||||
"xmm5",
|
||||
"xmm6",
|
||||
"xmm7",
|
||||
"memory"
|
||||
);
|
||||
|
||||
//copy any remaining bytes
|
||||
@ -126,4 +127,7 @@ inline static void memcpySSE(void *dst, const void * src, size_t length)
|
||||
#else
|
||||
memcpy(dst, src, length);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
#else
|
||||
extern "C" void __fastcall memcpySSE(void *dst, const void * src, size_t length);
|
||||
#endif
|
Loading…
Reference in New Issue
Block a user