LookingGlass/common/memcpySSE.asm

237 lines
5.0 KiB
NASM

.code
;-----------------------------------------------------------------------
; memcpySSE - copy len bytes from src to dst using 128-bit vector loads
;             and non-temporal 128-bit stores.
;
; Syntax: MASM (ml64), x86-64.
; In:     rcx = dst
;         rdx = src
;         r8  = len (bytes)
; Out:    no defined return value (rax is NOT set to dst; al is scratch)
; Clobb:  al, rcx, rdx, r8, r9, r10, r11, xmm0-xmm5, flags
;         xmm6/xmm7 are spilled to the stack and restored before return.
; Stack:  72 bytes. The movdqa spills at [rsp + 32] and [rsp + 48] are
;         only 16-byte aligned if rsp == 8 (mod 16) on entry, i.e. the
;         caller had rsp 16-byte aligned at the call instruction.
; Align:  vmovaps and vmovntdq fault on addresses that are not 16-byte
;         aligned, so both src and dst must be 16-byte aligned whenever
;         len >= 16.
;
; Structure:
;   1. @FullLoop        - 128 bytes per iteration while len >= 128
;   2. @FinalBlocks     - 0..7 remaining 16-byte blocks (computed jump)
;   3. @EndBlocks       - 0..15 trailing bytes with scalar moves
;-----------------------------------------------------------------------
memcpySSE proc
        ; Nothing to do for an empty copy or when dst == src.
        test    r8, r8
        jz      @Exit
        cmp     rcx, rdx
        je      @Exit

        ; Reserve 8 + 2*16 + 4*8 = 72 bytes and spill xmm6/xmm7, which
        ; are overwritten by the copy loops below.
        sub     rsp, 8 + 2*16 + 4*8
        movdqa  oword ptr [rsp + 4*8 + 00 ], xmm6
        movdqa  oword ptr [rsp + 4*8 + 16 ], xmm7

        ; void * end = dst + (length & ~0x7F);
        ; end = r10
        mov     r9 , r8
        and     r9 , -080h                      ; ~0x7F: whole 128-byte chunks only
        jz      @RemainingBlocks                ; len < 128, skip the main loop
        mov     r10, rcx
        add     r10, r9

@FullLoop:
        ; Load 8 x 16 bytes, then stream them out with non-temporal stores.
        vmovaps  xmm0 , xmmword ptr [rdx + 000h]
        vmovaps  xmm1 , xmmword ptr [rdx + 010h]
        vmovaps  xmm2 , xmmword ptr [rdx + 020h]
        vmovaps  xmm3 , xmmword ptr [rdx + 030h]
        vmovaps  xmm4 , xmmword ptr [rdx + 040h]
        vmovaps  xmm5 , xmmword ptr [rdx + 050h]
        vmovaps  xmm6 , xmmword ptr [rdx + 060h]
        vmovaps  xmm7 , xmmword ptr [rdx + 070h]
        vmovntdq xmmword ptr [rcx + 000h], xmm0
        vmovntdq xmmword ptr [rcx + 010h], xmm1
        vmovntdq xmmword ptr [rcx + 020h], xmm2
        vmovntdq xmmword ptr [rcx + 030h], xmm3
        vmovntdq xmmword ptr [rcx + 040h], xmm4
        vmovntdq xmmword ptr [rcx + 050h], xmm5
        vmovntdq xmmword ptr [rcx + 060h], xmm6
        vmovntdq xmmword ptr [rcx + 070h], xmm7
        add     rdx, 080h
        add     rcx, 080h
        cmp     rcx, r10                        ; r10 = dst + n*128, so this terminates
        jne     @FullLoop

@RemainingBlocks:
        ; size_t rem = (length & 0x7F) >> 4;
        ; rem = r11  (number of whole 16-byte blocks left, 0..7)
        mov     r11, r8
        and     r11, 07Fh
        jz      @RestoreExit                    ; len was a multiple of 128
        shr     r11, 4

        ; Jump into @FinalBlocks so that exactly r11 load/store pairs run.
        ; The dispatch relies on every pair occupying 10 bytes of code:
        ; skipping (7 - rem) pairs means skipping (7 - rem) * 10 bytes.
        ; With rem == 0 the jump lands just past the last pair.
        mov     r10, 7
        sub     r10, r11
        imul    r10, 10
        lea     r9 , @FinalBlocks
        add     r9 , r10
        jmp     r9

@RestoreExit:
        movdqa  xmm6 , oword ptr [rsp + 4*8 + 00 ]
        movdqa  xmm7 , oword ptr [rsp + 4*8 + 16 ]
        add     rsp, 8 + 2*16 + 4*8
@Exit:
        sfence                                  ; order the non-temporal stores
        ret

@FinalBlocks:
        ; DO NOT reorder, re-encode or insert instructions here: the
        ; computed jump above assumes 10 bytes per pair. The two NOPs pad
        ; the last pair, which presumably assembles shorter because its
        ; zero displacement is omitted - verify against the listing file
        ; if the assembler or operands ever change.
        vmovaps  xmm6 , xmmword ptr [rdx + 060h]
        vmovntdq xmmword ptr [rcx + 060h], xmm6
        vmovaps  xmm5 , xmmword ptr [rdx + 050h]
        vmovntdq xmmword ptr [rcx + 050h], xmm5
        vmovaps  xmm4 , xmmword ptr [rdx + 040h]
        vmovntdq xmmword ptr [rcx + 040h], xmm4
        vmovaps  xmm3 , xmmword ptr [rdx + 030h]
        vmovntdq xmmword ptr [rcx + 030h], xmm3
        vmovaps  xmm2 , xmmword ptr [rdx + 020h]
        vmovntdq xmmword ptr [rcx + 020h], xmm2
        vmovaps  xmm1 , xmmword ptr [rdx + 010h]
        vmovntdq xmmword ptr [rcx + 010h], xmm1
        vmovaps  xmm0 , xmmword ptr [rdx + 000h]
        vmovntdq xmmword ptr [rcx + 000h], xmm0
        nop
        nop

        ; Step both pointers past the 16-byte blocks just copied.
        ; r11 still holds the block count; r9 holds the jump target and
        ; must not be used here.
        shl     r11, 4                          ; blocks -> bytes
        add     rdx, r11
        add     rcx, r11

@EndBlocks:
        ; r8 = len & 0x0F: trailing bytes (0..15) handled with scalar moves.
        and     r8, 0Fh
        jz      @RestoreExit
        cmp     r8, 2
        je      @Copy2
        cmp     r8, 3
        je      @Copy3
        cmp     r8, 4
        je      @Copy4
        cmp     r8, 5
        je      @Copy5
        cmp     r8, 6
        je      @Copy6
        cmp     r8, 7
        je      @Copy7
        cmp     r8, 8
        je      @Copy8
        cmp     r8, 9
        je      @Copy9
        cmp     r8, 10
        je      @Copy10
        cmp     r8, 11
        je      @Copy11
        cmp     r8, 12
        je      @Copy12
        cmp     r8, 13
        je      @Copy13
        cmp     r8, 14
        je      @Copy14
        cmp     r8, 15
        je      @Copy15

        ; fall through - 1 byte
        mov     al, byte ptr [rdx]
        mov     byte ptr [rcx], al
        jmp     @RestoreExit

@Copy2:
        mov     r10w, word ptr [rdx]
        mov     word ptr [rcx], r10w
        jmp     @RestoreExit

@Copy3:
        mov     r10w, word ptr [rdx]
        mov     word ptr [rcx], r10w
        mov     al, byte ptr [rdx + 02h]
        mov     byte ptr [rcx + 02h], al
        jmp     @RestoreExit

@Copy4:
        mov     r9d, dword ptr [rdx]
        mov     dword ptr [rcx], r9d
        jmp     @RestoreExit

@Copy5:
        mov     r9d, dword ptr [rdx ]
        mov     al , byte ptr [rdx + 04h]
        mov     dword ptr [rcx ], r9d
        mov     byte ptr [rcx + 04h], al
        jmp     @RestoreExit

@Copy6:
        mov     r9d , dword ptr [rdx ]
        mov     r10w, word ptr [rdx + 04h]
        mov     dword ptr [rcx ], r9d
        mov     word ptr [rcx + 04h], r10w
        jmp     @RestoreExit

@Copy7:
        mov     r9d , dword ptr [rdx ]
        mov     r10w, word ptr [rdx + 04h]
        mov     al , byte ptr [rdx + 06h]
        mov     dword ptr [rcx ], r9d
        mov     word ptr [rcx + 04h], r10w
        mov     byte ptr [rcx + 06h], al
        jmp     @RestoreExit

        ; From here on r8 is reused as a 64-bit scratch register; the
        ; byte count it held has already been consumed by the dispatch.
@Copy8:
        mov     r8, qword ptr [rdx]
        mov     qword ptr [rcx], r8
        jmp     @RestoreExit

@Copy9:
        mov     r8, qword ptr [rdx ]
        mov     al, byte ptr [rdx + 08h]
        mov     qword ptr [rcx ], r8
        mov     byte ptr [rcx + 08h], al
        jmp     @RestoreExit

@Copy10:
        mov     r8 , qword ptr [rdx ]
        mov     r10w, word ptr [rdx + 08h]
        mov     qword ptr [rcx ], r8
        mov     word ptr [rcx + 08h], r10w
        jmp     @RestoreExit

@Copy11:
        mov     r8 , qword ptr [rdx ]
        mov     r10w, word ptr [rdx + 08h]
        mov     al , byte ptr [rdx + 0Ah]
        mov     qword ptr [rcx ], r8
        mov     word ptr [rcx + 08h], r10w
        mov     byte ptr [rcx + 0Ah], al
        jmp     @RestoreExit

@Copy12:
        mov     r8 , qword ptr [rdx ]
        mov     r9d, dword ptr [rdx + 08h]
        mov     qword ptr [rcx ], r8
        mov     dword ptr [rcx + 08h], r9d
        jmp     @RestoreExit

@Copy13:
        mov     r8 , qword ptr [rdx ]
        mov     r9d, dword ptr [rdx + 08h]
        mov     al , byte ptr [rdx + 0Ch]
        mov     qword ptr [rcx ], r8
        mov     dword ptr [rcx + 08h], r9d
        mov     byte ptr [rcx + 0Ch], al
        jmp     @RestoreExit

@Copy14:
        mov     r8 , qword ptr [rdx ]
        mov     r9d , dword ptr [rdx + 08h]
        mov     r10w, word ptr [rdx + 0Ch]
        mov     qword ptr [rcx ], r8
        mov     dword ptr [rcx + 08h], r9d
        mov     word ptr [rcx + 0Ch], r10w
        jmp     @RestoreExit

        ; copy 15
@Copy15:
        mov     r8 , qword ptr [rdx + 00h]
        mov     r9d , dword ptr [rdx + 08h]
        mov     r10w, word ptr [rdx + 0Ch]
        mov     al , byte ptr [rdx + 0Eh]
        mov     qword ptr [rcx + 00h], r8
        mov     dword ptr [rcx + 08h], r9d
        mov     word ptr [rcx + 0Ch], r10w
        mov     byte ptr [rcx + 0Eh], al
        jmp     @RestoreExit
memcpySSE endp
end