diff --git a/common/memcpySSE.asm b/common/memcpySSE.asm
index 7a0c78e0..ad2c85e8 100644
--- a/common/memcpySSE.asm
+++ b/common/memcpySSE.asm
@@ -4,8 +4,11 @@ memcpySSE proc
   ; src = rdx
   ; len = r8
 
+  mov rax, rcx
+
   test r8, r8
   jz @Exit
+
   cmp rcx, rdx
   je @Exit
 
@@ -16,7 +19,7 @@ memcpySSE proc
   ; void * end = dst + (length & ~0x7F);
   ; end = r10
   mov r9 , r8
-  and r9 , -07Fh
+  and r9 , 0FFFFFFFFFFFFFF80h
   jz @RemainingBlocks
   mov r10, rcx
   add r10, r9
@@ -50,6 +53,7 @@ memcpySSE proc
   and r11, 07Fh
   jz @RestoreExit
   shr r11, 4
+  jz @FinalBytes
 
   mov r10, 7
   sub r10, r11
@@ -59,12 +63,11 @@ memcpySSE proc
   jmp r9
 
   @RestoreExit:
-  movdqa xmm6 , oword ptr [rsp + 4*8 + 00 ]
-  movdqa xmm7 , oword ptr [rsp + 4*8 + 16 ]
+  movdqa xmm6 , oword ptr [rsp + 4*8 + 00]
+  movdqa xmm7 , oword ptr [rsp + 4*8 + 16]
   add rsp, 8 + 2*16 + 4*8
 
   @Exit:
-  sfence
   ret
 
   @FinalBlocks:
@@ -82,135 +85,132 @@ memcpySSE proc
   vmovntdq xmmword ptr [rcx + 010h], xmm1
   vmovaps  xmm0 , xmmword ptr [rdx + 000h]
   vmovntdq xmmword ptr [rcx + 000h], xmm0
-  nop
-  nop
 
-  imul r9, 16
-  add rdx, r9
-  add rcx, r9
+  movdqa xmm6 , oword ptr [rsp + 4*8 + 00]
+  movdqa xmm7 , oword ptr [rsp + 4*8 + 16]
+  add rsp, 8 + 2*16 + 4*8
+  sfence
 
-  @EndBlocks:
+  shl r11, 4
+  add rdx, r11
+  add rcx, r11
+
+  @FinalBytes:
   and r8, 0Fh
-  test r8, r8
-  je @RestoreExit
+  jz @Exit
+  imul r8, 5
+  lea r9, @FinalBytesTable
+  add r9, r8
+  jmp r9
 
-  cmp r8, 2
-  je @Copy2
-  cmp r8, 3
-  je @Copy3
-  cmp r8, 4
-  je @Copy4
-  cmp r8, 5
-  je @Copy5
-  cmp r8, 6
-  je @Copy6
-  cmp r8, 7
-  je @Copy7
-  cmp r8, 8
-  je @Copy8
-  cmp r8, 9
-  je @Copy9
-  cmp r8, 10
-  je @Copy10
-  cmp r8, 11
-  je @Copy11
-  cmp r8, 12
-  je @Copy12
-  cmp r8, 13
-  je @Copy13
-  cmp r8, 14
-  je @Copy14
-  cmp r8, 15
-  je @Copy15
+  @FinalBytesTable:
+  jmp @Copy1
+  jmp @Copy2
+  jmp @Copy3
+  jmp @Copy4
+  jmp @Copy5
+  jmp @Copy6
+  jmp @Copy7
+  jmp @Copy8
+  jmp @Copy9
+  jmp @Copy10
+  jmp @Copy11
+  jmp @Copy12
+  jmp @Copy13
+  jmp @Copy14
+  jmp @Copy15
+
+  db 128 DUP(0CCh)
 
   ; fall through - 1 byte
+  @Copy1:
   mov al, byte ptr [rdx]
   mov byte ptr [rcx], al
-  jmp @RestoreExit
+  ret
 
   @Copy2:
   mov r10w, word ptr [rdx]
   mov word ptr [rcx], r10w
-  jmp @RestoreExit
+  ret
 
   @Copy3:
   mov r10w, word ptr [rdx]
   mov word ptr [rcx], r10w
-  mov al, byte ptr [rdx + 02h]
-  mov byte ptr [rcx + 02h], al
-  jmp @RestoreExit
+  mov r11b, byte ptr [rdx + 02h]
+  mov byte ptr [rcx + 02h], r11b
+  ret
 
   @Copy4:
   mov r9d, dword ptr [rdx]
   mov dword ptr [rcx], r9d
-  jmp @RestoreExit
+  ret
 
   @Copy5:
   mov r9d, dword ptr [rdx ]
-  mov al , byte ptr [rdx + 04h]
+  mov r11b , byte ptr [rdx + 04h]
   mov dword ptr [rcx ], r9d
-  mov byte ptr [rcx + 04h], al
-  jmp @RestoreExit
+  mov byte ptr [rcx + 04h], r11b
+  ret
 
   @Copy6:
   mov r9d , dword ptr [rdx ]
   mov r10w, word ptr [rdx + 04h]
   mov dword ptr [rcx ], r9d
   mov word ptr [rcx + 04h], r10w
-  jmp @RestoreExit
+  ret
 
   @Copy7:
   mov r9d , dword ptr [rdx ]
   mov r10w, word ptr [rdx + 04h]
-  mov al , byte ptr [rdx + 06h]
+  mov r11b, byte ptr [rdx + 06h]
   mov dword ptr [rcx ], r9d
   mov word ptr [rcx + 04h], r10w
-  mov byte ptr [rcx + 06h], al
-  jmp @RestoreExit
+  mov byte ptr [rcx + 06h], r11b
+  ret
 
   @Copy8:
   mov r8, qword ptr [rdx]
   mov qword ptr [rcx], r8
-  jmp @RestoreExit
+  ret
 
   @Copy9:
-  mov r8, qword ptr [rdx ]
-  mov al, byte ptr [rdx + 08h]
+  mov r8 , qword ptr [rdx ]
+  mov r11b, byte ptr [rdx + 08h]
   mov qword ptr [rcx ], r8
-  mov byte ptr [rcx + 08h], al
-  jmp @RestoreExit
+  mov byte ptr [rcx + 08h], r11b
+  ret
 
   @Copy10:
   mov r8 , qword ptr [rdx ]
   mov r10w, word ptr [rdx + 08h]
   mov qword ptr [rcx ], r8
   mov word ptr [rcx + 08h], r10w
-  jmp @RestoreExit
+  ret
 
   @Copy11:
   mov r8 , qword ptr [rdx ]
   mov r10w, word ptr [rdx + 08h]
-  mov al , byte ptr [rdx + 0Ah]
+  mov r11b, byte ptr [rdx + 0Ah]
   mov qword ptr [rcx ], r8
   mov word ptr [rcx + 08h], r10w
-  mov byte ptr [rcx + 0Ah], al
-  jmp @RestoreExit
+  mov byte ptr [rcx + 0Ah], r11b
+  ret
 
   @Copy12:
   mov r8 , qword ptr [rdx ]
   mov r9d, dword ptr [rdx + 08h]
   mov qword ptr [rcx ], r8
   mov dword ptr [rcx + 08h], r9d
-  jmp @RestoreExit
+  ret
 
   @Copy13:
-  mov r8 , qword ptr [rdx ]
-  mov r9d, dword ptr [rdx + 08h]
-  mov al , byte ptr [rdx + 0Ch]
+  mov r8  , qword ptr [rdx ]
+  mov r9d , dword ptr [rdx + 08h]
+  mov r11b, byte ptr [rdx + 0Ch]
   mov qword ptr [rcx ], r8
   mov dword ptr [rcx + 08h], r9d
-  mov byte ptr [rcx + 0Ch], al
-  jmp @RestoreExit
+  mov byte ptr [rcx + 0Ch], r11b
+  ret
 
   @Copy14:
   mov r8 , qword ptr [rdx ]
@@ -219,19 +219,19 @@ memcpySSE proc
   mov qword ptr [rcx ], r8
   mov dword ptr [rcx + 08h], r9d
   mov word ptr [rcx + 0Ch], r10w
-  jmp @RestoreExit
+  ret
 
   ; copy 15
   @Copy15:
   mov r8  , qword ptr [rdx + 00h]
   mov r9d , dword ptr [rdx + 08h]
   mov r10w, word ptr [rdx + 0Ch]
-  mov al  , byte ptr [rdx + 0Eh]
+  mov r11b, byte ptr [rdx + 0Eh]
   mov qword ptr [rcx + 00h], r8
   mov dword ptr [rcx + 08h], r9d
   mov word ptr [rcx + 0Ch], r10w
-  mov byte ptr [rcx + 0Eh], al
-  jmp @RestoreExit
+  mov byte ptr [rcx + 0Eh], r11b
+  ret
 
 memcpySSE endp
 end
\ No newline at end of file
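Note: the headline fix in memcpySSE.asm is the block mask. `-07Fh` sign-extends to 0FFFFFFFFFFFFFF81h, which keeps bit 0 and clears bit 7, so the 128-byte block length came out wrong; 0FFFFFFFFFFFFFF80h is the intended ~0x7F. The new `mov rax, rcx` makes the routine return dst like memcpy, matching the `void *` prototype now in memcpySSE.h, and the `db 128 DUP(0CCh)` padding pushes the @CopyN targets more than 127 bytes past @FinalBytesTable so that MASM has to emit uniform five-byte near jumps, which the `imul r8, 5` indexing assumes. A quick self-contained check of the mask arithmetic (illustrative only, not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // a 511 byte copy should round down to three whole 128 byte blocks (0x180)
        const uint64_t len = 0x1FF;
        assert((uint64_t)-0x7F == 0xFFFFFFFFFFFFFF81ull); // old mask: ...FF81
        assert(~(uint64_t)0x7F == 0xFFFFFFFFFFFFFF80ull); // new mask: ...FF80
        assert((len & (uint64_t)-0x7F) == 0x181);         // old result is bogus
        assert((len & ~(uint64_t)0x7F) == 0x180);         // new result is correct
        return 0;
    }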
diff --git a/common/memcpySSE.h b/common/memcpySSE.h
index 2fab7c46..61e8b6ca 100644
--- a/common/memcpySSE.h
+++ b/common/memcpySSE.h
@@ -26,108 +26,113 @@ Place, Suite 330, Boston, MA 02111-1307 USA
 
 #include "debug.h"
 
-static inline void memcpySSE(void * dst, const void * src, size_t length)
-{
-  // check if we can't perform an aligned copy
-  if (((uintptr_t)src & 0xF) != ((uintptr_t)dst & 0xF))
+#if defined(NATIVE_MEMCPY)
+  #define memcpySSE memcpy
+#elif defined(_MSC_VER)
+  extern "C" void * memcpySSE(void *dst, const void * src, size_t length);
+#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
+  inline static void * memcpySSE(void *dst, const void * src, size_t length)
   {
-    static bool unalignedDstWarn = false;
-    if (!unalignedDstWarn)
+    void * const odst = dst;
+
+    if (length == 0 || dst == src)
+      return dst;
+
+    // copies under 1MB are faster with the inlined memcpy
+    // tell the dev to use that instead
+    if (length < 1048576)
     {
-      DEBUG_WARN("Memcpy64 unable to perform aligned copy, performance will suffer");
-      unalignedDstWarn = true;
-    }
-
-    // fallback to system memcpy
-    memcpy(dst, src, length);
-    return;
-  }
-
-  // check if the source needs alignment
-  {
-    uint8_t * _src = (uint8_t *)src;
-    unsigned int count = (16 - ((uintptr_t)src & 0xF)) & 0xF;
-
-    static bool unalignedSrcWarn = false;
-    if (count > 0)
-    {
-      if (!unalignedSrcWarn)
+      static bool smallBufferWarn = false;
+      if (!smallBufferWarn)
       {
-        DEBUG_WARN("Memcpy64 unaligned source, performance will suffer");
-        unalignedSrcWarn = true;
+        DEBUG_WARN("Do not use memcpySSE for copies under 1MB in size!");
+        smallBufferWarn = true;
       }
-
-      uint8_t * _dst = (uint8_t *)dst;
-      for (unsigned int i = count; i > 0; --i)
-        *_dst++ = *_src++;
-      src = _src;
-      dst = _dst;
-      length -= count;
+      memcpy(dst, src, length);
+      return dst;
     }
+
+    const void * end = dst + (length & ~0x7F);
+    const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
+
+    __asm__ __volatile__ (
+      "cmp %[dst],%[end] \n\t"
+      "je Remain_%= \n\t"
+
+      // perform SIMD block copy
+      "loop_%=: \n\t"
+      "vmovaps 0x00(%[src]),%%xmm0 \n\t"
+      "vmovaps 0x10(%[src]),%%xmm1 \n\t"
+      "vmovaps 0x20(%[src]),%%xmm2 \n\t"
+      "vmovaps 0x30(%[src]),%%xmm3 \n\t"
+      "vmovaps 0x40(%[src]),%%xmm4 \n\t"
+      "vmovaps 0x50(%[src]),%%xmm5 \n\t"
+      "vmovaps 0x60(%[src]),%%xmm6 \n\t"
+      "vmovaps 0x70(%[src]),%%xmm7 \n\t"
+      "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
+      "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
+      "vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
+      "vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
+      "vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
+      "vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
+      "vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
+      "vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
+      "add $0x80,%[dst] \n\t"
+      "add $0x80,%[src] \n\t"
+      "cmp %[dst],%[end] \n\t"
+      "jne loop_%= \n\t"
+
+      "Remain_%=: \n\t"
+
+      // copy any remaining 16 byte blocks
+      "call GetPC_%=\n\t"
+      "Offset_%=:\n\t"
+      "add $(BlockTable_%= - Offset_%=), %%eax \n\t"
+      "add %[off],%%eax \n\t"
+      "jmp *%%eax \n\t"
+
+      "GetPC_%=:\n\t"
+      "mov (%%esp), %%eax \n\t"
+      "ret \n\t"
+
+      "BlockTable_%=:\n\t"
+      "vmovaps 0x60(%[src]),%%xmm6 \n\t"
+      "vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
+      "vmovaps 0x50(%[src]),%%xmm5 \n\t"
+      "vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
+      "vmovaps 0x40(%[src]),%%xmm4 \n\t"
+      "vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
+      "vmovaps 0x30(%[src]),%%xmm3 \n\t"
+      "vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
+      "vmovaps 0x20(%[src]),%%xmm2 \n\t"
+      "vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
+      "vmovaps 0x10(%[src]),%%xmm1 \n\t"
+      "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
+      "vmovaps 0x00(%[src]),%%xmm0 \n\t"
+      "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
+      "nop\n\t"
+      "nop\n\t"
+
+      : [dst]"+r" (dst),
+        [src]"+r" (src)
+      : [off]"r" (off),
+        [end]"r" (end)
+      : "eax",
+        "xmm0",
+        "xmm1",
+        "xmm2",
+        "xmm3",
+        "xmm4",
+        "xmm5",
+        "xmm6",
+        "xmm7",
+        "memory"
+      );
+
+    //copy any remaining bytes
+    memcpy(dst, src, length & 0xF);
+    return odst;
   }
-
-  __m128i * _src = (__m128i *)src;
-  __m128i * _dst = (__m128i *)dst;
-  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
-
-  const size_t sselen = length & ~0x7F;
-  const __m128i * _end = (__m128i *)((uintptr_t)src + sselen);
-  for (; _src != _end; _src += 8, _dst += 8)
-  {
-    _mm_prefetch(((char *)(_src + 8 )), _MM_HINT_NTA);
-    _mm_prefetch(((char *)(_src + 9 )), _MM_HINT_NTA);
-    _mm_prefetch(((char *)(_src + 10)), _MM_HINT_NTA);
-    _mm_prefetch(((char *)(_src + 11)), _MM_HINT_NTA);
-
-    v0 = _mm_load_si128(_src + 0);
-    v1 = _mm_load_si128(_src + 1);
-    v2 = _mm_load_si128(_src + 2);
-    v3 = _mm_load_si128(_src + 3);
-    v4 = _mm_load_si128(_src + 4);
-    v5 = _mm_load_si128(_src + 5);
-    v6 = _mm_load_si128(_src + 6);
-    v7 = _mm_load_si128(_src + 7);
-
-    _mm_stream_si128(_dst + 0, v0);
-    _mm_stream_si128(_dst + 1, v1);
-    _mm_stream_si128(_dst + 2, v2);
-    _mm_stream_si128(_dst + 3, v3);
-    _mm_stream_si128(_dst + 4, v4);
-    _mm_stream_si128(_dst + 5, v5);
-    _mm_stream_si128(_dst + 6, v6);
-    _mm_stream_si128(_dst + 7, v7);
-  }
-
-  const size_t remain = length - sselen;
-  switch (remain & ~0xF)
-  {
-    case 112: v0 = _mm_load_si128(_src++);
-    case 96:  v1 = _mm_load_si128(_src++);
-    case 80:  v2 = _mm_load_si128(_src++);
-    case 64:  v3 = _mm_load_si128(_src++);
-    case 48:  v4 = _mm_load_si128(_src++);
-    case 32:  v5 = _mm_load_si128(_src++);
-    case 16:  v6 = _mm_load_si128(_src++);
-  }
-
-  switch (remain & ~0xF)
-  {
-    case 112: _mm_stream_si128(_dst++, v0);
-    case 96:  _mm_stream_si128(_dst++, v1);
-    case 80:  _mm_stream_si128(_dst++, v2);
-    case 64:  _mm_stream_si128(_dst++, v3);
-    case 48:  _mm_stream_si128(_dst++, v4);
-    case 32:  _mm_stream_si128(_dst++, v5);
-    case 16:  _mm_stream_si128(_dst++, v6);
-  }
-
-  // copy any remaining data
-  if (remain & 0xF)
-  {
-    uint8_t * rsrc = (uint8_t *)_src;
-    uint8_t * rdst = (uint8_t *)_dst;
-    for (size_t i = remain & 0xF; i > 0; --i)
-      *rdst++ = *rsrc++;
-  }
-}
\ No newline at end of file
+#else
+  #define memcpySSE memcpy
+#endif
\ No newline at end of file
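Note: in the i386 inline-asm path, the remainder of fewer than eight 16-byte blocks is handled by jumping off = (7 - rem16) * 10 bytes into BlockTable, where rem16 = (length & 0x7F) >> 4. Each vmovaps/vmovntdq pair assembles to ten bytes in practice (the trailing nops pad the final zero-displacement pair), so entering at pair (7 - rem16) executes exactly the last rem16 entries, and the call into GetPC followed by `mov (%%esp), %%eax; ret` fetches EIP because i386 has no RIP-relative lea. The same "jump into the middle of unrolled code" idea can be expressed in portable C++ as a Duff's-device switch; a minimal sketch, with illustrative names not taken from the patch:

    #include <cstdint>
    #include <cstring>

    // dst/src already point past the 128-byte bulk region, as in the asm
    static void copy_tail_blocks(uint8_t * dst, const uint8_t * src, unsigned rem16)
    {
        switch (rem16) {  // asm: jmp BlockTable + (7 - rem16) * 10
        case 7: memcpy(dst + 0x60, src + 0x60, 16); /* fall through */
        case 6: memcpy(dst + 0x50, src + 0x50, 16); /* fall through */
        case 5: memcpy(dst + 0x40, src + 0x40, 16); /* fall through */
        case 4: memcpy(dst + 0x30, src + 0x30, 16); /* fall through */
        case 3: memcpy(dst + 0x20, src + 0x20, 16); /* fall through */
        case 2: memcpy(dst + 0x10, src + 0x10, 16); /* fall through */
        case 1: memcpy(dst + 0x00, src + 0x00, 16); /* fall through */
        case 0: break;
        }
    }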
diff --git a/common/memcpySSE2.h b/common/memcpySSE2.h
deleted file mode 100644
index e4bcc89f..00000000
--- a/common/memcpySSE2.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
-KVMGFX Client - A KVM Client for VGA Passthrough
-Copyright (C) 2017 Geoffrey McRae
-https://looking-glass.hostfission.com
-
-This program is free software; you can redistribute it and/or modify it under
-the terms of the GNU General Public License as published by the Free Software
-Foundation; either version 2 of the License, or (at your option) any later
-version.
-
-This program is distributed in the hope that it will be useful, but WITHOUT ANY
-WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-PARTICULAR PURPOSE. See the GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License along with
-this program; if not, write to the Free Software Foundation, Inc., 59 Temple
-Place, Suite 330, Boston, MA 02111-1307 USA
-*/
-
-#pragma once
-#include <stdint.h>
-#include <stdbool.h>
-#include <string.h>
-#include <stdio.h>
-#include <emmintrin.h>
-
-#include "debug.h"
-
-#if defined(NATIVE_MEMCPY)
-  #define memcpySSE memcpy
-#elif defined(_MSC_VER)
-  extern "C" void memcpySSE(void *dst, const void * src, size_t length);
-#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
-  inline static void memcpySSE(void *dst, const void * src, size_t length)
-  {
-    if (length == 0 || dst == src)
-      return;
-
-    // copies under 1MB are faster with the inlined memcpy
-    // tell the dev to use that instead
-    if (length < 1048576)
-    {
-      static bool smallBufferWarn = false;
-      if (!smallBufferWarn)
-      {
-        DEBUG_WARN("Do not use memcpySSE for copies under 1MB in size!");
-        smallBufferWarn = true;
-      }
-      memcpy(dst, src, length);
-      return;
-    }
-
-    const void * end = dst + (length & ~0x7F);
-    const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
-
-    __asm__ __volatile__ (
-      "cmp %[dst],%[end] \n\t"
-      "je Remain_%= \n\t"
-
-      // perform SIMD block copy
-      "loop_%=: \n\t"
-      "vmovaps 0x00(%[src]),%%xmm0 \n\t"
-      "vmovaps 0x10(%[src]),%%xmm1 \n\t"
-      "vmovaps 0x20(%[src]),%%xmm2 \n\t"
-      "vmovaps 0x30(%[src]),%%xmm3 \n\t"
-      "vmovaps 0x40(%[src]),%%xmm4 \n\t"
-      "vmovaps 0x50(%[src]),%%xmm5 \n\t"
-      "vmovaps 0x60(%[src]),%%xmm6 \n\t"
-      "vmovaps 0x70(%[src]),%%xmm7 \n\t"
-      "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
-      "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
-      "vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
-      "vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
-      "vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
-      "vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
-      "vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
-      "vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
-      "add $0x80,%[dst] \n\t"
-      "add $0x80,%[src] \n\t"
-      "cmp %[dst],%[end] \n\t"
-      "jne loop_%= \n\t"
-
-      "Remain_%=: \n\t"
-
-      // copy any remaining 16 byte blocks
-      "call GetPC_%=\n\t"
-      "Offset_%=:\n\t"
-      "add $(BlockTable_%= - Offset_%=), %%eax \n\t"
-      "add %[off],%%eax \n\t"
-      "jmp *%%eax \n\t"
-
-      "GetPC_%=:\n\t"
-      "mov (%%esp), %%eax \n\t"
-      "ret \n\t"
-
-      "BlockTable_%=:\n\t"
-      "vmovaps 0x60(%[src]),%%xmm6 \n\t"
-      "vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
-      "vmovaps 0x50(%[src]),%%xmm5 \n\t"
-      "vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
-      "vmovaps 0x40(%[src]),%%xmm4 \n\t"
-      "vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
-      "vmovaps 0x30(%[src]),%%xmm3 \n\t"
-      "vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
-      "vmovaps 0x20(%[src]),%%xmm2 \n\t"
-      "vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
-      "vmovaps 0x10(%[src]),%%xmm1 \n\t"
-      "vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
-      "vmovaps 0x00(%[src]),%%xmm0 \n\t"
-      "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
-      "nop\n\t"
-      "nop\n\t"
-
-      : [dst]"+r" (dst),
-        [src]"+r" (src)
-      : [off]"r" (off),
-        [end]"r" (end)
-      : "eax",
-        "xmm0",
-        "xmm1",
-        "xmm2",
-        "xmm3",
-        "xmm4",
-        "xmm5",
-        "xmm6",
-        "xmm7",
-        "memory"
-      );
-
-    //copy any remaining bytes
-    memcpy(dst, src, length & 0xF);
-  }
-#else
-  #define memcpySSE memcpy
-#endif
\ No newline at end of file
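Note: memcpySSE2.h was a duplicate of the body that now lives in memcpySSE.h (minus the memcpy-style `void *` return), so it is deleted in favour of the single header. Every variant in this patch is built on the same idea: captured frames are written once and read by another process, so the stores use vmovntdq / _mm_stream_si128 to bypass the cache, with a store fence afterwards to drain the write-combining buffers. The intrinsic form of that pattern, as a compilable sketch (assumes 16-byte aligned pointers; not part of the patch):

    #include <emmintrin.h>
    #include <cstddef>

    static void stream_copy(void * dst, const void * src, size_t blocks16)
    {
        const __m128i * s = (const __m128i *)src; // must be 16-byte aligned
        __m128i       * d = (__m128i *)dst;       // must be 16-byte aligned

        for (size_t i = 0; i < blocks16; ++i)
            _mm_stream_si128(d + i, _mm_load_si128(s + i)); // non-temporal store

        _mm_sfence(); // make the streamed data globally visible before returning
    }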
diff --git a/host/Capture/DXGI.cpp b/host/Capture/DXGI.cpp
index ab44b683..180942c1 100644
--- a/host/Capture/DXGI.cpp
+++ b/host/Capture/DXGI.cpp
@@ -651,7 +651,7 @@ GrabStatus Capture::DXGI::GrabFrameTexture(FrameInfo & frame, ID3D11Texture2DPtr
     if (
       m_lastMousePos.x != frameInfo.PointerPosition.Position.x ||
       m_lastMousePos.y != frameInfo.PointerPosition.Position.y
-      ) {
+    ) {
       cursorUpdate = true;
      frame.cursor.hasPos = true;
       frame.cursor.x = frameInfo.PointerPosition.Position.x;
@@ -761,7 +761,9 @@ GrabStatus Capture::DXGI::GrabFrameRaw(FrameInfo & frame)
 
   while(true)
   {
+    TRACE_START("GrabFrame");
     result = GrabFrameTexture(frame, src, timeout);
+    TRACE_END;
 
     if (result != GRAB_STATUS_OK)
       return result;
@@ -773,7 +775,7 @@ GrabStatus Capture::DXGI::GrabFrameRaw(FrameInfo & frame)
 
     // send the last frame again if we timeout to prevent the client stalling on restart
     frame.pitch  = m_mapping.RowPitch;
-    frame.stride = m_mapping.RowPitch / 4;
+    frame.stride = m_mapping.RowPitch >> 2;
 
     unsigned int size = m_height * m_mapping.RowPitch;
     m_memcpy.Copy(frame.buffer, m_mapping.pData, LG_MIN(size, frame.bufferSize));
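Note: in this code pitch is the mapped row length in bytes while stride is in pixels, and the duplicated desktop surface is a 32-bit (B8G8R8A8) format, so dividing the pitch by four, now written as a shift, yields the stride; the TRACE_START/TRACE_END pair simply wraps GrabFrameTexture for timing. A sketch of the relationship (the helper name is illustrative, and it assumes the 4-byte-per-pixel format):

    #include <cassert>
    #include <cstdint>

    static inline uint32_t strideFromPitch(uint32_t rowPitchBytes)
    {
        // RowPitch may include driver padding beyond width * 4, but it is
        // always a multiple of the 4-byte pixel size, so >> 2 equals / 4
        assert(rowPitchBytes % 4 == 0);
        return rowPitchBytes >> 2;
    }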
- "vmovntdq %%xmm0 ,0x00(%[dst]) \n\t" - "nop\n\t" - "nop\n\t" - - : [dst]"+r" (dst), - [src]"+r" (src) - : [off]"r" (off), - [end]"r" (end) - : "eax", - "xmm0", - "xmm1", - "xmm2", - "xmm3", - "xmm4", - "xmm5", - "xmm6", - "xmm7", - "memory" - ); - - //copy any remaining bytes - memcpy(dst, src, length & 0xF); - } -#else - #define memcpySSE memcpy -#endif \ No newline at end of file diff --git a/host/Capture/DXGI.cpp b/host/Capture/DXGI.cpp index ab44b683..180942c1 100644 --- a/host/Capture/DXGI.cpp +++ b/host/Capture/DXGI.cpp @@ -651,7 +651,7 @@ GrabStatus Capture::DXGI::GrabFrameTexture(FrameInfo & frame, ID3D11Texture2DPtr if ( m_lastMousePos.x != frameInfo.PointerPosition.Position.x || m_lastMousePos.y != frameInfo.PointerPosition.Position.y - ) { + ) { cursorUpdate = true; frame.cursor.hasPos = true; frame.cursor.x = frameInfo.PointerPosition.Position.x; @@ -761,7 +761,9 @@ GrabStatus Capture::DXGI::GrabFrameRaw(FrameInfo & frame) while(true) { + TRACE_START("GrabFrame"); result = GrabFrameTexture(frame, src, timeout); + TRACE_END; if (result != GRAB_STATUS_OK) return result; @@ -773,7 +775,7 @@ GrabStatus Capture::DXGI::GrabFrameRaw(FrameInfo & frame) // send the last frame again if we timeout to prevent the client stalling on restart frame.pitch = m_mapping.RowPitch; - frame.stride = m_mapping.RowPitch / 4; + frame.stride = m_mapping.RowPitch >> 2; unsigned int size = m_height * m_mapping.RowPitch; m_memcpy.Copy(frame.buffer, m_mapping.pData, LG_MIN(size, frame.bufferSize)); diff --git a/host/looking-glass-host.vcxproj b/host/looking-glass-host.vcxproj index c90635e5..c40135bd 100644 --- a/host/looking-glass-host.vcxproj +++ b/host/looking-glass-host.vcxproj @@ -92,6 +92,7 @@ + @@ -351,7 +352,11 @@ + + + + \ No newline at end of file diff --git a/host/looking-glass-host.vcxproj.filters b/host/looking-glass-host.vcxproj.filters index a6816a3d..9bba02a4 100644 --- a/host/looking-glass-host.vcxproj.filters +++ b/host/looking-glass-host.vcxproj.filters @@ -81,4 +81,9 @@ Header Files + + + Source Files + + \ No newline at end of file