[host] use the new memcpySSE implementation

This commit is contained in:
Geoffrey McRae 2018-05-22 18:58:56 +10:00
parent 6f141fe393
commit 15a337fee8
6 changed files with 186 additions and 307 deletions

View File

@ -4,8 +4,11 @@ memcpySSE proc
; src = rdx ; src = rdx
; len = r8 ; len = r8
mov rax, rcx
test r8, r8 test r8, r8
jz @Exit jz @Exit
cmp rcx, rdx cmp rcx, rdx
je @Exit je @Exit
@ -16,7 +19,7 @@ memcpySSE proc
; void * end = dst + (length & ~0x7F); ; void * end = dst + (length & ~0x7F);
; end = r10 ; end = r10
mov r9 , r8 mov r9 , r8
and r9 , -07Fh and r9 , 0FFFFFFFFFFFFFF80h
jz @RemainingBlocks jz @RemainingBlocks
mov r10, rcx mov r10, rcx
add r10, r9 add r10, r9
@ -50,6 +53,7 @@ memcpySSE proc
and r11, 07Fh and r11, 07Fh
jz @RestoreExit jz @RestoreExit
shr r11, 4 shr r11, 4
jz @FinalBytes
mov r10, 7 mov r10, 7
sub r10, r11 sub r10, r11
@ -59,12 +63,11 @@ memcpySSE proc
jmp r9 jmp r9
@RestoreExit: @RestoreExit:
movdqa xmm6 , oword ptr [rsp + 4*8 + 00 ] movdqa xmm6 , oword ptr [rsp + 4*8 + 00]
movdqa xmm7 , oword ptr [rsp + 4*8 + 16 ] movdqa xmm7 , oword ptr [rsp + 4*8 + 16]
add rsp, 8 + 2*16 + 4*8 add rsp, 8 + 2*16 + 4*8
@Exit: @Exit:
sfence
ret ret
@FinalBlocks: @FinalBlocks:
@ -82,135 +85,132 @@ memcpySSE proc
vmovntdq xmmword ptr [rcx + 010h], xmm1 vmovntdq xmmword ptr [rcx + 010h], xmm1
vmovaps xmm0 , xmmword ptr [rdx + 000h] vmovaps xmm0 , xmmword ptr [rdx + 000h]
vmovntdq xmmword ptr [rcx + 000h], xmm0 vmovntdq xmmword ptr [rcx + 000h], xmm0
nop
nop
imul r9, 16 movdqa xmm6 , oword ptr [rsp + 4*8 + 00]
add rdx, r9 movdqa xmm7 , oword ptr [rsp + 4*8 + 16]
add rcx, r9 add rsp, 8 + 2*16 + 4*8
sfence
@EndBlocks: shl r11, 4
add rdx, r11
add rcx, r11
@FinalBytes:
and r8, 0Fh and r8, 0Fh
test r8, r8 jz @Exit
je @RestoreExit imul r8, 5
lea r9, @FinalBytesTable
add r9, r8
jmp r9
cmp r8, 2 @FinalBytesTable:
je @Copy2 jmp @Copy1
cmp r8, 3 jmp @Copy2
je @Copy3 jmp @Copy3
cmp r8, 4 jmp @Copy4
je @Copy4 jmp @Copy5
cmp r8, 5 jmp @Copy6
je @Copy5 jmp @Copy7
cmp r8, 6 jmp @Copy8
je @Copy6 jmp @Copy9
cmp r8, 7 jmp @Copy10
je @Copy7 jmp @Copy11
cmp r8, 8 jmp @Copy12
je @Copy8 jmp @Copy13
cmp r8, 9 jmp @Copy14
je @Copy9 jmp @Copy15
cmp r8, 10
je @Copy10 db 128 DUP(0CCh)
cmp r8, 11
je @Copy11
cmp r8, 12
je @Copy12
cmp r8, 13
je @Copy13
cmp r8, 14
je @Copy14
cmp r8, 15
je @Copy15
; fall through - 1 byte ; fall through - 1 byte
@Copy1:
mov al, byte ptr [rdx] mov al, byte ptr [rdx]
mov byte ptr [rcx], al mov byte ptr [rcx], al
jmp @RestoreExit ret
@Copy2: @Copy2:
mov r10w, word ptr [rdx] mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w mov word ptr [rcx], r10w
jmp @RestoreExit ret
@Copy3: @Copy3:
mov r10w, word ptr [rdx] mov r10w, word ptr [rdx]
mov word ptr [rcx], r10w mov word ptr [rcx], r10w
mov al, byte ptr [rdx + 02h] mov r11b, byte ptr [rdx + 02h]
mov byte ptr [rcx + 02h], al mov byte ptr [rcx + 02h], r11b
jmp @RestoreExit ret
@Copy4: @Copy4:
mov r9d, dword ptr [rdx] mov r9d, dword ptr [rdx]
mov dword ptr [rcx], r9d mov dword ptr [rcx], r9d
jmp @RestoreExit ret
@Copy5: @Copy5:
mov r9d, dword ptr [rdx ] mov r9d, dword ptr [rdx ]
mov al , byte ptr [rdx + 04h] mov r11b , byte ptr [rdx + 04h]
mov dword ptr [rcx ], r9d mov dword ptr [rcx ], r9d
mov byte ptr [rcx + 04h], al mov byte ptr [rcx + 04h], r11b
jmp @RestoreExit ret
@Copy6: @Copy6:
mov r9d , dword ptr [rdx ] mov r9d , dword ptr [rdx ]
mov r10w, word ptr [rdx + 04h] mov r10w, word ptr [rdx + 04h]
mov dword ptr [rcx ], r9d mov dword ptr [rcx ], r9d
mov word ptr [rcx + 04h], r10w mov word ptr [rcx + 04h], r10w
jmp @RestoreExit ret
@Copy7: @Copy7:
mov r9d , dword ptr [rdx ] mov r9d , dword ptr [rdx ]
mov r10w, word ptr [rdx + 04h] mov r10w, word ptr [rdx + 04h]
mov al , byte ptr [rdx + 06h] mov r11b, byte ptr [rdx + 06h]
mov dword ptr [rcx ], r9d mov dword ptr [rcx ], r9d
mov word ptr [rcx + 04h], r10w mov word ptr [rcx + 04h], r10w
mov byte ptr [rcx + 06h], al mov byte ptr [rcx + 06h], r11b
jmp @RestoreExit ret
@Copy8: @Copy8:
mov r8, qword ptr [rdx] mov r8, qword ptr [rdx]
mov qword ptr [rcx], r8 mov qword ptr [rcx], r8
jmp @RestoreExit ret
@Copy9: @Copy9:
mov r8, qword ptr [rdx ] mov r8 , qword ptr [rdx ]
mov al, byte ptr [rdx + 08h] mov r11b, byte ptr [rdx + 08h]
mov qword ptr [rcx ], r8 mov qword ptr [rcx ], r8
mov byte ptr [rcx + 08h], al mov byte ptr [rcx + 08h], r11b
jmp @RestoreExit ret
@Copy10: @Copy10:
mov r8 , qword ptr [rdx ] mov r8 , qword ptr [rdx ]
mov r10w, word ptr [rdx + 08h] mov r10w, word ptr [rdx + 08h]
mov qword ptr [rcx ], r8 mov qword ptr [rcx ], r8
mov word ptr [rcx + 08h], r10w mov word ptr [rcx + 08h], r10w
jmp @RestoreExit ret
@Copy11: @Copy11:
mov r8 , qword ptr [rdx ] mov r8 , qword ptr [rdx ]
mov r10w, word ptr [rdx + 08h] mov r10w, word ptr [rdx + 08h]
mov al , byte ptr [rdx + 0Ah] mov r11b, byte ptr [rdx + 0Ah]
mov qword ptr [rcx ], r8 mov qword ptr [rcx ], r8
mov word ptr [rcx + 08h], r10w mov word ptr [rcx + 08h], r10w
mov byte ptr [rcx + 0Ah], al mov byte ptr [rcx + 0Ah], r11b
jmp @RestoreExit ret
@Copy12: @Copy12:
mov r8 , qword ptr [rdx ] mov r8 , qword ptr [rdx ]
mov r9d, dword ptr [rdx + 08h] mov r9d, dword ptr [rdx + 08h]
mov qword ptr [rcx ], r8 mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d mov dword ptr [rcx + 08h], r9d
jmp @RestoreExit ret
@Copy13: @Copy13:
mov r8 , qword ptr [rdx ] mov r8 , qword ptr [rdx ]
mov r9d, dword ptr [rdx + 08h] mov r9d , dword ptr [rdx + 08h]
mov al , byte ptr [rdx + 0Ch] mov r11b, byte ptr [rdx + 0Ch]
mov qword ptr [rcx ], r8 mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d mov dword ptr [rcx + 08h], r9d
mov byte ptr [rcx + 0Ch], al mov byte ptr [rcx + 0Ch], r11b
jmp @RestoreExit ret
@Copy14: @Copy14:
mov r8 , qword ptr [rdx ] mov r8 , qword ptr [rdx ]
@ -219,19 +219,19 @@ memcpySSE proc
mov qword ptr [rcx ], r8 mov qword ptr [rcx ], r8
mov dword ptr [rcx + 08h], r9d mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w mov word ptr [rcx + 0Ch], r10w
jmp @RestoreExit ret
; copy 15 ; copy 15
@Copy15: @Copy15:
mov r8 , qword ptr [rdx + 00h] mov r8 , qword ptr [rdx + 00h]
mov r9d , dword ptr [rdx + 08h] mov r9d , dword ptr [rdx + 08h]
mov r10w, word ptr [rdx + 0Ch] mov r10w, word ptr [rdx + 0Ch]
mov al , byte ptr [rdx + 0Eh] mov r11b, byte ptr [rdx + 0Eh]
mov qword ptr [rcx + 00h], r8 mov qword ptr [rcx + 00h], r8
mov dword ptr [rcx + 08h], r9d mov dword ptr [rcx + 08h], r9d
mov word ptr [rcx + 0Ch], r10w mov word ptr [rcx + 0Ch], r10w
mov byte ptr [rcx + 0Eh], al mov byte ptr [rcx + 0Eh], r11b
jmp @RestoreExit ret
memcpySSE endp memcpySSE endp
end end

View File

@ -26,108 +26,110 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#include "debug.h" #include "debug.h"
static inline void memcpySSE(void * dst, const void * src, size_t length) #if defined(NATIVE_MEMCPY)
{ #define memcpySSE memcpy
// check if we can't perform an aligned copy #elif defined(_MSC_VER)
if (((uintptr_t)src & 0xF) != ((uintptr_t)dst & 0xF)) extern "C" void * memcpySSE(void *dst, const void * src, size_t length);
#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
inline static void * memcpySSE(void *dst, const void * src, size_t length)
{ {
if (length == 0 || dst == src)
return;
static bool unalignedDstWarn = false; // copies under 1MB are faster with the inlined memcpy
if (!unalignedDstWarn) // tell the dev to use that instead
if (length < 1048576)
{ {
DEBUG_WARN("Memcpy64 unable to perform aligned copy, performance will suffer"); static bool smallBufferWarn = false;
unalignedDstWarn = true; if (!smallBufferWarn)
}
// fallback to system memcpy
memcpy(dst, src, length);
return;
}
// check if the source needs alignment
{
uint8_t * _src = (uint8_t *)src;
unsigned int count = (16 - ((uintptr_t)src & 0xF)) & 0xF;
static bool unalignedSrcWarn = false;
if (count > 0)
{
if (!unalignedSrcWarn)
{ {
DEBUG_WARN("Memcpy64 unaligned source, performance will suffer"); DEBUG_WARN("Do not use memcpySSE for copies under 1MB in size!");
unalignedSrcWarn = true; smallBufferWarn = true;
} }
memcpy(dst, src, length);
uint8_t * _dst = (uint8_t *)dst; return;
for (unsigned int i = count; i > 0; --i)
*_dst++ = *_src++;
src = _src;
dst = _dst;
length -= count;
} }
const void * end = dst + (length & ~0x7F);
const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
__asm__ __volatile__ (
"cmp %[dst],%[end] \n\t"
"je Remain_%= \n\t"
// perform SIMD block copy
"loop_%=: \n\t"
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
"vmovaps 0x70(%[src]),%%xmm7 \n\t"
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
"add $0x80,%[dst] \n\t"
"add $0x80,%[src] \n\t"
"cmp %[dst],%[end] \n\t"
"jne loop_%= \n\t"
"Remain_%=: \n\t"
// copy any remaining 16 byte blocks
"call GetPC_%=\n\t"
"Offset_%=:\n\t"
"add $(BlockTable_%= - Offset_%=), %%eax \n\t"
"add %[off],%%eax \n\t"
"jmp *%%eax \n\t"
"GetPC_%=:\n\t"
"mov (%%esp), %%eax \n\t"
"ret \n\t"
"BlockTable_%=:\n\t"
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
"nop\n\t"
"nop\n\t"
: [dst]"+r" (dst),
[src]"+r" (src)
: [off]"r" (off),
[end]"r" (end)
: "eax",
"xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"memory"
);
//copy any remaining bytes
memcpy(dst, src, length & 0xF);
} }
#else
__m128i * _src = (__m128i *)src; #define memcpySSE memcpy
__m128i * _dst = (__m128i *)dst; #endif
__m128i v0, v1, v2, v3, v4, v5, v6, v7;
const size_t sselen = length & ~0x7F;
const __m128i * _end = (__m128i *)((uintptr_t)src + sselen);
for (; _src != _end; _src += 8, _dst += 8)
{
_mm_prefetch(((char *)(_src + 8 )), _MM_HINT_NTA);
_mm_prefetch(((char *)(_src + 9 )), _MM_HINT_NTA);
_mm_prefetch(((char *)(_src + 10)), _MM_HINT_NTA);
_mm_prefetch(((char *)(_src + 11)), _MM_HINT_NTA);
v0 = _mm_load_si128(_src + 0);
v1 = _mm_load_si128(_src + 1);
v2 = _mm_load_si128(_src + 2);
v3 = _mm_load_si128(_src + 3);
v4 = _mm_load_si128(_src + 4);
v5 = _mm_load_si128(_src + 5);
v6 = _mm_load_si128(_src + 6);
v7 = _mm_load_si128(_src + 7);
_mm_stream_si128(_dst + 0, v0);
_mm_stream_si128(_dst + 1, v1);
_mm_stream_si128(_dst + 2, v2);
_mm_stream_si128(_dst + 3, v3);
_mm_stream_si128(_dst + 4, v4);
_mm_stream_si128(_dst + 5, v5);
_mm_stream_si128(_dst + 6, v6);
_mm_stream_si128(_dst + 7, v7);
}
const size_t remain = length - sselen;
switch (remain & ~0xF)
{
case 112: v0 = _mm_load_si128(_src++);
case 96: v1 = _mm_load_si128(_src++);
case 80: v2 = _mm_load_si128(_src++);
case 64: v3 = _mm_load_si128(_src++);
case 48: v4 = _mm_load_si128(_src++);
case 32: v5 = _mm_load_si128(_src++);
case 16: v6 = _mm_load_si128(_src++);
}
switch (remain & ~0xF)
{
case 112: _mm_stream_si128(_dst++, v0);
case 96: _mm_stream_si128(_dst++, v1);
case 80: _mm_stream_si128(_dst++, v2);
case 64: _mm_stream_si128(_dst++, v3);
case 48: _mm_stream_si128(_dst++, v4);
case 32: _mm_stream_si128(_dst++, v5);
case 16: _mm_stream_si128(_dst++, v6);
}
// copy any remaining data
if (remain & 0xF)
{
uint8_t * rsrc = (uint8_t *)_src;
uint8_t * rdst = (uint8_t *)_dst;
for (size_t i = remain & 0xF; i > 0; --i)
*rdst++ = *rsrc++;
}
}

View File

@ -1,135 +0,0 @@
/*
KVMGFX Client - A KVM Client for VGA Passthrough
Copyright (C) 2017 Geoffrey McRae <geoff@hostfission.com>
https://looking-glass.hostfission.com
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.
This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.
You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place, Suite 330, Boston, MA 02111-1307 USA
*/
#pragma once
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <tmmintrin.h>
#include <immintrin.h>
#include "debug.h"
#if defined(NATIVE_MEMCPY)
#define memcpySSE memcpy
#elif defined(_MSC_VER)
extern "C" void memcpySSE(void *dst, const void * src, size_t length);
#elif (defined(__GNUC__) || defined(__GNUG__)) && defined(__i386__)
inline static void memcpySSE(void *dst, const void * src, size_t length)
{
if (length == 0 || dst == src)
return;
// copies under 1MB are faster with the inlined memcpy
// tell the dev to use that instead
if (length < 1048576)
{
static bool smallBufferWarn = false;
if (!smallBufferWarn)
{
DEBUG_WARN("Do not use memcpySSE for copies under 1MB in size!");
smallBufferWarn = true;
}
memcpy(dst, src, length);
return;
}
const void * end = dst + (length & ~0x7F);
const size_t off = (7 - ((length & 0x7F) >> 4)) * 10;
__asm__ __volatile__ (
"cmp %[dst],%[end] \n\t"
"je Remain_%= \n\t"
// perform SIMD block copy
"loop_%=: \n\t"
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
"vmovaps 0x70(%[src]),%%xmm7 \n\t"
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
"vmovntdq %%xmm7 ,0x70(%[dst]) \n\t"
"add $0x80,%[dst] \n\t"
"add $0x80,%[src] \n\t"
"cmp %[dst],%[end] \n\t"
"jne loop_%= \n\t"
"Remain_%=: \n\t"
// copy any remaining 16 byte blocks
"call GetPC_%=\n\t"
"Offset_%=:\n\t"
"add $(BlockTable_%= - Offset_%=), %%eax \n\t"
"add %[off],%%eax \n\t"
"jmp *%%eax \n\t"
"GetPC_%=:\n\t"
"mov (%%esp), %%eax \n\t"
"ret \n\t"
"BlockTable_%=:\n\t"
"vmovaps 0x60(%[src]),%%xmm6 \n\t"
"vmovntdq %%xmm6 ,0x60(%[dst]) \n\t"
"vmovaps 0x50(%[src]),%%xmm5 \n\t"
"vmovntdq %%xmm5 ,0x50(%[dst]) \n\t"
"vmovaps 0x40(%[src]),%%xmm4 \n\t"
"vmovntdq %%xmm4 ,0x40(%[dst]) \n\t"
"vmovaps 0x30(%[src]),%%xmm3 \n\t"
"vmovntdq %%xmm3 ,0x30(%[dst]) \n\t"
"vmovaps 0x20(%[src]),%%xmm2 \n\t"
"vmovntdq %%xmm2 ,0x20(%[dst]) \n\t"
"vmovaps 0x10(%[src]),%%xmm1 \n\t"
"vmovntdq %%xmm1 ,0x10(%[dst]) \n\t"
"vmovaps 0x00(%[src]),%%xmm0 \n\t"
"vmovntdq %%xmm0 ,0x00(%[dst]) \n\t"
"nop\n\t"
"nop\n\t"
: [dst]"+r" (dst),
[src]"+r" (src)
: [off]"r" (off),
[end]"r" (end)
: "eax",
"xmm0",
"xmm1",
"xmm2",
"xmm3",
"xmm4",
"xmm5",
"xmm6",
"xmm7",
"memory"
);
//copy any remaining bytes
memcpy(dst, src, length & 0xF);
}
#else
#define memcpySSE memcpy
#endif

View File

@ -651,7 +651,7 @@ GrabStatus Capture::DXGI::GrabFrameTexture(FrameInfo & frame, ID3D11Texture2DPtr
if ( if (
m_lastMousePos.x != frameInfo.PointerPosition.Position.x || m_lastMousePos.x != frameInfo.PointerPosition.Position.x ||
m_lastMousePos.y != frameInfo.PointerPosition.Position.y m_lastMousePos.y != frameInfo.PointerPosition.Position.y
) { ) {
cursorUpdate = true; cursorUpdate = true;
frame.cursor.hasPos = true; frame.cursor.hasPos = true;
frame.cursor.x = frameInfo.PointerPosition.Position.x; frame.cursor.x = frameInfo.PointerPosition.Position.x;
@ -761,7 +761,9 @@ GrabStatus Capture::DXGI::GrabFrameRaw(FrameInfo & frame)
while(true) while(true)
{ {
TRACE_START("GrabFrame");
result = GrabFrameTexture(frame, src, timeout); result = GrabFrameTexture(frame, src, timeout);
TRACE_END;
if (result != GRAB_STATUS_OK) if (result != GRAB_STATUS_OK)
return result; return result;
@ -773,7 +775,7 @@ GrabStatus Capture::DXGI::GrabFrameRaw(FrameInfo & frame)
// send the last frame again if we timeout to prevent the client stalling on restart // send the last frame again if we timeout to prevent the client stalling on restart
frame.pitch = m_mapping.RowPitch; frame.pitch = m_mapping.RowPitch;
frame.stride = m_mapping.RowPitch / 4; frame.stride = m_mapping.RowPitch >> 2;
unsigned int size = m_height * m_mapping.RowPitch; unsigned int size = m_height * m_mapping.RowPitch;
m_memcpy.Copy(frame.buffer, m_mapping.pData, LG_MIN(size, frame.bufferSize)); m_memcpy.Copy(frame.buffer, m_mapping.pData, LG_MIN(size, frame.bufferSize));

View File

@ -92,6 +92,7 @@
</PropertyGroup> </PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings"> <ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\masm.props" />
</ImportGroup> </ImportGroup>
<ImportGroup Label="Shared"> <ImportGroup Label="Shared">
</ImportGroup> </ImportGroup>
@ -351,7 +352,11 @@
<ClInclude Include="TraceUtil.h" /> <ClInclude Include="TraceUtil.h" />
<ClInclude Include="Util.h" /> <ClInclude Include="Util.h" />
</ItemGroup> </ItemGroup>
<ItemGroup>
<MASM Include="..\common\memcpySSE.asm" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" /> <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets"> <ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\masm.targets" />
</ImportGroup> </ImportGroup>
</Project> </Project>

View File

@ -81,4 +81,9 @@
<Filter>Header Files</Filter> <Filter>Header Files</Filter>
</ClInclude> </ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup>
<MASM Include="..\common\memcpySSE.asm">
<Filter>Source Files</Filter>
</MASM>
</ItemGroup>
</Project> </Project>