diff --git a/host/Capture/NvFBC.cpp b/host/Capture/NvFBC.cpp index 2f0cc141..b337f636 100644 --- a/host/Capture/NvFBC.cpp +++ b/host/Capture/NvFBC.cpp @@ -22,6 +22,7 @@ using namespace Capture; #include #include "common\debug.h" +#include "common\memcpySSE.h" #include "Util.h" #ifdef _WIN64 @@ -260,7 +261,7 @@ bool NvFBC::GrabFrame(struct FrameInfo & frame) uint8_t *src = (uint8_t *)m_frameBuffer + dataOffset; uint8_t *dst = (uint8_t *)frame.buffer; for(unsigned int y = 0; y < frame.height; ++y, dst += dataWidth, src += m_grabInfo.dwBufferWidth * 3) - Util::Memcpy64(dst, src, dataWidth); + memcpySSE(dst, src, dataWidth); return true; } diff --git a/host/Util.h b/host/Util.h index 69428961..71f5e9b1 100644 --- a/host/Util.h +++ b/host/Util.h @@ -103,71 +103,4 @@ public: _mm_stream_si128((__m128i *)&dest[32], v2); } } - - static inline void Memcpy64(void * dst, void * src, size_t length) - { - // check if we can't perform an aligned copy - if (((uintptr_t)src & 0xF) != ((uintptr_t)dst & 0xF)) - { - - static bool unalignedDstWarn = false; - if (!unalignedDstWarn) - { - DEBUG_WARN("Memcpy64 unable to perform aligned copy, performance will suffer"); - unalignedDstWarn = true; - } - - memcpy(dst, src, length); - return; - } - - // check if the source needs slight alignment - { - uint8_t * _src = (uint8_t *)src; - unsigned int count = (16 - ((uintptr_t)src & 0xF)) & 0xF; - - static bool unalignedSrcWarn = false; - if (count > 0) - { - if (!unalignedSrcWarn) - { - DEBUG_WARN("Memcpy64 unaligned source, performance will suffer"); - unalignedSrcWarn = true; - } - - uint8_t * _dst = (uint8_t *)dst; - for (unsigned int i = count; i > 0; --i) - *_dst++ = *_src++; - src = _src; - dst = _dst; - length -= count; - } - } - - // perform the SMID copy - __m128i * _src = (__m128i *)src; - __m128i * _dst = (__m128i *)dst; - __m128i * _end = (__m128i *)src + (length / 16); - for (; _src != _end; _src += 8, _dst += 8) - { - _mm_prefetch((char *)(_src + 16), _MM_HINT_NTA); - _mm_prefetch((char *)(_src + 24), _MM_HINT_NTA); - __m128i v0 = _mm_load_si128(_src + 0); - __m128i v1 = _mm_load_si128(_src + 1); - __m128i v2 = _mm_load_si128(_src + 2); - __m128i v3 = _mm_load_si128(_src + 3); - __m128i v4 = _mm_load_si128(_src + 4); - __m128i v5 = _mm_load_si128(_src + 5); - __m128i v6 = _mm_load_si128(_src + 6); - __m128i v7 = _mm_load_si128(_src + 7); - _mm_stream_si128(_dst + 0, v0); - _mm_stream_si128(_dst + 1, v1); - _mm_stream_si128(_dst + 2, v2); - _mm_stream_si128(_dst + 3, v3); - _mm_stream_si128(_dst + 4, v4); - _mm_stream_si128(_dst + 5, v5); - _mm_stream_si128(_dst + 6, v6); - _mm_stream_si128(_dst + 7, v7); - } - } }; \ No newline at end of file