mirror of
https://github.com/gnif/LookingGlass.git
synced 2025-10-12 18:38:12 +00:00
[common] add runtime detection and selection of AVX/AVX2 support
This commit is contained in:
@@ -19,6 +19,7 @@
|
||||
*/
|
||||
|
||||
#include "common/framebuffer.h"
|
||||
#include "common/cpuinfo.h"
|
||||
#include "common/debug.h"
|
||||
|
||||
//#define FB_PROFILE
|
||||
@@ -29,6 +30,7 @@
|
||||
#include <string.h>
|
||||
#include <emmintrin.h>
|
||||
#include <smmintrin.h>
|
||||
#include <immintrin.h>
|
||||
#include <unistd.h>
|
||||
|
||||
bool framebuffer_wait(const FrameBuffer * frame, size_t size)
|
||||
@@ -165,7 +167,8 @@ void framebuffer_prepare(FrameBuffer * frame)
|
||||
atomic_store_explicit(&frame->wp, 0, memory_order_release);
|
||||
}
|
||||
|
||||
bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t size)
|
||||
static bool framebuffer_write_sse4_1(FrameBuffer * frame,
|
||||
const void * restrict src, size_t size)
|
||||
{
|
||||
#ifdef FB_PROFILE
|
||||
static RunningAvg ra = NULL;
|
||||
@@ -222,6 +225,100 @@ bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t si
|
||||
return true;
|
||||
}
|
||||
|
||||
#pragma GCC push_options
|
||||
#pragma GCC target ("avx2")
|
||||
bool framebuffer_write_avx2(FrameBuffer * frame,
|
||||
const void * restrict src, size_t size)
|
||||
{
|
||||
#ifdef FB_PROFILE
|
||||
static RunningAvg ra = NULL;
|
||||
static int raCount = 0;
|
||||
const uint64_t ts = microtime();
|
||||
if (!ra)
|
||||
ra = runningavg_new(100);
|
||||
#endif
|
||||
|
||||
__m256i *restrict s = (__m256i *)src;
|
||||
__m256i *restrict d = (__m256i *)frame->data;
|
||||
size_t wp = 0;
|
||||
|
||||
_mm_mfence();
|
||||
|
||||
/* copy in chunks */
|
||||
while (size > 127)
|
||||
{
|
||||
__m256i *_d = (__m256i *)d;
|
||||
__m256i *_s = (__m256i *)s;
|
||||
__m256i v1 = _mm256_stream_load_si256(_s + 0);
|
||||
__m256i v2 = _mm256_stream_load_si256(_s + 1);
|
||||
__m256i v3 = _mm256_stream_load_si256(_s + 2);
|
||||
__m256i v4 = _mm256_stream_load_si256(_s + 3);
|
||||
|
||||
_mm256_stream_si256(_d + 0, v1);
|
||||
_mm256_stream_si256(_d + 1, v2);
|
||||
_mm256_stream_si256(_d + 2, v3);
|
||||
_mm256_stream_si256(_d + 3, v4);
|
||||
|
||||
s += 4;
|
||||
d += 4;
|
||||
size -= 128;
|
||||
wp += 128;
|
||||
|
||||
if (wp % FB_CHUNK_SIZE == 0)
|
||||
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
||||
}
|
||||
|
||||
if (size > 63)
|
||||
{
|
||||
__m256i *_d = (__m256i *)d;
|
||||
__m256i *_s = (__m256i *)s;
|
||||
__m256i v1 = _mm256_stream_load_si256(_s);
|
||||
__m256i v2 = _mm256_stream_load_si256(_s + 1);
|
||||
|
||||
_mm256_stream_si256(_d, v1);
|
||||
_mm256_stream_si256(_d + 1, v2);
|
||||
|
||||
s += 2;
|
||||
d += 2;
|
||||
size -= 64;
|
||||
wp += 64;
|
||||
|
||||
if (wp % FB_CHUNK_SIZE == 0)
|
||||
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
||||
}
|
||||
|
||||
if (size)
|
||||
{
|
||||
memcpy(frame->data + wp, s, size);
|
||||
wp += size;
|
||||
}
|
||||
|
||||
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
||||
|
||||
#ifdef FB_PROFILE
|
||||
runningavg_push(ra, microtime() - ts);
|
||||
if (++raCount % 100 == 0)
|
||||
DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra));
|
||||
#endif
|
||||
|
||||
return true;
|
||||
}
|
||||
#pragma GCC pop_options
|
||||
|
||||
static bool _framebuffer_write(FrameBuffer * frame,
|
||||
const void * restrict src, size_t size)
|
||||
{
|
||||
if (cpuInfo_getFeatures()->avx2)
|
||||
framebuffer_write = &framebuffer_write_avx2;
|
||||
else
|
||||
framebuffer_write = &framebuffer_write_sse4_1;
|
||||
|
||||
return framebuffer_write(frame, src, size);
|
||||
}
|
||||
|
||||
bool (*framebuffer_write)(FrameBuffer * frame,
|
||||
const void * restrict src, size_t size) = &_framebuffer_write;
|
||||
|
||||
const uint8_t * framebuffer_get_buffer(const FrameBuffer * frame)
|
||||
{
|
||||
return frame->data;
|
||||
|
Reference in New Issue
Block a user