From 3330f83af606fdfbe591b3dc4e4ec764fa7b8241 Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Sun, 19 Nov 2023 02:52:11 +1100 Subject: [PATCH] [common] add runtime detection and selection of AVX/AVX2 support --- common/include/common/cpuinfo.h | 15 +++++ common/include/common/framebuffer.h | 3 +- common/include/common/rects.h | 14 +--- common/src/cpuinfo.c | 71 +++++++++++++++++++++ common/src/framebuffer.c | 99 ++++++++++++++++++++++++++++- common/src/rects.c | 57 +++++++++++++++++ 6 files changed, 245 insertions(+), 14 deletions(-) diff --git a/common/include/common/cpuinfo.h b/common/include/common/cpuinfo.h index e0a00c83..0a0ad58e 100644 --- a/common/include/common/cpuinfo.h +++ b/common/include/common/cpuinfo.h @@ -29,4 +29,19 @@ bool cpuInfo_get(char * model, size_t modelSize, int * procs, int * cores, void cpuInfo_log(void); +typedef struct +{ + bool sse, sse2, sse3, ssse3; + bool fma; + bool sse4_1, sse4_2; + bool popcnt; + bool aes; + bool xsave, osxsave; + bool avx, avx2; + bool bmi1, bmi2; +} +CPUInfoFeatures; + +const CPUInfoFeatures * cpuInfo_getFeatures(void); + #endif diff --git a/common/include/common/framebuffer.h b/common/include/common/framebuffer.h index 5a6714a1..0941befa 100644 --- a/common/include/common/framebuffer.h +++ b/common/include/common/framebuffer.h @@ -70,7 +70,8 @@ void framebuffer_prepare(FrameBuffer * frame); /** * Write data from the src buffer into the KVMFRFrame */ -bool framebuffer_write(FrameBuffer * frame, const void * src, size_t size); +extern bool (*framebuffer_write)(FrameBuffer * frame, + const void * restrict src, size_t size); /** * Gets the underlying data buffer of the framebuffer. diff --git a/common/include/common/rects.h b/common/include/common/rects.h index b238c711..df7e93fb 100644 --- a/common/include/common/rects.h +++ b/common/include/common/rects.h @@ -27,18 +27,8 @@ #include "common/framebuffer.h" #include "common/types.h" -inline static void rectCopyUnaligned(uint8_t * dst, const uint8_t * src, - int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) -{ - src += ystart * srcPitch + dx; - dst += ystart * dstPitch + dx; - for (int i = ystart; i < yend; ++i) - { - memcpy(dst, src, width); - src += srcPitch; - dst += dstPitch; - } -} +extern void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src, + int ystart, int yend, int dx, int dstPitch, int srcPitch, int width); void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp, FrameBuffer * frame, int dstPitch, int height, diff --git a/common/src/cpuinfo.c b/common/src/cpuinfo.c index 10f75f06..7fd557b4 100644 --- a/common/src/cpuinfo.c +++ b/common/src/cpuinfo.c @@ -20,6 +20,7 @@ #include "common/cpuinfo.h" #include "common/debug.h" +#include "common/util.h" void cpuInfo_log(void) { @@ -37,3 +38,73 @@ void cpuInfo_log(void) DEBUG_INFO("CPU Model: %s", model); DEBUG_INFO("CPU: %d sockets, %d cores, %d threads", sockets, cores, procs); } + +const CPUInfoFeatures * cpuInfo_getFeatures(void) +{ + static bool initialized = false; + static CPUInfoFeatures features; + + if (likely(initialized)) + return &features; + + int cpuid[4] = {0}; + + // leaf1 + asm volatile + ( + "cpuid;" + : "=a" (cpuid[0]), + "=b" (cpuid[1]), + "=c" (cpuid[2]), + "=d" (cpuid[3]) + : "a" (1) + ); + + features.sse = cpuid[3] & (1 << 25); + features.sse2 = cpuid[3] & (1 << 26); + features.sse3 = cpuid[2] & (1 << 0); + features.ssse3 = cpuid[2] & (1 << 9); + features.fma = cpuid[2] & (1 << 12); + features.sse4_1 = cpuid[2] & (1 << 19); + features.sse4_2 = cpuid[2] & (1 << 20); + features.popcnt = cpuid[2] & (1 << 23); + features.aes = cpuid[2] & (1 << 25); + features.xsave = cpuid[2] & (1 << 26); + features.osxsave = cpuid[2] & (1 << 27); + features.avx = cpuid[2] & (1 << 28); + + // leaf7 + asm volatile + ( + "cpuid;" + : "=a" (cpuid[0]), + "=b" (cpuid[1]), + "=c" (cpuid[2]), + "=d" (cpuid[3]) + : "a" (7), "c" (0) + ); + + features.avx2 = cpuid[1] & (1 << 5); + features.bmi1 = cpuid[2] & (1 << 3); + features.bmi2 = cpuid[2] & (1 << 8); + + if (features.osxsave && features.avx) + { + int xgetbv = 0; + asm volatile + ( + "xgetbv;" + : "=a" (xgetbv) + : "c" (0) + : "edx" + ); + + if (!(xgetbv & 0x6)) + { + features.avx = false; + features.avx2 = false; + } + } + + return &features; +}; diff --git a/common/src/framebuffer.c b/common/src/framebuffer.c index c743d5cf..d7b7a93f 100644 --- a/common/src/framebuffer.c +++ b/common/src/framebuffer.c @@ -19,6 +19,7 @@ */ #include "common/framebuffer.h" +#include "common/cpuinfo.h" #include "common/debug.h" //#define FB_PROFILE @@ -29,6 +30,7 @@ #include #include #include +#include #include bool framebuffer_wait(const FrameBuffer * frame, size_t size) @@ -165,7 +167,8 @@ void framebuffer_prepare(FrameBuffer * frame) atomic_store_explicit(&frame->wp, 0, memory_order_release); } -bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t size) +static bool framebuffer_write_sse4_1(FrameBuffer * frame, + const void * restrict src, size_t size) { #ifdef FB_PROFILE static RunningAvg ra = NULL; @@ -222,6 +225,100 @@ bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t si return true; } +#pragma GCC push_options +#pragma GCC target ("avx2") +bool framebuffer_write_avx2(FrameBuffer * frame, + const void * restrict src, size_t size) +{ +#ifdef FB_PROFILE + static RunningAvg ra = NULL; + static int raCount = 0; + const uint64_t ts = microtime(); + if (!ra) + ra = runningavg_new(100); +#endif + + __m256i *restrict s = (__m256i *)src; + __m256i *restrict d = (__m256i *)frame->data; + size_t wp = 0; + + _mm_mfence(); + + /* copy in chunks */ + while (size > 127) + { + __m256i *_d = (__m256i *)d; + __m256i *_s = (__m256i *)s; + __m256i v1 = _mm256_stream_load_si256(_s + 0); + __m256i v2 = _mm256_stream_load_si256(_s + 1); + __m256i v3 = _mm256_stream_load_si256(_s + 2); + __m256i v4 = _mm256_stream_load_si256(_s + 3); + + _mm256_stream_si256(_d + 0, v1); + _mm256_stream_si256(_d + 1, v2); + _mm256_stream_si256(_d + 2, v3); + _mm256_stream_si256(_d + 3, v4); + + s += 4; + d += 4; + size -= 128; + wp += 128; + + if (wp % FB_CHUNK_SIZE == 0) + atomic_store_explicit(&frame->wp, wp, memory_order_release); + } + + if (size > 63) + { + __m256i *_d = (__m256i *)d; + __m256i *_s = (__m256i *)s; + __m256i v1 = _mm256_stream_load_si256(_s); + __m256i v2 = _mm256_stream_load_si256(_s + 1); + + _mm256_stream_si256(_d, v1); + _mm256_stream_si256(_d + 1, v2); + + s += 2; + d += 2; + size -= 64; + wp += 64; + + if (wp % FB_CHUNK_SIZE == 0) + atomic_store_explicit(&frame->wp, wp, memory_order_release); + } + + if (size) + { + memcpy(frame->data + wp, s, size); + wp += size; + } + + atomic_store_explicit(&frame->wp, wp, memory_order_release); + +#ifdef FB_PROFILE + runningavg_push(ra, microtime() - ts); + if (++raCount % 100 == 0) + DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra)); +#endif + + return true; +} +#pragma GCC pop_options + +static bool _framebuffer_write(FrameBuffer * frame, + const void * restrict src, size_t size) +{ + if (cpuInfo_getFeatures()->avx2) + framebuffer_write = &framebuffer_write_avx2; + else + framebuffer_write = &framebuffer_write_sse4_1; + + return framebuffer_write(frame, src, size); +} + +bool (*framebuffer_write)(FrameBuffer * frame, + const void * restrict src, size_t size) = &_framebuffer_write; + const uint8_t * framebuffer_get_buffer(const FrameBuffer * frame) { return frame->data; diff --git a/common/src/rects.c b/common/src/rects.c index eaa645ae..996ca1d2 100644 --- a/common/src/rects.c +++ b/common/src/rects.c @@ -20,8 +20,10 @@ #include "common/rects.h" #include "common/util.h" +#include "common/cpuinfo.h" #include +#include struct Corner { @@ -298,3 +300,58 @@ int rectsRejectContained(FrameDamageRect * rects, int count) return removeRects(rects, count, removed); } + +static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src, + int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) +{ + src += ystart * srcPitch + dx; + dst += ystart * dstPitch + dx; + for (int i = ystart; i < yend; ++i) + { + memcpy(dst, src, width); + src += srcPitch; + dst += dstPitch; + } +} + +#pragma GCC push_options +#pragma GCC target ("avx2") +static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src, + int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) +{ + src += ystart * srcPitch + dx; + dst += ystart * dstPitch + dx; + for (int i = ystart; i < yend; ++i) + { + int col; + for(col = 0; col <= width - 32; col += 32) + { + _mm_prefetch(src + col + 256, _MM_HINT_T0); + __m256i srcData = _mm256_loadu_si256((__m256i*)(src + col)); + _mm256_storeu_si256((__m256i*)(dst + col), srcData); + } + + for(; col < width; ++col) + dst[col] = src[col]; + + src += srcPitch; + dst += dstPitch; + } +} +#pragma GCC pop_options + +static void _rectCopyUnaligned(uint8_t * dst, const uint8_t * src, + int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) +{ + if (cpuInfo_getFeatures()->avx) + rectCopyUnaligned = &rectCopyUnaligned_avx; + else + rectCopyUnaligned = &rectCopyUnaligned_memcpy; + + return rectCopyUnaligned( + dst, src, ystart, yend, dx, dstPitch, srcPitch, width); +} + +void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src, + int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) = + &_rectCopyUnaligned;