From e61678ef1b898b8d443e6a4cb00fe912f52d4647 Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Sun, 19 Nov 2023 00:09:42 +1100 Subject: [PATCH] [common] add AVX/AVX2 memory copy implementations --- common/include/common/rects.h | 14 +----- common/src/framebuffer.c | 84 ++++++++++++++++++++++++++++++++++- common/src/rects.c | 39 ++++++++++++++++ 3 files changed, 124 insertions(+), 13 deletions(-) diff --git a/common/include/common/rects.h b/common/include/common/rects.h index b238c711..738530dd 100644 --- a/common/include/common/rects.h +++ b/common/include/common/rects.h @@ -27,18 +27,8 @@ #include "common/framebuffer.h" #include "common/types.h" -inline static void rectCopyUnaligned(uint8_t * dst, const uint8_t * src, - int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) -{ - src += ystart * srcPitch + dx; - dst += ystart * dstPitch + dx; - for (int i = ystart; i < yend; ++i) - { - memcpy(dst, src, width); - src += srcPitch; - dst += dstPitch; - } -} +void rectCopyUnaligned(uint8_t * dst, const uint8_t * src, + int ystart, int yend, int dx, int dstPitch, int srcPitch, int width); void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp, FrameBuffer * frame, int dstPitch, int height, diff --git a/common/src/framebuffer.c b/common/src/framebuffer.c index c743d5cf..8f6f379a 100644 --- a/common/src/framebuffer.c +++ b/common/src/framebuffer.c @@ -29,6 +29,7 @@ #include #include #include +#include #include bool framebuffer_wait(const FrameBuffer * frame, size_t size) @@ -165,7 +166,9 @@ void framebuffer_prepare(FrameBuffer * frame) atomic_store_explicit(&frame->wp, 0, memory_order_release); } -bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t size) +__attribute__((target("default"))) +bool framebuffer_write(FrameBuffer * frame, const void * restrict src, + size_t size) { #ifdef FB_PROFILE static RunningAvg ra = NULL; @@ -222,6 +225,85 @@ bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t si return true; } +#if 1 +__attribute__((target("avx2"))) +bool framebuffer_write(FrameBuffer *frame, const void *restrict src, size_t size) +{ +#ifdef FB_PROFILE + static RunningAvg ra = NULL; + static int raCount = 0; + const uint64_t ts = microtime(); + if (!ra) + ra = runningavg_new(100); +#endif + + __m256i *restrict s = (__m256i *)src; + __m256i *restrict d = (__m256i *)frame->data; + size_t wp = 0; + + _mm_mfence(); + + /* copy in chunks */ + while (size > 127) + { + __m256i *_d = (__m256i *)d; + __m256i *_s = (__m256i *)s; + __m256i v1 = _mm256_stream_load_si256(_s + 0); + __m256i v2 = _mm256_stream_load_si256(_s + 1); + __m256i v3 = _mm256_stream_load_si256(_s + 2); + __m256i v4 = _mm256_stream_load_si256(_s + 3); + + _mm256_stream_si256(_d + 0, v1); + _mm256_stream_si256(_d + 1, v2); + _mm256_stream_si256(_d + 2, v3); + _mm256_stream_si256(_d + 3, v4); + + s += 4; + d += 4; + size -= 128; + wp += 128; + + if (wp % FB_CHUNK_SIZE == 0) + atomic_store_explicit(&frame->wp, wp, memory_order_release); + } + + if (size > 63) + { + __m256i *_d = (__m256i *)d; + __m256i *_s = (__m256i *)s; + __m256i v1 = _mm256_stream_load_si256(_s); + __m256i v2 = _mm256_stream_load_si256(_s + 1); + + _mm256_stream_si256(_d, v1); + _mm256_stream_si256(_d + 1, v2); + + s += 2; + d += 2; + size -= 64; + wp += 64; + + if (wp % FB_CHUNK_SIZE == 0) + atomic_store_explicit(&frame->wp, wp, memory_order_release); + } + + if (size) + { + memcpy(frame->data + wp, s, size); + wp += size; + } + + atomic_store_explicit(&frame->wp, wp, memory_order_release); + +#ifdef FB_PROFILE + runningavg_push(ra, microtime() - ts); + if (++raCount % 100 == 0) + DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra)); +#endif + + return true; +} +#endif + const uint8_t * framebuffer_get_buffer(const FrameBuffer * frame) { return frame->data; diff --git a/common/src/rects.c b/common/src/rects.c index eaa645ae..59be63f8 100644 --- a/common/src/rects.c +++ b/common/src/rects.c @@ -22,6 +22,45 @@ #include "common/util.h" #include +#include + +__attribute__((target("default"))) +void rectCopyUnaligned(uint8_t * dst, const uint8_t * src, + int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) +{ + src += ystart * srcPitch + dx; + dst += ystart * dstPitch + dx; + for (int i = ystart; i < yend; ++i) + { + memcpy(dst, src, width); + src += srcPitch; + dst += dstPitch; + } +} + +__attribute__((target("avx"))) +void rectCopyUnaligned(uint8_t * dst, const uint8_t * src, + int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) +{ + src += ystart * srcPitch + dx; + dst += ystart * dstPitch + dx; + for (int i = ystart; i < yend; ++i) + { + int col; + for(col = 0; col <= width - 32; col += 32) + { + _mm_prefetch(src + col + 256, _MM_HINT_T0); + __m256i srcData = _mm256_loadu_si256((__m256i*)(src + col)); + _mm256_storeu_si256((__m256i*)(dst + col), srcData); + } + + for(; col < width; ++col) + dst[col] = src[col]; + + src += srcPitch; + dst += dstPitch; + } +} struct Corner {