From 196050bd2381f4d7dbf6149c3347932951a47ebf Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Tue, 8 Jun 2021 15:08:13 +1000 Subject: [PATCH] [common] framebuffer: improve client framebuffer read performance Extensive profiling reveals that the glibc memcpy performs up to 2x faster then the existing SIMD implementation that was in use here. This patch also will copy large 1MB chunks if the pitch of the source and destination match further increasing throughput. --- common/include/common/framebuffer.h | 2 +- common/src/framebuffer.c | 104 +++++++++++++--------------- 2 files changed, 48 insertions(+), 58 deletions(-) diff --git a/common/include/common/framebuffer.h b/common/include/common/framebuffer.h index 69f106f7..857fb058 100644 --- a/common/include/common/framebuffer.h +++ b/common/include/common/framebuffer.h @@ -37,7 +37,7 @@ extern const size_t FrameBufferStructSize; /** * Wait for the framebuffer to fill to the specified size */ -void framebuffer_wait(const FrameBuffer * frame, size_t size); +bool framebuffer_wait(const FrameBuffer * frame, size_t size); /** * Read data from the KVMFRFrame into the dst buffer diff --git a/common/src/framebuffer.c b/common/src/framebuffer.c index 5e910914..4fb5a547 100644 --- a/common/src/framebuffer.c +++ b/common/src/framebuffer.c @@ -21,6 +21,11 @@ #include "common/framebuffer.h" #include "common/debug.h" +//#define FB_PROFILE +#ifdef FB_PROFILE +#include "common/runningavg.h" +#endif + #include #include #include @@ -38,7 +43,7 @@ struct stFrameBuffer const size_t FrameBufferStructSize = sizeof(FrameBuffer); -void framebuffer_wait(const FrameBuffer * frame, size_t size) +bool framebuffer_wait(const FrameBuffer * frame, size_t size) { while(atomic_load_explicit(&frame->wp, memory_order_acquire) < size) { @@ -46,79 +51,64 @@ void framebuffer_wait(const FrameBuffer * frame, size_t size) while(frame->wp < size) { if (++spinCount == FB_SPIN_LIMIT) - return; + return false; usleep(1); } } + + return true; } bool framebuffer_read(const FrameBuffer * frame, void * restrict dst, size_t dstpitch, size_t height, size_t width, size_t bpp, size_t pitch) { +#ifdef FB_PROFILE + static RunningAvg ra = NULL; + static int raCount = 0; + if (!ra) + ra = runningavg_new(100); + const uint64_t ts = microtime(); +#endif + uint8_t * restrict d = (uint8_t*)dst; uint_least32_t rp = 0; - size_t y = 0; - const size_t linewidth = width * bpp; - while(y < height) + // copy in large 1MB chunks if the pitches match + if (dstpitch == pitch) { - uint_least32_t wp; - int spinCount = 0; - - /* spinlock */ - wp = atomic_load_explicit(&frame->wp, memory_order_acquire); - while(wp - rp < linewidth) + size_t remaining = (height - 1) * pitch + dstpitch; + while(remaining) { - if (++spinCount == FB_SPIN_LIMIT) + const size_t copy = remaining < FB_CHUNK_SIZE ? remaining : FB_CHUNK_SIZE; + if (!framebuffer_wait(frame, rp + copy)) return false; - usleep(1); - wp = atomic_load_explicit(&frame->wp, memory_order_acquire); + memcpy(d, frame->data + rp, copy); + remaining -= copy; + rp += copy; + d += copy; } - - /* copy any unaligned bytes */ - const uint8_t * src = frame->data + rp; - const size_t unaligned = (uintptr_t)src & 0xF; - if (unaligned) - { - memcpy(d, src, unaligned); - src += unaligned; - d += unaligned; - } - - const size_t blocks = (linewidth - unaligned) / 64; - const size_t left = (linewidth - unaligned) % 64; - - _mm_mfence(); - __m128i * restrict s = (__m128i *)src; - for(int i = 0; i < blocks; ++i) - { - __m128i *_d = (__m128i *)d; - __m128i *_s = (__m128i *)s; - __m128i v1 = _mm_stream_load_si128(_s + 0); - __m128i v2 = _mm_stream_load_si128(_s + 1); - __m128i v3 = _mm_stream_load_si128(_s + 2); - __m128i v4 = _mm_stream_load_si128(_s + 3); - - _mm_storeu_si128(_d + 0, v1); - _mm_storeu_si128(_d + 1, v2); - _mm_storeu_si128(_d + 2, v3); - _mm_storeu_si128(_d + 3, v4); - - d += 64; - s += 4; - } - - if (left) - { - memcpy(d, s, left); - d += left; - } - - rp += pitch; - d += dstpitch - linewidth; - ++y; } + else + { + // copy per line to match the pitch of the destination buffer + const size_t linewidth = width * bpp; + for(size_t y = 0; y < height; ++y) + { + if (!framebuffer_wait(frame, rp + linewidth)) + return false; + + memcpy(d, frame->data + rp, dstpitch); + rp += linewidth; + d += dstpitch; + } + } + +#ifdef FB_PROFILE + runningavg_push(ra, microtime() - ts); + if (++raCount % 100 == 0) + DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra)); +#endif return true; }