[common] framebuffer: improve client framebuffer read performance

Extensive profiling reveals that the glibc memcpy performs up to 2x faster then the existing SIMD implementation that was in use here. This patch also will copy large 1MB chunks if the pitch of the source and destination match further increasing throughput.
2026-01-06 20:02:28 +00:00 · 2021-06-08 15:08:13 +10:00
parent 7300d00f66
commit 196050bd23
2 changed files with 48 additions and 58 deletions
--- a/common/include/common/framebuffer.h
+++ b/common/include/common/framebuffer.h
@@ -37,7 +37,7 @@ extern const size_t FrameBufferStructSize;
 /**
 * Wait for the framebuffer to fill to the specified size
 */
-void framebuffer_wait(const FrameBuffer * frame, size_t size);
+bool framebuffer_wait(const FrameBuffer * frame, size_t size);
 /**
 * Read data from the KVMFRFrame into the dst buffer
--- a/common/src/framebuffer.c
+++ b/common/src/framebuffer.c
@@ -21,6 +21,11 @@
 #include "common/framebuffer.h"
 #include "common/debug.h"
 //#define FB_PROFILE
 #ifdef FB_PROFILE
 #include "common/runningavg.h"
 #endif
 #include <string.h>
 #include <stdatomic.h>
 #include <emmintrin.h>
@@ -38,7 +43,7 @@ struct stFrameBuffer
 const size_t FrameBufferStructSize = sizeof(FrameBuffer);
-void framebuffer_wait(const FrameBuffer * frame, size_t size)
+bool framebuffer_wait(const FrameBuffer * frame, size_t size)
 {
  while(atomic_load_explicit(&frame->wp, memory_order_acquire) < size)
  {
@@ -46,79 +51,64 @@ void framebuffer_wait(const FrameBuffer * frame, size_t size)
    while(frame->wp < size)
    {
      if (++spinCount == FB_SPIN_LIMIT)
-        return;
+        return false;
      usleep(1);
    }
  }
  return true;
 }
 bool framebuffer_read(const FrameBuffer * frame, void * restrict dst,
    size_t dstpitch, size_t height, size_t width, size_t bpp, size_t pitch)
 {
 #ifdef FB_PROFILE
  static RunningAvg ra = NULL;
  static int raCount = 0;
  if (!ra)
    ra = runningavg_new(100);
  const uint64_t ts = microtime();
 #endif
  uint8_t * restrict d     = (uint8_t*)dst;
  uint_least32_t rp        = 0;
  size_t         y         = 0;
  const size_t   linewidth = width * bpp;
-  while(y < height)
+  // copy in large 1MB chunks if the pitches match
  if (dstpitch == pitch)
  {
-    uint_least32_t wp;
+    size_t remaining = (height - 1) * pitch + dstpitch;
-    int spinCount = 0;
+    while(remaining)
    /* spinlock */
    wp = atomic_load_explicit(&frame->wp, memory_order_acquire);
    while(wp - rp < linewidth)
    {
-      if (++spinCount == FB_SPIN_LIMIT)
+      const size_t copy = remaining < FB_CHUNK_SIZE ? remaining : FB_CHUNK_SIZE;
      if (!framebuffer_wait(frame, rp + copy))
        return false;
-      usleep(1);
+      memcpy(d, frame->data + rp, copy);
-      wp = atomic_load_explicit(&frame->wp, memory_order_acquire);
+      remaining -= copy;
      rp        += copy;
      d         += copy;
    }
    /* copy any unaligned bytes */
    const uint8_t * src = frame->data + rp;
    const size_t unaligned = (uintptr_t)src & 0xF;
    if (unaligned)
    {
      memcpy(d, src, unaligned);
      src += unaligned;
      d   += unaligned;
    }
    const size_t blocks = (linewidth - unaligned) / 64;
    const size_t left   = (linewidth - unaligned) % 64;
    _mm_mfence();
    __m128i * restrict s = (__m128i *)src;
    for(int i = 0; i < blocks; ++i)
    {
      __m128i *_d = (__m128i *)d;
      __m128i *_s = (__m128i *)s;
      __m128i v1 = _mm_stream_load_si128(_s + 0);
      __m128i v2 = _mm_stream_load_si128(_s + 1);
      __m128i v3 = _mm_stream_load_si128(_s + 2);
      __m128i v4 = _mm_stream_load_si128(_s + 3);
      _mm_storeu_si128(_d + 0, v1);
      _mm_storeu_si128(_d + 1, v2);
      _mm_storeu_si128(_d + 2, v3);
      _mm_storeu_si128(_d + 3, v4);
      d += 64;
      s += 4;
    }
    if (left)
    {
      memcpy(d, s, left);
      d += left;
    }
    rp += pitch;
    d  += dstpitch - linewidth;
    ++y;
  }
  else
  {
    // copy per line to match the pitch of the destination buffer
    const size_t linewidth = width * bpp;
    for(size_t y = 0; y < height; ++y)
    {
      if (!framebuffer_wait(frame, rp + linewidth))
        return false;
      memcpy(d, frame->data + rp, dstpitch);
      rp += linewidth;
      d  += dstpitch;
    }
  }
 #ifdef FB_PROFILE
  runningavg_push(ra, microtime() - ts);
  if (++raCount % 100 == 0)
    DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra));
 #endif
  return true;
 }