From da655b86c3afe9870dbf86fd79ca47b65c7fb69b Mon Sep 17 00:00:00 2001 From: Geoffrey McRae Date: Mon, 3 Aug 2020 11:16:30 +1000 Subject: [PATCH] [common] improve frambuffer copy to avoid cache pollution (SIMD) --- VERSION | 2 +- common/src/framebuffer.c | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/VERSION b/VERSION index d28c92d3..6f62d580 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -B2-rc2-11-gbd42445ea7+1 \ No newline at end of file +B2-rc2-13-gc5ff8bd4ce+1 \ No newline at end of file diff --git a/common/src/framebuffer.c b/common/src/framebuffer.c index 6b17a792..88f02d2e 100644 --- a/common/src/framebuffer.c +++ b/common/src/framebuffer.c @@ -22,6 +22,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA #include #include +#include #define FB_CHUNK_SIZE 1024 @@ -35,7 +36,7 @@ const size_t FrameBufferStructSize = sizeof(FrameBuffer); void framebuffer_wait(const FrameBuffer * frame, size_t size) { - while(atomic_load_explicit(&frame->wp, memory_order_relaxed) != size) {} + while(atomic_load_explicit(&frame->wp, memory_order_acquire) != size) {} } @@ -46,6 +47,8 @@ bool framebuffer_read(const FrameBuffer * frame, void * dst, size_t dstpitch, uint_least32_t rp = 0; size_t y = 0; const size_t linewidth = width * bpp; + const size_t blocks = linewidth / 16; + const size_t left = linewidth % 16; while(y < height) { @@ -53,13 +56,18 @@ bool framebuffer_read(const FrameBuffer * frame, void * dst, size_t dstpitch, /* spinlock */ do - wp = atomic_load_explicit(&frame->wp, memory_order_relaxed); + wp = atomic_load_explicit(&frame->wp, memory_order_acquire); while(wp - rp < pitch); - memcpy(d, frame->data + rp, linewidth); + __m128i * s = (__m128i *)(frame->data + rp); + for(int i = 0; i < blocks; ++i, ++s, d += 16) + _mm_stream_si128((__m128i *)d, _mm_load_si128(s)); + + if (left) + memcpy(d, frame->data + rp + blocks * 16, left); rp += pitch; - d += dstpitch; + d += dstpitch - blocks * 16; ++y; } @@ -79,7 +87,7 @@ bool framebuffer_read_fn(const FrameBuffer * frame, size_t height, size_t width, /* spinlock */ do - wp = atomic_load_explicit(&frame->wp, memory_order_relaxed); + wp = atomic_load_explicit(&frame->wp, memory_order_acquire); while(wp - rp < pitch); if (!fn(opaque, frame->data + rp, linewidth)) @@ -97,18 +105,27 @@ bool framebuffer_read_fn(const FrameBuffer * frame, size_t height, size_t width, */ void framebuffer_prepare(FrameBuffer * frame) { - atomic_store(&frame->wp, 0); + atomic_store_explicit(&frame->wp, 0, memory_order_release); } bool framebuffer_write(FrameBuffer * frame, const void * src, size_t size) { + __m128i * s = (__m128i *)src; + /* copy in chunks */ - while(size) + while(size > 15) { - size_t copy = size < FB_CHUNK_SIZE ? FB_CHUNK_SIZE : size; - memcpy(frame->data + frame->wp, src, copy); - atomic_fetch_add(&frame->wp, copy); - size -= copy; + _mm_stream_si128((__m128i *)(frame->data + frame->wp), _mm_load_si128(s)); + atomic_fetch_add_explicit(&frame->wp, 16, memory_order_release); + ++s; + size -= 16; } + + if(size) + { + memcpy(frame->data + frame->wp, s, size); + atomic_fetch_add_explicit(&frame->wp, size, memory_order_release); + } + return true; }