[common] improve framebuffer copy to avoid cache pollution (SIMD)

This commit is contained in:
Geoffrey McRae 2020-08-03 11:16:30 +10:00
parent c5ff8bd4ce
commit da655b86c3
2 changed files with 29 additions and 12 deletions

View File

@ -1 +1 @@
B2-rc2-11-gbd42445ea7+1 B2-rc2-13-gc5ff8bd4ce+1

View File

@ -22,6 +22,7 @@ Place, Suite 330, Boston, MA 02111-1307 USA
#include <string.h> #include <string.h>
#include <stdatomic.h> #include <stdatomic.h>
#include <emmintrin.h>
#define FB_CHUNK_SIZE 1024 #define FB_CHUNK_SIZE 1024
@ -35,7 +36,7 @@ const size_t FrameBufferStructSize = sizeof(FrameBuffer);
void framebuffer_wait(const FrameBuffer * frame, size_t size) void framebuffer_wait(const FrameBuffer * frame, size_t size)
{ {
while(atomic_load_explicit(&frame->wp, memory_order_relaxed) != size) {} while(atomic_load_explicit(&frame->wp, memory_order_acquire) != size) {}
} }
@ -46,6 +47,8 @@ bool framebuffer_read(const FrameBuffer * frame, void * dst, size_t dstpitch,
uint_least32_t rp = 0; uint_least32_t rp = 0;
size_t y = 0; size_t y = 0;
const size_t linewidth = width * bpp; const size_t linewidth = width * bpp;
const size_t blocks = linewidth / 16;
const size_t left = linewidth % 16;
while(y < height) while(y < height)
{ {
@ -53,13 +56,18 @@ bool framebuffer_read(const FrameBuffer * frame, void * dst, size_t dstpitch,
/* spinlock */ /* spinlock */
do do
wp = atomic_load_explicit(&frame->wp, memory_order_relaxed); wp = atomic_load_explicit(&frame->wp, memory_order_acquire);
while(wp - rp < pitch); while(wp - rp < pitch);
memcpy(d, frame->data + rp, linewidth); __m128i * s = (__m128i *)(frame->data + rp);
for(int i = 0; i < blocks; ++i, ++s, d += 16)
_mm_stream_si128((__m128i *)d, _mm_load_si128(s));
if (left)
memcpy(d, frame->data + rp + blocks * 16, left);
rp += pitch; rp += pitch;
d += dstpitch; d += dstpitch - blocks * 16;
++y; ++y;
} }
@ -79,7 +87,7 @@ bool framebuffer_read_fn(const FrameBuffer * frame, size_t height, size_t width,
/* spinlock */ /* spinlock */
do do
wp = atomic_load_explicit(&frame->wp, memory_order_relaxed); wp = atomic_load_explicit(&frame->wp, memory_order_acquire);
while(wp - rp < pitch); while(wp - rp < pitch);
if (!fn(opaque, frame->data + rp, linewidth)) if (!fn(opaque, frame->data + rp, linewidth))
@ -97,18 +105,27 @@ bool framebuffer_read_fn(const FrameBuffer * frame, size_t height, size_t width,
*/ */
void framebuffer_prepare(FrameBuffer * frame) void framebuffer_prepare(FrameBuffer * frame)
{ {
atomic_store(&frame->wp, 0); atomic_store_explicit(&frame->wp, 0, memory_order_release);
} }
bool framebuffer_write(FrameBuffer * frame, const void * src, size_t size) bool framebuffer_write(FrameBuffer * frame, const void * src, size_t size)
{ {
__m128i * s = (__m128i *)src;
/* copy in chunks */ /* copy in chunks */
while(size) while(size > 15)
{ {
size_t copy = size < FB_CHUNK_SIZE ? FB_CHUNK_SIZE : size; _mm_stream_si128((__m128i *)(frame->data + frame->wp), _mm_load_si128(s));
memcpy(frame->data + frame->wp, src, copy); atomic_fetch_add_explicit(&frame->wp, 16, memory_order_release);
atomic_fetch_add(&frame->wp, copy); ++s;
size -= copy; size -= 16;
} }
if(size)
{
memcpy(frame->data + frame->wp, s, size);
atomic_fetch_add_explicit(&frame->wp, size, memory_order_release);
}
return true; return true;
} }