/**
 * Looking Glass
 * Copyright © 2017-2024 The Looking Glass Authors
 * https://looking-glass.io
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc., 59
 * Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "common/framebuffer.h"
#include "common/cpuinfo.h"
#include "common/debug.h"

//#define FB_PROFILE
#ifdef FB_PROFILE
#include "common/runningavg.h"
#endif

#include <string.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <immintrin.h>
#include <unistd.h>

/**
 * Block until the writer has published at least `size` bytes.
 *
 * Polls the frame's write pointer, sleeping via usleep(1) between polls.
 *
 * @param frame the framebuffer being read
 * @param size  the write pointer value to wait for
 * @return true once the write pointer is >= size, false if the write pointer
 *         was still short after FB_SPIN_LIMIT polls
 */
bool framebuffer_wait(const FrameBuffer * frame, size_t size)
{
  int spinCount = 0;

  // the acquire load pairs with the release stores made by the writers so
  // that the frame data below `wp` is visible once `wp` is observed
  while(atomic_load_explicit(&frame->wp, memory_order_acquire) < size)
  {
    if (++spinCount == FB_SPIN_LIMIT)
      return false;

    usleep(1);
  }

  return true;
}

/**
 * Copy `size` contiguous bytes out of the framebuffer.
 *
 * @param frame the framebuffer to read from
 * @param dst   the destination, must have room for `size` bytes
 * @param size  the number of bytes to copy
 * @return true on success, false if waiting for the writer timed out
 */
bool framebuffer_read_linear(const FrameBuffer * frame, void * restrict dst,
    size_t size)
{
#ifdef FB_PROFILE
  static RunningAvg ra = NULL;
  static int raCount = 0;
  const uint64_t ts = microtime();
  if (!ra)
    ra = runningavg_new(100);
#endif

  uint8_t * restrict d = (uint8_t*)dst;
  uint_least32_t rp = 0;

  // copy in chunks of at most FB_CHUNK_SIZE bytes, waiting only for the chunk
  // that is about to be copied rather than for the entire frame
  while(size)
  {
    const size_t copy = size < FB_CHUNK_SIZE ? size : FB_CHUNK_SIZE;
    if (!framebuffer_wait(frame, rp + copy))
      return false;

    memcpy(d, frame->data + rp, copy);
    size -= copy;
    rp   += copy;
    d    += copy;
  }

#ifdef FB_PROFILE
  runningavg_push(ra, microtime() - ts);
  if (++raCount % 100 == 0)
    DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra));
#endif

  return true;
}

/**
 * Copy a frame out of the framebuffer, converting between row pitches.
 *
 * When the source and destination pitches are equal this defers to
 * framebuffer_read_linear, otherwise the frame is copied one row at a time.
 *
 * @param frame    the framebuffer to read from
 * @param dst      the destination buffer
 * @param dstpitch the distance in bytes between rows in `dst`
 * @param height   the number of rows to copy
 * @param width    the row width, multiplied by `bpp` to get the row size
 * @param bpp      multiplier applied to `width` to get the row size in bytes
 * @param pitch    the distance in bytes between rows in the framebuffer
 * @return true on success, false if waiting for the writer timed out
 */
bool framebuffer_read(const FrameBuffer * frame, void * restrict dst,
    size_t dstpitch, size_t height, size_t width, size_t bpp, size_t pitch)
{
  if (dstpitch == pitch)
    return framebuffer_read_linear(frame, dst, height * pitch);

#ifdef FB_PROFILE
  static RunningAvg ra = NULL;
  static int raCount = 0;
  const uint64_t ts = microtime();
  if (!ra)
    ra = runningavg_new(100);
#endif

  uint8_t * restrict d = (uint8_t*)dst;
  uint_least32_t rp = 0;

  // copy per line to match the pitch of the destination buffer
  const size_t linewidth = width * bpp;

  // only `linewidth` bytes of each row are waited for below, so never copy
  // more than that: anything further may not have been published yet and, on
  // the final row, may lie beyond the end of the frame. The copy is also
  // bounded by `dstpitch` so that a destination row is never overrun.
  const size_t copy = linewidth < dstpitch ? linewidth : dstpitch;

  for(size_t y = 0; y < height; ++y)
  {
    if (!framebuffer_wait(frame, rp + linewidth))
      return false;

    memcpy(d, frame->data + rp, copy);
    rp += pitch;
    d  += dstpitch;
  }

#ifdef FB_PROFILE
  runningavg_push(ra, microtime() - ts);
  if (++raCount % 100 == 0)
    DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra));
#endif

  return true;
}

/**
 * Feed a frame to a callback one row at a time.
 *
 * For each row, waits until the row has been written and then calls
 * `fn(opaque, rowData, width * bpp)`.
 *
 * @param frame  the framebuffer to read from
 * @param height the number of rows to process
 * @param width  the row width, multiplied by `bpp` to get the row size
 * @param bpp    multiplier applied to `width` to get the row size in bytes
 * @param pitch  the distance in bytes between rows in the framebuffer
 * @param fn     the callback to invoke for each row
 * @param opaque passed through to `fn` untouched
 * @return true if every row was processed, false if waiting for the writer
 *         timed out or `fn` returned false
 */
bool framebuffer_read_fn(const FrameBuffer * frame, size_t height, size_t width,
    size_t bpp, size_t pitch, FrameBufferReadFn fn, void * opaque)
{
#ifdef FB_PROFILE
  static RunningAvg ra = NULL;
  static int raCount = 0;
  const uint64_t ts = microtime();
  if (!ra)
    ra = runningavg_new(100);
#endif

  uint_least32_t rp = 0;
  const size_t linewidth = width * bpp;

  for(size_t y = 0; y < height; ++y)
  {
    if (!framebuffer_wait(frame, rp + linewidth))
      return false;

    if (!fn(opaque, frame->data + rp, linewidth))
      return false;

    rp += pitch;
  }

#ifdef FB_PROFILE
  runningavg_push(ra, microtime() - ts);
  if (++raCount % 100 == 0)
    DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra));
#endif

  return true;
}

/**
 * Prepare the framebuffer for writing by resetting its write pointer to zero
 */
void framebuffer_prepare(FrameBuffer * frame)
{
  atomic_store_explicit(&frame->wp, 0, memory_order_release);
}

/**
 * Copy `size` bytes from `src` into the framebuffer using 128-bit loads and
 * stores, publishing the write pointer as the copy progresses.
 *
 * NOTE(review): the aligned 128-bit load/store intrinsics used here require
 * `src` and `frame->data` to be 16-byte aligned - confirm with the callers.
 * NOTE(review): unlike the AVX2 variant this function carries no target
 * attribute, presumably the file is built with SSE4.1 enabled - confirm.
 *
 * @param frame the framebuffer to write to
 * @param src   the source data
 * @param size  the number of bytes to copy
 * @return always true
 */
static bool framebuffer_write_sse4_1(FrameBuffer * frame,
    const void * restrict src, size_t size)
{
#ifdef FB_PROFILE
  static RunningAvg ra = NULL;
  static int raCount = 0;
  const uint64_t ts = microtime();
  if (!ra)
    ra = runningavg_new(100);
#endif

  __m128i * restrict s = (__m128i *)src;
  __m128i * restrict d = (__m128i *)frame->data;
  size_t wp = 0;

  _mm_mfence();

  /* copy in 64 byte blocks */
  while(size > 63)
  {
    __m128i v1 = _mm_stream_load_si128(s + 0);
    __m128i v2 = _mm_stream_load_si128(s + 1);
    __m128i v3 = _mm_stream_load_si128(s + 2);
    __m128i v4 = _mm_stream_load_si128(s + 3);

    _mm_store_si128(d + 0, v1);
    _mm_store_si128(d + 1, v2);
    _mm_store_si128(d + 2, v3);
    _mm_store_si128(d + 3, v4);

    s    += 4;
    d    += 4;
    size -= 64;
    wp   += 64;

    // let readers start on the data each time a whole chunk is complete
    if (wp % FB_CHUNK_SIZE == 0)
      atomic_store_explicit(&frame->wp, wp, memory_order_release);
  }

  /* copy whatever is left over */
  if (size)
  {
    memcpy(frame->data + wp, s, size);
    wp += size;
  }

  atomic_store_explicit(&frame->wp, wp, memory_order_release);

#ifdef FB_PROFILE
  runningavg_push(ra, microtime() - ts);
  if (++raCount % 100 == 0)
    DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra));
#endif

  return true;
}

#ifdef __clang__
  #pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
#else
  #pragma GCC push_options
  #pragma GCC target ("avx2")
#endif

/**
 * Copy `size` bytes from `src` into the framebuffer using 256-bit streaming
 * loads and non-temporal stores, publishing the write pointer as the copy
 * progresses.
 *
 * NOTE(review): the 256-bit streaming load/store intrinsics used here require
 * `src` and `frame->data` to be 32-byte aligned - confirm with the callers.
 *
 * @param frame the framebuffer to write to
 * @param src   the source data
 * @param size  the number of bytes to copy
 * @return always true
 */
bool framebuffer_write_avx2(FrameBuffer * frame,
    const void * restrict src, size_t size)
{
#ifdef FB_PROFILE
  static RunningAvg ra = NULL;
  static int raCount = 0;
  const uint64_t ts = microtime();
  if (!ra)
    ra = runningavg_new(100);
#endif

  __m256i * restrict s = (__m256i *)src;
  __m256i * restrict d = (__m256i *)frame->data;
  size_t wp = 0;

  _mm_mfence();

  /* copy in 128 byte blocks */
  while(size > 127)
  {
    __m256i v1 = _mm256_stream_load_si256(s + 0);
    __m256i v2 = _mm256_stream_load_si256(s + 1);
    __m256i v3 = _mm256_stream_load_si256(s + 2);
    __m256i v4 = _mm256_stream_load_si256(s + 3);

    _mm256_stream_si256(d + 0, v1);
    _mm256_stream_si256(d + 1, v2);
    _mm256_stream_si256(d + 2, v3);
    _mm256_stream_si256(d + 3, v4);

    s    += 4;
    d    += 4;
    size -= 128;
    wp   += 128;

    if (wp % FB_CHUNK_SIZE == 0)
    {
      // non-temporal stores are weakly ordered, they must be fenced before
      // the write pointer is published or a reader may see the new `wp`
      // before the data it covers
      _mm_sfence();
      atomic_store_explicit(&frame->wp, wp, memory_order_release);
    }
  }

  /* copy one more 64 byte block if there is room for it */
  if (size > 63)
  {
    __m256i v1 = _mm256_stream_load_si256(s);
    __m256i v2 = _mm256_stream_load_si256(s + 1);

    _mm256_stream_si256(d, v1);
    _mm256_stream_si256(d + 1, v2);

    s    += 2;
    d    += 2;
    size -= 64;
    wp   += 64;

    if (wp % FB_CHUNK_SIZE == 0)
    {
      _mm_sfence();
      atomic_store_explicit(&frame->wp, wp, memory_order_release);
    }
  }

  /* copy whatever is left over */
  if (size)
  {
    memcpy(frame->data + wp, s, size);
    wp += size;
  }

  // make all outstanding non-temporal stores visible before the final publish
  _mm_sfence();
  atomic_store_explicit(&frame->wp, wp, memory_order_release);

#ifdef FB_PROFILE
  runningavg_push(ra, microtime() - ts);
  if (++raCount % 100 == 0)
    DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra));
#endif

  return true;
}

#ifdef __clang__
  #pragma clang attribute pop
#else
  #pragma GCC pop_options
#endif

/**
 * Initial target of the `framebuffer_write` pointer.
 *
 * On the first call this selects the AVX2 or SSE4.1 implementation based on
 * the reported CPU features, rebinds `framebuffer_write` to it so later calls
 * go there directly, and then forwards this call.
 */
static bool _framebuffer_write(FrameBuffer * frame,
    const void * restrict src, size_t size)
{
  if (cpuInfo_getFeatures()->avx2)
    framebuffer_write = &framebuffer_write_avx2;
  else
    framebuffer_write = &framebuffer_write_sse4_1;

  return framebuffer_write(frame, src, size);
}

/**
 * Write `size` bytes from `src` into the framebuffer. Resolved to a CPU
 * specific implementation on first use, see _framebuffer_write.
 */
bool (*framebuffer_write)(FrameBuffer * frame,
    const void * restrict src, size_t size) = &_framebuffer_write;

/**
 * Get a read-only pointer to the framebuffer's data
 */
const uint8_t * framebuffer_get_buffer(const FrameBuffer * frame)
{
  return frame->data;
}

/**
 * Get a writable pointer to the framebuffer's data
 */
uint8_t * framebuffer_get_data(FrameBuffer * frame)
{
  return frame->data;
}

/**
 * Set the framebuffer's write pointer to `size` with release ordering
 */
void framebuffer_set_write_ptr(FrameBuffer * frame, size_t size)
{
  atomic_store_explicit(&frame->wp, size, memory_order_release);
}