mirror of
https://github.com/gnif/LookingGlass.git
synced 2024-11-21 21:17:19 +00:00
[common] rect/framebuffer: improve avx implementations
This commit is contained in:
parent
0ce4c34c37
commit
d3ee5bddde
@ -27,7 +27,8 @@
|
|||||||
#include "common/framebuffer.h"
|
#include "common/framebuffer.h"
|
||||||
#include "common/types.h"
|
#include "common/types.h"
|
||||||
|
|
||||||
extern void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src,
|
extern void (*rectCopyUnaligned)(
|
||||||
|
uint8_t *restrict dst, const uint8_t *restrict src,
|
||||||
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width);
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width);
|
||||||
|
|
||||||
void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp,
|
void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp,
|
||||||
|
@ -251,22 +251,20 @@ bool framebuffer_write_avx2(FrameBuffer * frame,
|
|||||||
/* copy in chunks */
|
/* copy in chunks */
|
||||||
while (size > 127)
|
while (size > 127)
|
||||||
{
|
{
|
||||||
__m256i *_d = (__m256i *)d;
|
__m256i v1 = _mm256_stream_load_si256(s + 0);
|
||||||
__m256i *_s = (__m256i *)s;
|
__m256i v2 = _mm256_stream_load_si256(s + 1);
|
||||||
__m256i v1 = _mm256_stream_load_si256(_s + 0);
|
__m256i v3 = _mm256_stream_load_si256(s + 2);
|
||||||
__m256i v2 = _mm256_stream_load_si256(_s + 1);
|
__m256i v4 = _mm256_stream_load_si256(s + 3);
|
||||||
__m256i v3 = _mm256_stream_load_si256(_s + 2);
|
|
||||||
__m256i v4 = _mm256_stream_load_si256(_s + 3);
|
|
||||||
|
|
||||||
_mm256_stream_si256(_d + 0, v1);
|
_mm256_stream_si256(d + 0, v1);
|
||||||
_mm256_stream_si256(_d + 1, v2);
|
_mm256_stream_si256(d + 1, v2);
|
||||||
_mm256_stream_si256(_d + 2, v3);
|
_mm256_stream_si256(d + 2, v3);
|
||||||
_mm256_stream_si256(_d + 3, v4);
|
_mm256_stream_si256(d + 3, v4);
|
||||||
|
|
||||||
s += 4;
|
s += 4;
|
||||||
d += 4;
|
d += 4;
|
||||||
size -= 128;
|
size -= 128;
|
||||||
wp += 128;
|
wp += 128;
|
||||||
|
|
||||||
if (wp % FB_CHUNK_SIZE == 0)
|
if (wp % FB_CHUNK_SIZE == 0)
|
||||||
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
||||||
@ -274,18 +272,16 @@ bool framebuffer_write_avx2(FrameBuffer * frame,
|
|||||||
|
|
||||||
if (size > 63)
|
if (size > 63)
|
||||||
{
|
{
|
||||||
__m256i *_d = (__m256i *)d;
|
__m256i v1 = _mm256_stream_load_si256(s);
|
||||||
__m256i *_s = (__m256i *)s;
|
__m256i v2 = _mm256_stream_load_si256(s + 1);
|
||||||
__m256i v1 = _mm256_stream_load_si256(_s);
|
|
||||||
__m256i v2 = _mm256_stream_load_si256(_s + 1);
|
|
||||||
|
|
||||||
_mm256_stream_si256(_d, v1);
|
_mm256_stream_si256(d, v1);
|
||||||
_mm256_stream_si256(_d + 1, v2);
|
_mm256_stream_si256(d + 1, v2);
|
||||||
|
|
||||||
s += 2;
|
s += 2;
|
||||||
d += 2;
|
d += 2;
|
||||||
size -= 64;
|
size -= 64;
|
||||||
wp += 64;
|
wp += 64;
|
||||||
|
|
||||||
if (wp % FB_CHUNK_SIZE == 0)
|
if (wp % FB_CHUNK_SIZE == 0)
|
||||||
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
atomic_store_explicit(&frame->wp, wp, memory_order_release);
|
||||||
|
@ -301,7 +301,8 @@ int rectsRejectContained(FrameDamageRect * rects, int count)
|
|||||||
return removeRects(rects, count, removed);
|
return removeRects(rects, count, removed);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src,
|
static void rectCopyUnaligned_memcpy(
|
||||||
|
uint8_t *restrict dst, const uint8_t *restrict src,
|
||||||
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
||||||
{
|
{
|
||||||
src += ystart * srcPitch + dx;
|
src += ystart * srcPitch + dx;
|
||||||
@ -320,27 +321,42 @@ static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src,
|
|||||||
#pragma GCC push_options
|
#pragma GCC push_options
|
||||||
#pragma GCC target ("avx")
|
#pragma GCC target ("avx")
|
||||||
#endif
|
#endif
|
||||||
static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src,
|
static void rectCopyUnaligned_avx(
|
||||||
|
uint8_t *restrict dst, const uint8_t *restrict src,
|
||||||
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
||||||
{
|
{
|
||||||
src += ystart * srcPitch + dx;
|
src += ystart * srcPitch + dx;
|
||||||
dst += ystart * dstPitch + dx;
|
dst += ystart * dstPitch + dx;
|
||||||
|
|
||||||
|
const int nvec = width / sizeof(__m256i);
|
||||||
|
const int rem = width % sizeof(__m256i);
|
||||||
|
|
||||||
for (int i = ystart; i < yend; ++i)
|
for (int i = ystart; i < yend; ++i)
|
||||||
{
|
{
|
||||||
int col;
|
const __m256i *restrict s = (__m256i*)src;
|
||||||
for(col = 0; col <= width - 32; col += 32)
|
__m256i *restrict d = (__m256i*)dst;
|
||||||
|
|
||||||
|
int vec;
|
||||||
|
for(vec = nvec; vec > 3; vec -= 4)
|
||||||
{
|
{
|
||||||
_mm_prefetch(src + col + 256, _MM_HINT_T0);
|
_mm256_stream_si256(d + 0, _mm256_load_si256(s + 0));
|
||||||
__m256i srcData = _mm256_loadu_si256((__m256i*)(src + col));
|
_mm256_stream_si256(d + 1, _mm256_load_si256(s + 1));
|
||||||
_mm256_storeu_si256((__m256i*)(dst + col), srcData);
|
_mm256_stream_si256(d + 2, _mm256_load_si256(s + 2));
|
||||||
|
_mm256_stream_si256(d + 3, _mm256_load_si256(s + 3));
|
||||||
|
|
||||||
|
s += 4;
|
||||||
|
d += 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(; col < width; ++col)
|
for(; vec > 0; --vec, ++d, ++s)
|
||||||
|
_mm256_stream_si256(d, _mm256_load_si256(s));
|
||||||
|
|
||||||
|
for(int col = width - rem; col < width; ++col)
|
||||||
dst[col] = src[col];
|
dst[col] = src[col];
|
||||||
|
|
||||||
src += srcPitch;
|
src += srcPitch;
|
||||||
dst += dstPitch;
|
dst += dstPitch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#ifdef __clang__
|
#ifdef __clang__
|
||||||
#pragma clang attribute pop
|
#pragma clang attribute pop
|
||||||
@ -348,7 +364,8 @@ static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src,
|
|||||||
#pragma GCC pop_options
|
#pragma GCC pop_options
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void _rectCopyUnaligned(uint8_t * dst, const uint8_t * src,
|
static void _rectCopyUnaligned(
|
||||||
|
uint8_t *restrict dst, const uint8_t *restrict src,
|
||||||
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
int ystart, int yend, int dx, int dstPitch, int srcPitch, int width)
|
||||||
{
|
{
|
||||||
if (cpuInfo_getFeatures()->avx)
|
if (cpuInfo_getFeatures()->avx)
|
||||||
|
Loading…
Reference in New Issue
Block a user