mirror of
				https://github.com/gnif/LookingGlass.git
				synced 2025-10-31 12:42:02 +00:00 
			
		
		
		
	[common] add runtime detection and selection of AVX/AVX2 support
This commit is contained in:
		| @@ -29,4 +29,19 @@ bool cpuInfo_get(char * model, size_t modelSize, int * procs, int * cores, | ||||
|  | ||||
| void cpuInfo_log(void); | ||||
|  | ||||
| typedef struct | ||||
| { | ||||
|   bool sse, sse2, sse3, ssse3; | ||||
|   bool fma; | ||||
|   bool sse4_1, sse4_2; | ||||
|   bool popcnt; | ||||
|   bool aes; | ||||
|   bool xsave, osxsave; | ||||
|   bool avx, avx2; | ||||
|   bool bmi1, bmi2; | ||||
| } | ||||
| CPUInfoFeatures; | ||||
|  | ||||
| const CPUInfoFeatures * cpuInfo_getFeatures(void); | ||||
|  | ||||
| #endif | ||||
|   | ||||
| @@ -70,7 +70,8 @@ void framebuffer_prepare(FrameBuffer * frame); | ||||
| /** | ||||
|  * Write data from the src buffer into the KVMFRFrame | ||||
|  */ | ||||
| bool framebuffer_write(FrameBuffer * frame, const void * src, size_t size); | ||||
| extern bool (*framebuffer_write)(FrameBuffer * frame, | ||||
|     const void * restrict src, size_t size); | ||||
|  | ||||
| /** | ||||
|  * Gets the underlying data buffer of the framebuffer. | ||||
|   | ||||
| @@ -27,18 +27,8 @@ | ||||
| #include "common/framebuffer.h" | ||||
| #include "common/types.h" | ||||
|  | ||||
| inline static void rectCopyUnaligned(uint8_t * dst, const uint8_t * src, | ||||
|     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) | ||||
| { | ||||
|   src += ystart * srcPitch + dx; | ||||
|   dst += ystart * dstPitch + dx; | ||||
|   for (int i = ystart; i < yend; ++i) | ||||
|   { | ||||
|     memcpy(dst, src, width); | ||||
|     src += srcPitch; | ||||
|     dst += dstPitch; | ||||
|  } | ||||
| } | ||||
| extern void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src, | ||||
|     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width); | ||||
|  | ||||
| void rectsBufferToFramebuffer(FrameDamageRect * rects, int count, int bpp, | ||||
|   FrameBuffer * frame, int dstPitch, int height, | ||||
|   | ||||
| @@ -20,6 +20,7 @@ | ||||
|  | ||||
| #include "common/cpuinfo.h" | ||||
| #include "common/debug.h" | ||||
| #include "common/util.h" | ||||
|  | ||||
| void cpuInfo_log(void) | ||||
| { | ||||
| @@ -37,3 +38,73 @@ void cpuInfo_log(void) | ||||
|   DEBUG_INFO("CPU Model: %s", model); | ||||
|   DEBUG_INFO("CPU: %d sockets, %d cores, %d threads", sockets, cores, procs); | ||||
| } | ||||
|  | ||||
| const CPUInfoFeatures * cpuInfo_getFeatures(void) | ||||
| { | ||||
|   static bool initialized = false; | ||||
|   static CPUInfoFeatures features; | ||||
|  | ||||
|   if (likely(initialized)) | ||||
|     return &features; | ||||
|  | ||||
|   int cpuid[4] = {0}; | ||||
|  | ||||
|   // leaf1 | ||||
|   asm volatile | ||||
|   ( | ||||
|     "cpuid;" | ||||
|     : "=a" (cpuid[0]), | ||||
|       "=b" (cpuid[1]), | ||||
|       "=c" (cpuid[2]), | ||||
|       "=d" (cpuid[3]) | ||||
|     : "a" (1) | ||||
|   ); | ||||
|  | ||||
|   features.sse     = cpuid[3] & (1 << 25); | ||||
|   features.sse2    = cpuid[3] & (1 << 26); | ||||
|   features.sse3    = cpuid[2] & (1 <<  0); | ||||
|   features.ssse3   = cpuid[2] & (1 <<  9); | ||||
|   features.fma     = cpuid[2] & (1 << 12); | ||||
|   features.sse4_1  = cpuid[2] & (1 << 19); | ||||
|   features.sse4_2  = cpuid[2] & (1 << 20); | ||||
|   features.popcnt  = cpuid[2] & (1 << 23); | ||||
|   features.aes     = cpuid[2] & (1 << 25); | ||||
|   features.xsave   = cpuid[2] & (1 << 26); | ||||
|   features.osxsave = cpuid[2] & (1 << 27); | ||||
|   features.avx     = cpuid[2] & (1 << 28); | ||||
|  | ||||
|   // leaf7 | ||||
|   asm volatile | ||||
|   ( | ||||
|     "cpuid;" | ||||
|     : "=a" (cpuid[0]), | ||||
|       "=b" (cpuid[1]), | ||||
|       "=c" (cpuid[2]), | ||||
|       "=d" (cpuid[3]) | ||||
|     : "a" (7), "c" (0) | ||||
|   ); | ||||
|  | ||||
|   features.avx2 = cpuid[1] & (1 << 5); | ||||
|   features.bmi1 = cpuid[2] & (1 << 3); | ||||
|   features.bmi2 = cpuid[2] & (1 << 8); | ||||
|  | ||||
|   if (features.osxsave && features.avx) | ||||
|   { | ||||
|     int xgetbv = 0; | ||||
|     asm volatile | ||||
|     ( | ||||
|       "xgetbv;" | ||||
|       : "=a" (xgetbv) | ||||
|       : "c" (0) | ||||
|       : "edx" | ||||
|     ); | ||||
|  | ||||
|     if (!(xgetbv & 0x6)) | ||||
|     { | ||||
|       features.avx  = false; | ||||
|       features.avx2 = false; | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   return &features; | ||||
| }; | ||||
|   | ||||
| @@ -19,6 +19,7 @@ | ||||
|  */ | ||||
|  | ||||
| #include "common/framebuffer.h" | ||||
| #include "common/cpuinfo.h" | ||||
| #include "common/debug.h" | ||||
|  | ||||
| //#define FB_PROFILE | ||||
| @@ -29,6 +30,7 @@ | ||||
| #include <string.h> | ||||
| #include <emmintrin.h> | ||||
| #include <smmintrin.h> | ||||
| #include <immintrin.h> | ||||
| #include <unistd.h> | ||||
|  | ||||
| bool framebuffer_wait(const FrameBuffer * frame, size_t size) | ||||
| @@ -165,7 +167,8 @@ void framebuffer_prepare(FrameBuffer * frame) | ||||
|   atomic_store_explicit(&frame->wp, 0, memory_order_release); | ||||
| } | ||||
|  | ||||
| bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t size) | ||||
| static bool framebuffer_write_sse4_1(FrameBuffer * frame, | ||||
|     const void * restrict src, size_t size) | ||||
| { | ||||
| #ifdef FB_PROFILE | ||||
|   static RunningAvg ra = NULL; | ||||
| @@ -222,6 +225,100 @@ bool framebuffer_write(FrameBuffer * frame, const void * restrict src, size_t si | ||||
|   return true; | ||||
| } | ||||
|  | ||||
| #pragma GCC push_options | ||||
| #pragma GCC target ("avx2") | ||||
| bool framebuffer_write_avx2(FrameBuffer * frame, | ||||
|     const void * restrict src, size_t size) | ||||
| { | ||||
| #ifdef FB_PROFILE | ||||
|     static RunningAvg ra = NULL; | ||||
|     static int raCount = 0; | ||||
|     const uint64_t ts = microtime(); | ||||
|     if (!ra) | ||||
|         ra = runningavg_new(100); | ||||
| #endif | ||||
|  | ||||
|   __m256i *restrict s = (__m256i *)src; | ||||
|   __m256i *restrict d = (__m256i *)frame->data; | ||||
|   size_t wp = 0; | ||||
|  | ||||
|   _mm_mfence(); | ||||
|  | ||||
|   /* copy in chunks */ | ||||
|   while (size > 127) | ||||
|   { | ||||
|     __m256i *_d = (__m256i *)d; | ||||
|     __m256i *_s = (__m256i *)s; | ||||
|     __m256i v1 = _mm256_stream_load_si256(_s + 0); | ||||
|     __m256i v2 = _mm256_stream_load_si256(_s + 1); | ||||
|     __m256i v3 = _mm256_stream_load_si256(_s + 2); | ||||
|     __m256i v4 = _mm256_stream_load_si256(_s + 3); | ||||
|  | ||||
|     _mm256_stream_si256(_d + 0, v1); | ||||
|     _mm256_stream_si256(_d + 1, v2); | ||||
|     _mm256_stream_si256(_d + 2, v3); | ||||
|     _mm256_stream_si256(_d + 3, v4); | ||||
|  | ||||
|     s += 4; | ||||
|     d += 4; | ||||
|     size -= 128; | ||||
|     wp += 128; | ||||
|  | ||||
|     if (wp % FB_CHUNK_SIZE == 0) | ||||
|       atomic_store_explicit(&frame->wp, wp, memory_order_release); | ||||
|   } | ||||
|  | ||||
|   if (size > 63) | ||||
|   { | ||||
|     __m256i *_d = (__m256i *)d; | ||||
|     __m256i *_s = (__m256i *)s; | ||||
|     __m256i v1 = _mm256_stream_load_si256(_s); | ||||
|     __m256i v2 = _mm256_stream_load_si256(_s + 1); | ||||
|  | ||||
|     _mm256_stream_si256(_d, v1); | ||||
|     _mm256_stream_si256(_d + 1, v2); | ||||
|  | ||||
|     s += 2; | ||||
|     d += 2; | ||||
|     size -= 64; | ||||
|     wp += 64; | ||||
|  | ||||
|     if (wp % FB_CHUNK_SIZE == 0) | ||||
|       atomic_store_explicit(&frame->wp, wp, memory_order_release); | ||||
|   } | ||||
|  | ||||
|   if (size) | ||||
|   { | ||||
|     memcpy(frame->data + wp, s, size); | ||||
|     wp += size; | ||||
|   } | ||||
|  | ||||
|   atomic_store_explicit(&frame->wp, wp, memory_order_release); | ||||
|  | ||||
| #ifdef FB_PROFILE | ||||
|   runningavg_push(ra, microtime() - ts); | ||||
|   if (++raCount % 100 == 0) | ||||
|     DEBUG_INFO("Average Copy Time: %.2fμs", runningavg_calc(ra)); | ||||
| #endif | ||||
|  | ||||
|   return true; | ||||
| } | ||||
| #pragma GCC pop_options | ||||
|  | ||||
| static bool _framebuffer_write(FrameBuffer * frame, | ||||
|     const void * restrict src, size_t size) | ||||
| { | ||||
|   if (cpuInfo_getFeatures()->avx2) | ||||
|     framebuffer_write = &framebuffer_write_avx2; | ||||
|   else | ||||
|     framebuffer_write = &framebuffer_write_sse4_1; | ||||
|  | ||||
|   return framebuffer_write(frame, src, size); | ||||
| } | ||||
|  | ||||
| bool (*framebuffer_write)(FrameBuffer * frame, | ||||
|   const void * restrict src, size_t size) = &_framebuffer_write; | ||||
|  | ||||
| const uint8_t * framebuffer_get_buffer(const FrameBuffer * frame) | ||||
| { | ||||
|   return frame->data; | ||||
|   | ||||
| @@ -20,8 +20,10 @@ | ||||
|  | ||||
| #include "common/rects.h" | ||||
| #include "common/util.h" | ||||
| #include "common/cpuinfo.h" | ||||
|  | ||||
| #include <stdlib.h> | ||||
| #include <immintrin.h> | ||||
|  | ||||
| struct Corner | ||||
| { | ||||
| @@ -298,3 +300,58 @@ int rectsRejectContained(FrameDamageRect * rects, int count) | ||||
|  | ||||
|   return removeRects(rects, count, removed); | ||||
| } | ||||
|  | ||||
| static void rectCopyUnaligned_memcpy(uint8_t * dst, const uint8_t * src, | ||||
|     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) | ||||
| { | ||||
|   src += ystart * srcPitch + dx; | ||||
|   dst += ystart * dstPitch + dx; | ||||
|   for (int i = ystart; i < yend; ++i) | ||||
|   { | ||||
|     memcpy(dst, src, width); | ||||
|     src += srcPitch; | ||||
|     dst += dstPitch; | ||||
|   } | ||||
| } | ||||
|  | ||||
| #pragma GCC push_options | ||||
| #pragma GCC target ("avx2") | ||||
| static void rectCopyUnaligned_avx(uint8_t * dst, const uint8_t * src, | ||||
|     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) | ||||
| { | ||||
|   src += ystart * srcPitch + dx; | ||||
|   dst += ystart * dstPitch + dx; | ||||
|   for (int i = ystart; i < yend; ++i) | ||||
|   { | ||||
|     int col; | ||||
|     for(col = 0; col <= width - 32; col += 32) | ||||
|     { | ||||
|       _mm_prefetch(src + col + 256, _MM_HINT_T0); | ||||
|       __m256i srcData = _mm256_loadu_si256((__m256i*)(src + col)); | ||||
|       _mm256_storeu_si256((__m256i*)(dst + col), srcData); | ||||
|     } | ||||
|  | ||||
|     for(; col < width; ++col) | ||||
|       dst[col] = src[col]; | ||||
|  | ||||
|     src += srcPitch; | ||||
|     dst += dstPitch; | ||||
|  } | ||||
| } | ||||
| #pragma GCC pop_options | ||||
|  | ||||
| static void _rectCopyUnaligned(uint8_t * dst, const uint8_t * src, | ||||
|     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) | ||||
| { | ||||
|   if (cpuInfo_getFeatures()->avx) | ||||
|     rectCopyUnaligned = &rectCopyUnaligned_avx; | ||||
|   else | ||||
|     rectCopyUnaligned = &rectCopyUnaligned_memcpy; | ||||
|  | ||||
|   return rectCopyUnaligned( | ||||
|       dst, src, ystart, yend, dx, dstPitch, srcPitch, width); | ||||
| } | ||||
|  | ||||
| void (*rectCopyUnaligned)(uint8_t * dst, const uint8_t * src, | ||||
|     int ystart, int yend, int dx, int dstPitch, int srcPitch, int width) = | ||||
|   &_rectCopyUnaligned; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Geoffrey McRae
					Geoffrey McRae