This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Transform 4 inputs with 4 lookup tables, making 4 outputs | |
// The 4 inputs are packed into a single uint32_t value; each byte is expected to be in the [ 0 .. 15 ] interval
// The 4 tables are in a single AVX2 vector | |
uint32_t applyLookup4( uint32_t i4, __m256i tables4 ) | |
{ | |
// Move 4 bytes into SSE vector | |
__m128i bytes = _mm_cvtsi32_si128( (int)i4 ); | |
// Expand bytes into uint64_t lanes | |
__m256i v = _mm256_cvtepu8_epi64( bytes ); | |
// Multiply them by 4 to get shift amounts in bits |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <stdio.h> | |
#include <random> | |
#include <vector> | |
#include <unordered_map> | |
#include <algorithm> | |
#include <optional> | |
#include <intrin.h> | |
#include <inttypes.h> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Linq.Expressions; | |
using System.Reflection; | |
using System.Runtime.CompilerServices; | |
static class ReflectTest | |
{ | |
/// <summary>Generic method to call</summary> | |
public static T GetValue<T>( T value ) | |
{ | |
return value; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdlib.h> | |
#include <vector> | |
#include <intrin.h> | |
#include <stdint.h> | |
#include <inttypes.h> | |
std::vector<char> makeTestVector( bool random ) | |
{ | |
std::vector<char> result; | |
result.resize( 1024 * 16 ); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ==== | |
#include <array> | |
#include <immintrin.h> | |
#include <assert.h> | |
#include <float.h> | |
// Unpack 32 4-bit fields into 32 bytes | |
// The output vector contains 32 bytes, each one in the [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi ) | |
{ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ==== | |
#include <array> | |
#include <immintrin.h> | |
#include <assert.h> | |
#include <float.h> | |
// Unpack 32 4-bit fields into 32 bytes | |
// The output vector contains 32 bytes, each one in the [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi ) | |
{ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ==== | |
#include <array> | |
#include <immintrin.h> | |
#include <assert.h> | |
#include <float.h> | |
// Unpack 32 4-bit fields into 32 bytes | |
// The output vector contains 32 bytes, each one in the [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi ) | |
{ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ==== | |
#include <array> | |
#include <immintrin.h> | |
// Unpack 32 4-bit fields into 32 bytes | |
// The output vector contains 32 bytes, each one in the [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi ) | |
{ | |
// Load 16 bytes from memory | |
__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi ); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
// Compute the product of a width*16 column-major matrix by a vector of length `width`;
// the result is a vector of length 16.
// BTW, according to godbolt.org, gcc does better than clang for this code. | |
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi ) | |
{ | |
// Using 4 accumulators per row, 4*16=64 scalars in 8 AVX vectors | |
__m256 a00 = _mm256_setzero_ps(); | |
__m256 a01 = _mm256_setzero_ps(); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
// Compute the product of a width*16 column-major matrix by a vector of length `width`;
// the result is a vector of length 16.
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi ) | |
{ | |
// Using 2 accumulators per row to work around the data dependency on the accumulators
// Initialize the accumulators | |
__m256 a00 = _mm256_setzero_ps(); |