// Transform 4 inputs with 4 lookup tables, making 4 outputs
// The 4 inputs are packed in a uint32_t value, each byte is expected to be in the [ 0 .. 15 ] interval
// The 4 tables are in a single AVX2 vector
uint32_t applyLookup4( uint32_t i4, __m256i tables4 )
{
	// Move 4 bytes into SSE vector
	__m128i bytes = _mm_cvtsi32_si128( (int)i4 );
	// Expand bytes into uint64_t lanes
	__m256i v = _mm256_cvtepu8_epi64( bytes );
	// Multiply them by 4 to get shift amounts in bits
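	// The rest of the function is a hedged reconstruction based on the comments above;
	// it assumes each 64-bit lane of `tables4` holds one 16-entry table of 4-bit values
	v = _mm256_slli_epi64( v, 2 );
	// Shift each table right by that amount, moving the selected nibble into the low 4 bits of its lane
	v = _mm256_srlv_epi64( tables4, v );
	// Gather the low dword of every 64-bit lane into the lowest 16 bytes of the vector
	v = _mm256_permutevar8x32_epi32( v, _mm256_setr_epi32( 0, 2, 4, 6, 0, 0, 0, 0 ) );
	// Keep the low 4 bits of each looked up value
	__m128i res = _mm_and_si128( _mm256_castsi256_si128( v ), _mm_set1_epi32( 15 ) );
	// Pack the 4 results into a single uint32_t
	res = _mm_shuffle_epi8( res, _mm_setr_epi8( 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 ) );
	return (uint32_t)_mm_cvtsi128_si32( res );
}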

using System.Linq.Expressions;
using System.Reflection;
using System.Runtime.CompilerServices;

static class ReflectTest
{
	/// <summary>Generic method to call</summary>
	public static T GetValue<T>( T value )
	{
		return value;
	}
}

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <random>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include <optional>
#include <intrin.h>

std::vector<char> makeTestVector( bool random )
{
	std::vector<char> result;
	result.resize( 1024 * 16 );	// the bytes are zero initialized by resize()
	// Assumption: when `random` is set, fill the vector with pseudo-random bytes
	if( random )
	{
		std::mt19937 rng;
		std::generate( result.begin(), result.end(), [&]() { return (char)(uint8_t)rng(); } );
	}
	return result;
}

// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>
#include <stdint.h>

// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in the [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
	// Load 16 bytes from memory
	__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
	// Expand bytes into uint16_t values
	__m256i bytes = _mm256_cvtepu8_epi16( tmp );
	// Unpack values into individual bytes: the low nibble stays in the even byte,
	// the high nibble moves into the low 4 bits of the following odd byte
	const __m256i lowMask = _mm256_set1_epi8( 0xF );
	__m256i high = _mm256_andnot_si256( lowMask, bytes );
	__m256i low = _mm256_and_si256( lowMask, bytes );
	high = _mm256_slli_epi16( high, 4 );
	return _mm256_or_si256( low, high );
}
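
// Below is a usage sketch for bytesFromNibbles(), decompressing one Q4_0 block into 32 floats.
// It assumes the ggml Q4_0 layout: 32 weights per block, stored as a float scale followed by
// 16 bytes of nibbles, with a +8 bias on the quantized values.
// The BlockQ4_0 and decompressQ4_0 names are hypothetical, they are not from the original code.
struct BlockQ4_0
{
	float d;           // scaling factor of the block
	uint8_t qs[ 16 ];  // 32 quantized weights, 4 bits each
};

inline void decompressQ4_0( const BlockQ4_0& block, float* rdi )
{
	// Unpack 32 nibbles into 32 bytes, each in [ 0 .. 15 ]
	__m256i bytes = bytesFromNibbles( block.qs );
	// Subtract the +8 bias, the values become [ -8 .. 7 ]
	bytes = _mm256_sub_epi8( bytes, _mm256_set1_epi8( 8 ) );
	const __m256 scale = _mm256_set1_ps( block.d );

	// Sign-extend 8 bytes at a time into int32 lanes, convert to floats, apply the scale
	alignas( 32 ) int8_t buffer[ 32 ];
	_mm256_store_si256( ( __m256i* )buffer, bytes );
	for( int i = 0; i < 32; i += 8 )
	{
		const __m128i i8 = _mm_loadl_epi64( ( const __m128i* )( buffer + i ) );
		const __m256 f = _mm256_cvtepi32_ps( _mm256_cvtepi8_epi32( i8 ) );
		_mm256_storeu_ps( rdi + i, _mm256_mul_ps( f, scale ) );
	}
}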

#include <immintrin.h>

// Compute product of width*16 column major matrix by vector of length `width`,
// the result is a vector of length 16
// BTW, according to godbolt.org, gcc does better than clang for this code.
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
	// Using 4 accumulators per row, 4*16=64 scalars in 8 AVX vectors
	__m256 a00 = _mm256_setzero_ps();
	__m256 a01 = _mm256_setzero_ps();
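	__m256 a02 = _mm256_setzero_ps();
	__m256 a03 = _mm256_setzero_ps();
	__m256 a10 = _mm256_setzero_ps();
	__m256 a11 = _mm256_setzero_ps();
	__m256 a12 = _mm256_setzero_ps();
	__m256 a13 = _mm256_setzero_ps();
	// The rest of the function is a hedged reconstruction; it assumes FMA3 support
	// and `width` being a multiple of 4
	// Each iteration of the loop handles 4 complete columns of the matrix, i.e. 64 elements
	for( const float* const vecEnd = vec + width; vec < vecEnd; vec += 4, mat += 64 )
	{
		const __m256 v0 = _mm256_broadcast_ss( vec );
		a00 = _mm256_fmadd_ps( v0, _mm256_loadu_ps( mat ), a00 );
		a10 = _mm256_fmadd_ps( v0, _mm256_loadu_ps( mat + 8 ), a10 );
		const __m256 v1 = _mm256_broadcast_ss( vec + 1 );
		a01 = _mm256_fmadd_ps( v1, _mm256_loadu_ps( mat + 16 ), a01 );
		a11 = _mm256_fmadd_ps( v1, _mm256_loadu_ps( mat + 24 ), a11 );
		const __m256 v2 = _mm256_broadcast_ss( vec + 2 );
		a02 = _mm256_fmadd_ps( v2, _mm256_loadu_ps( mat + 32 ), a02 );
		a12 = _mm256_fmadd_ps( v2, _mm256_loadu_ps( mat + 40 ), a12 );
		const __m256 v3 = _mm256_broadcast_ss( vec + 3 );
		a03 = _mm256_fmadd_ps( v3, _mm256_loadu_ps( mat + 48 ), a03 );
		a13 = _mm256_fmadd_ps( v3, _mm256_loadu_ps( mat + 56 ), a13 );
	}
	// Reduce 4 accumulators per row into 1, and store 16 floats
	_mm256_storeu_ps( rdi, _mm256_add_ps( _mm256_add_ps( a00, a01 ), _mm256_add_ps( a02, a03 ) ) );
	_mm256_storeu_ps( rdi + 8, _mm256_add_ps( _mm256_add_ps( a10, a11 ), _mm256_add_ps( a12, a13 ) ) );
}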

#include <immintrin.h>

// Compute product of width*16 column major matrix by vector of length `width`,
// the result is a vector of length 16
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
	// Using 2 accumulators per row to work around the data dependency on the accumulators
	// Initialize the accumulators
	__m256 a00 = _mm256_setzero_ps();
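	__m256 a01 = _mm256_setzero_ps();
	__m256 a10 = _mm256_setzero_ps();
	__m256 a11 = _mm256_setzero_ps();
	// The rest of the function is a hedged reconstruction; it assumes FMA3 support
	// and `width` being a multiple of 2
	// Each iteration of the loop handles 2 complete columns of the matrix, i.e. 32 elements
	for( const float* const vecEnd = vec + width; vec < vecEnd; vec += 2, mat += 32 )
	{
		const __m256 v0 = _mm256_broadcast_ss( vec );
		a00 = _mm256_fmadd_ps( v0, _mm256_loadu_ps( mat ), a00 );
		a10 = _mm256_fmadd_ps( v0, _mm256_loadu_ps( mat + 8 ), a10 );
		const __m256 v1 = _mm256_broadcast_ss( vec + 1 );
		a01 = _mm256_fmadd_ps( v1, _mm256_loadu_ps( mat + 16 ), a01 );
		a11 = _mm256_fmadd_ps( v1, _mm256_loadu_ps( mat + 24 ), a11 );
	}
	// Reduce 2 accumulators per row into 1, and store 16 floats
	_mm256_storeu_ps( rdi, _mm256_add_ps( a00, a01 ) );
	_mm256_storeu_ps( rdi + 8, _mm256_add_ps( a10, a11 ) );
}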