// Transform 4 inputs with 4 lookup tables, making 4 outputs
// The 4 inputs are packed in a uint32_t value, each byte is expected to be in the [ 0 .. 15 ] interval
// The 4 tables are in a single AVX2 vector
uint32_t applyLookup4( uint32_t i4, __m256i tables4 )
{
	// Move 4 bytes into SSE vector
	__m128i bytes = _mm_cvtsi32_si128( (int)i4 );
	// Expand bytes into uint64_t lanes
	__m256i v = _mm256_cvtepu8_epi64( bytes );
	// Multiply them by 4 to get shift amounts in bits
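	// The rest of the function is a hedged reconstruction based on the comments above;
	// it assumes each 64-bit lane of `tables4` holds one 16-entry table of 4-bit values
	v = _mm256_slli_epi64( v, 2 );
	// Shift each table right by that amount, moving the selected nibble into the low 4 bits of its lane
	v = _mm256_srlv_epi64( tables4, v );
	// Gather the low dword of every 64-bit lane into the lowest 16 bytes of the vector
	v = _mm256_permutevar8x32_epi32( v, _mm256_setr_epi32( 0, 2, 4, 6, 0, 0, 0, 0 ) );
	// Keep the low 4 bits of each looked up value
	__m128i res = _mm_and_si128( _mm256_castsi256_si128( v ), _mm_set1_epi32( 15 ) );
	// Pack the 4 results into a single uint32_t
	res = _mm_shuffle_epi8( res, _mm_setr_epi8( 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 ) );
	return (uint32_t)_mm_cvtsi128_si32( res );
}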

using System.Linq.Expressions;
using System.Reflection;
using System.Runtime.CompilerServices;

static class ReflectTest
{
	/// <summary>Generic method to call</summary>
	public static T GetValue<T>( T value )
	{
		return value;
	}
}

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <random>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include <optional>
#include <intrin.h>

std::vector<char> makeTestVector( bool random )
{
	std::vector<char> result;
	result.resize( 1024 * 16 );	// the bytes are zero initialized by resize()
	// Assumption: when `random` is set, fill the vector with pseudo-random bytes
	if( random )
	{
		std::mt19937 rng;
		std::generate( result.begin(), result.end(), [&]() { return (char)(uint8_t)rng(); } );
	}
	return result;
}

// ==== AVX2 decompressor for Q4_0 and Q4_1 compressed blocks ====
#include <array>
#include <immintrin.h>
#include <assert.h>
#include <float.h>
#include <stdint.h>

// Unpack 32 4-bit fields into 32 bytes
// The output vector contains 32 bytes, each one in the [ 0 .. 15 ] interval
inline __m256i bytesFromNibbles( const uint8_t* rsi )
{
	// Load 16 bytes from memory
	__m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
	// Expand bytes into uint16_t values
	__m256i bytes = _mm256_cvtepu8_epi16( tmp );
	// Unpack values into individual bytes: the low nibble stays in the even byte,
	// the high nibble moves into the low 4 bits of the following odd byte
	const __m256i lowMask = _mm256_set1_epi8( 0xF );
	__m256i high = _mm256_andnot_si256( lowMask, bytes );
	__m256i low = _mm256_and_si256( lowMask, bytes );
	high = _mm256_slli_epi16( high, 4 );
	return _mm256_or_si256( low, high );
}
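
// Below is a usage sketch for bytesFromNibbles(), decompressing one Q4_0 block into 32 floats.
// It assumes the ggml Q4_0 layout: 32 weights per block, stored as a float scale followed by
// 16 bytes of nibbles, with a +8 bias on the quantized values.
// The BlockQ4_0 and decompressQ4_0 names are hypothetical, they are not from the original code.
struct BlockQ4_0
{
	float d;           // scaling factor of the block
	uint8_t qs[ 16 ];  // 32 quantized weights, 4 bits each
};

inline void decompressQ4_0( const BlockQ4_0& block, float* rdi )
{
	// Unpack 32 nibbles into 32 bytes, each in [ 0 .. 15 ]
	__m256i bytes = bytesFromNibbles( block.qs );
	// Subtract the +8 bias, the values become [ -8 .. 7 ]
	bytes = _mm256_sub_epi8( bytes, _mm256_set1_epi8( 8 ) );
	const __m256 scale = _mm256_set1_ps( block.d );

	// Sign-extend 8 bytes at a time into int32 lanes, convert to floats, apply the scale
	alignas( 32 ) int8_t buffer[ 32 ];
	_mm256_store_si256( ( __m256i* )buffer, bytes );
	for( int i = 0; i < 32; i += 8 )
	{
		const __m128i i8 = _mm_loadl_epi64( ( const __m128i* )( buffer + i ) );
		const __m256 f = _mm256_cvtepi32_ps( _mm256_cvtepi8_epi32( i8 ) );
		_mm256_storeu_ps( rdi + i, _mm256_mul_ps( f, scale ) );
	}
}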

#include <immintrin.h>

// Compute product of width*16 column major matrix by vector of length `width`,
// the result is a vector of length 16
// BTW, according to godbolt.org, gcc does better than clang for this code.
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
	// Using 4 accumulators per row, 4*16=64 scalars in 8 AVX vectors
	__m256 a00 = _mm256_setzero_ps();
	__m256 a01 = _mm256_setzero_ps();
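	__m256 a02 = _mm256_setzero_ps();
	__m256 a03 = _mm256_setzero_ps();
	__m256 a10 = _mm256_setzero_ps();
	__m256 a11 = _mm256_setzero_ps();
	__m256 a12 = _mm256_setzero_ps();
	__m256 a13 = _mm256_setzero_ps();
	// The rest of the function is a hedged reconstruction; it assumes FMA3 support
	// and `width` being a multiple of 4
	// Each iteration of the loop handles 4 complete columns of the matrix, i.e. 64 elements
	for( const float* const vecEnd = vec + width; vec < vecEnd; vec += 4, mat += 64 )
	{
		const __m256 v0 = _mm256_broadcast_ss( vec );
		a00 = _mm256_fmadd_ps( v0, _mm256_loadu_ps( mat ), a00 );
		a10 = _mm256_fmadd_ps( v0, _mm256_loadu_ps( mat + 8 ), a10 );
		const __m256 v1 = _mm256_broadcast_ss( vec + 1 );
		a01 = _mm256_fmadd_ps( v1, _mm256_loadu_ps( mat + 16 ), a01 );
		a11 = _mm256_fmadd_ps( v1, _mm256_loadu_ps( mat + 24 ), a11 );
		const __m256 v2 = _mm256_broadcast_ss( vec + 2 );
		a02 = _mm256_fmadd_ps( v2, _mm256_loadu_ps( mat + 32 ), a02 );
		a12 = _mm256_fmadd_ps( v2, _mm256_loadu_ps( mat + 40 ), a12 );
		const __m256 v3 = _mm256_broadcast_ss( vec + 3 );
		a03 = _mm256_fmadd_ps( v3, _mm256_loadu_ps( mat + 48 ), a03 );
		a13 = _mm256_fmadd_ps( v3, _mm256_loadu_ps( mat + 56 ), a13 );
	}
	// Reduce 4 accumulators per row into 1, and store 16 floats
	_mm256_storeu_ps( rdi, _mm256_add_ps( _mm256_add_ps( a00, a01 ), _mm256_add_ps( a02, a03 ) ) );
	_mm256_storeu_ps( rdi + 8, _mm256_add_ps( _mm256_add_ps( a10, a11 ), _mm256_add_ps( a12, a13 ) ) );
}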

#include <immintrin.h>

// Compute product of width*16 column major matrix by vector of length `width`,
// the result is a vector of length 16
void multiplyInner_avx16( const float* mat, const float* vec, size_t width, float* rdi )
{
	// Using 2 accumulators per row to work around the data dependency on the accumulators
	// Initialize the accumulators
	__m256 a00 = _mm256_setzero_ps();
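	__m256 a01 = _mm256_setzero_ps();
	__m256 a10 = _mm256_setzero_ps();
	__m256 a11 = _mm256_setzero_ps();
	// The rest of the function is a hedged reconstruction; it assumes FMA3 support
	// and `width` being a multiple of 2
	// Each iteration of the loop handles 2 complete columns of the matrix, i.e. 32 elements
	for( const float* const vecEnd = vec + width; vec < vecEnd; vec += 2, mat += 32 )
	{
		const __m256 v0 = _mm256_broadcast_ss( vec );
		a00 = _mm256_fmadd_ps( v0, _mm256_loadu_ps( mat ), a00 );
		a10 = _mm256_fmadd_ps( v0, _mm256_loadu_ps( mat + 8 ), a10 );
		const __m256 v1 = _mm256_broadcast_ss( vec + 1 );
		a01 = _mm256_fmadd_ps( v1, _mm256_loadu_ps( mat + 16 ), a01 );
		a11 = _mm256_fmadd_ps( v1, _mm256_loadu_ps( mat + 24 ), a11 );
	}
	// Reduce 2 accumulators per row into 1, and store 16 floats
	_mm256_storeu_ps( rdi, _mm256_add_ps( a00, a01 ) );
	_mm256_storeu_ps( rdi + 8, _mm256_add_ps( a10, a11 ) );
}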