AI Engine-ML Intrinsics User Guide  (v2023.2)
Load 4x Operations

Load 4x intrinsics load four 64-bit values to a vector register from data memory. More...

Overview

Load 4x intrinsics load four 64-bit values to a vector register from data memory.

Loads vectors from four LUT elements. The LUTs are specified by passing two 256-bit aligned pointers, lut1 and lut2; the remaining two pointers are generated automatically so that they point to uneven memory banks. All four pointers are then used to read the low and high parts of the output vector: the first read access returns the low part of the vector, and the second read access returns the high part. Every pointer is offset by the value of the corresponding lane of the offset input.

Low: [(lut2+offset[3])|0x10 lut2+offset[2] (lut1+offset[1])|0x10 lut1+offset[0]]
High: [(lut2+offset[7])|0x10 lut2+offset[6] (lut1+offset[5])|0x10 lut1+offset[4]]

The load_lut_2x variants return two vectors. They require the high part of the offset input.

Functions

void load_lut_2x_float (const void *lut1, const void *lut2, v16int32 offset, chess_output v32bfloat16 &v1, chess_output v32bfloat16 &v2)
 Reads two 32 lane vectors of bfloat16 from the LUTs and stores the results in v1 and v2. More...
 
void load_lut_2x_float (const void *lut1, const void *lut2, v16uint32 offset, chess_output v32bfloat16 &v1, chess_output v32bfloat16 &v2)
 Reads two 32 lane vectors of bfloat16 from the LUTs and stores the results in v1 and v2. More...
 
void load_lut_2x_int16 (const void *lut1, const void *lut2, v16int32 offset, chess_output v32int16 &v1, chess_output v32int16 &v2)
 Reads two 32 lane vectors of int16 from the LUTs and stores the results in v1 and v2. More...
 
void load_lut_2x_int16 (const void *lut1, const void *lut2, v16uint32 offset, chess_output v32int16 &v1, chess_output v32int16 &v2)
 Reads two 32 lane vectors of int16 from the LUTs and stores the results in v1 and v2. More...
 
void load_lut_2x_int32 (const void *lut1, const void *lut2, v16int32 offset, chess_output v16int32 &v1, chess_output v16int32 &v2)
 Reads two 16 lane vectors of int32 from the LUTs and stores the results in v1 and v2. More...
 
void load_lut_2x_int32 (const void *lut1, const void *lut2, v16uint32 offset, chess_output v16int32 &v1, chess_output v16int32 &v2)
 Reads two 16 lane vectors of int32 from the LUTs and stores the results in v1 and v2. More...
 
void load_lut_2x_int8 (const void *lut1, const void *lut2, v16int32 offset, chess_output v64int8 &v1, chess_output v64int8 &v2)
 Reads two 64 lane vectors of int8 from the LUTs and stores the results in v1 and v2. More...
 
void load_lut_2x_int8 (const void *lut1, const void *lut2, v16uint32 offset, chess_output v64int8 &v1, chess_output v64int8 &v2)
 Reads two 64 lane vectors of int8 from the LUTs and stores the results in v1 and v2. More...
 
void load_lut_float (const void *lut1, const void *lut2, v16int32 offset, chess_output v32bfloat16 &v1)
 Reads a 32 lane vector of bfloat16 from the LUTs and stores the result in v1. More...
 
void load_lut_float (const void *lut1, const void *lut2, v16uint32 offset, chess_output v32bfloat16 &v1)
 Reads a 32 lane vector of bfloat16 from the LUTs and stores the result in v1. More...
 
void load_lut_int16 (const void *lut1, const void *lut2, v16int32 offset, chess_output v32int16 &v1)
 Reads a 32 lane vector of int16 from the LUTs and stores the result in v1. More...
 
void load_lut_int16 (const void *lut1, const void *lut2, v16uint32 offset, chess_output v32int16 &v1)
 Reads a 32 lane vector of int16 from the LUTs and stores the result in v1. More...
 
void load_lut_int32 (const void *lut1, const void *lut2, v16int32 offset, chess_output v16int32 &v1)
 Reads a 16 lane vector of int32 from the LUTs and stores the result in v1. More...
 
void load_lut_int32 (const void *lut1, const void *lut2, v16uint32 offset, chess_output v16int32 &v1)
 Reads a 16 lane vector of int32 from the LUTs and stores the result in v1. More...
 
void load_lut_int8 (const void *lut1, const void *lut2, v16int32 offset, chess_output v64int8 &v1)
 Reads a 64 lane vector of int8 from the LUTs and stores the result in v1. More...
 
void load_lut_int8 (const void *lut1, const void *lut2, v16uint32 offset, chess_output v64int8 &v1)
 Reads a 64 lane vector of int8 from the LUTs and stores the result in v1. More...
 

Function Documentation

◆ load_lut_2x_float() [1/2]

void load_lut_2x_float ( const void *  lut1,
const void *  lut2,
v16int32  offset,
chess_output v32bfloat16 &  v1,
chess_output v32bfloat16 &  v2 
)

Reads two 32 lane vectors of bfloat16 from the LUTs and stores the results in v1 and v2.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data of the first read will be stored.
v2: Reference to vector in which the data of the second read will be stored.

◆ load_lut_2x_float() [2/2]

void load_lut_2x_float ( const void *  lut1,
const void *  lut2,
v16uint32  offset,
chess_output v32bfloat16 &  v1,
chess_output v32bfloat16 &  v2 
)

Reads two 32 lane vectors of bfloat16 from the LUTs and stores the results in v1 and v2.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data of the first read will be stored.
v2: Reference to vector in which the data of the second read will be stored.

◆ load_lut_2x_int16() [1/2]

void load_lut_2x_int16 ( const void *  lut1,
const void *  lut2,
v16int32  offset,
chess_output v32int16 &  v1,
chess_output v32int16 &  v2 
)

Reads two 32 lane vectors of int16 from the LUTs and stores the results in v1 and v2.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data of the first read will be stored.
v2: Reference to vector in which the data of the second read will be stored.

◆ load_lut_2x_int16() [2/2]

void load_lut_2x_int16 ( const void *  lut1,
const void *  lut2,
v16uint32  offset,
chess_output v32int16 &  v1,
chess_output v32int16 &  v2 
)

Reads two 32 lane vectors of int16 from the LUTs and stores the results in v1 and v2.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data of the first read will be stored.
v2: Reference to vector in which the data of the second read will be stored.

◆ load_lut_2x_int32() [1/2]

void load_lut_2x_int32 ( const void *  lut1,
const void *  lut2,
v16int32  offset,
chess_output v16int32 &  v1,
chess_output v16int32 &  v2 
)

Reads two 16 lane vectors of int32 from the LUTs and stores the results in v1 and v2.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data of the first read will be stored.
v2: Reference to vector in which the data of the second read will be stored.

◆ load_lut_2x_int32() [2/2]

void load_lut_2x_int32 ( const void *  lut1,
const void *  lut2,
v16uint32  offset,
chess_output v16int32 &  v1,
chess_output v16int32 &  v2 
)

Reads two 16 lane vectors of int32 from the LUTs and stores the results in v1 and v2.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data of the first read will be stored.
v2: Reference to vector in which the data of the second read will be stored.

◆ load_lut_2x_int8() [1/2]

void load_lut_2x_int8 ( const void *  lut1,
const void *  lut2,
v16int32  offset,
chess_output v64int8 &  v1,
chess_output v64int8 &  v2 
)

Reads two 64 lane vectors of int8 from the LUTs and stores the results in v1 and v2.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data of the first read will be stored.
v2: Reference to vector in which the data of the second read will be stored.

◆ load_lut_2x_int8() [2/2]

void load_lut_2x_int8 ( const void *  lut1,
const void *  lut2,
v16uint32  offset,
chess_output v64int8 &  v1,
chess_output v64int8 &  v2 
)

Reads two 64 lane vectors of int8 from the LUTs and stores the results in v1 and v2.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data of the first read will be stored.
v2: Reference to vector in which the data of the second read will be stored.

◆ load_lut_float() [1/2]

void load_lut_float ( const void *  lut1,
const void *  lut2,
v16int32  offset,
chess_output v32bfloat16 &  v1 
)

Reads a 32 lane vector of bfloat16 from the LUTs and stores the result in v1.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data will be stored.

◆ load_lut_float() [2/2]

void load_lut_float ( const void *  lut1,
const void *  lut2,
v16uint32  offset,
chess_output v32bfloat16 &  v1 
)

Reads a 32 lane vector of bfloat16 from the LUTs and stores the result in v1.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data will be stored.

◆ load_lut_int16() [1/2]

void load_lut_int16 ( const void *  lut1,
const void *  lut2,
v16int32  offset,
chess_output v32int16 &  v1 
)

Reads a 32 lane vector of int16 from the LUTs and stores the result in v1.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data will be stored.

◆ load_lut_int16() [2/2]

void load_lut_int16 ( const void *  lut1,
const void *  lut2,
v16uint32  offset,
chess_output v32int16 &  v1 
)

Reads a 32 lane vector of int16 from the LUTs and stores the result in v1.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data will be stored.

◆ load_lut_int32() [1/2]

void load_lut_int32 ( const void *  lut1,
const void *  lut2,
v16int32  offset,
chess_output v16int32 &  v1 
)

Reads a 16 lane vector of int32 from the LUTs and stores the result in v1.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data will be stored.

◆ load_lut_int32() [2/2]

void load_lut_int32 ( const void *  lut1,
const void *  lut2,
v16uint32  offset,
chess_output v16int32 &  v1 
)

Reads a 16 lane vector of int32 from the LUTs and stores the result in v1.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data will be stored.

◆ load_lut_int8() [1/2]

void load_lut_int8 ( const void *  lut1,
const void *  lut2,
v16int32  offset,
chess_output v64int8 &  v1 
)

Reads a 64 lane vector of int8 from the LUTs and stores the result in v1.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data will be stored.

◆ load_lut_int8() [2/2]

void load_lut_int8 ( const void *  lut1,
const void *  lut2,
v16uint32  offset,
chess_output v64int8 &  v1 
)

Reads a 64 lane vector of int8 from the LUTs and stores the result in v1.

Parameters
lut1: Pointer to LUT 1. Must be 256-bit aligned.
lut2: Pointer to LUT 2. Must be 256-bit aligned.
offset: Offset for generation of LUT access address.
v1: Reference to vector in which the data will be stored.