Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 19 additions & 21 deletions aie_kernels/aie2/gelu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,44 +13,42 @@ void gelu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict o
{
event0();

auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);

aie::vector<bfloat16, 16> input;
auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);

// Constants
const bfloat16 k0_5 = 0.5f;
const bfloat16 k1 = 1.0f;
const bfloat16 sqrt_2_over_pi = 0.79788456f; // ≈ sqrt(2/π)
const bfloat16 kBeta = 0.044715f;

auto v05 = aie::broadcast<bfloat16, 16>(k0_5);
auto v1 = aie::broadcast<bfloat16, 16>(k1);
auto vs2opi = aie::broadcast<bfloat16, 16>(sqrt_2_over_pi);
auto vBeta = aie::broadcast<bfloat16, 16>(kBeta);
auto v05 = aie::broadcast<bfloat16, 32>(k0_5);
auto v1 = aie::broadcast<bfloat16, 32>(k1);
auto vs2opi = aie::broadcast<bfloat16, 32>(sqrt_2_over_pi);
auto vBeta = aie::broadcast<bfloat16, 32>(kBeta);

AIE_PREPARE_FOR_PIPELINING
AIE_LOOP_MIN_ITERATION_COUNT(64)
for (int i = 0; i < vector_size; i += 16) {
input = *it_in++;
auto x = input;
for (int i = 0; i < vector_size; i += 32) {
auto x = *it_in++;

// Compute x^3
aie::vector<bfloat16, 16> x2 = aie::mul(x, x); // x^2
aie::vector<bfloat16, 16> x3 = aie::mul(x, x2); // x^3
aie::vector<bfloat16, 32> x2 = aie::mul(x, x); // x^2
aie::vector<bfloat16, 32> x3 = aie::mul(x, x2); // x^3

// inner = sqrt(2/pi) * (x + 0.044715 * x^3)
aie::vector<bfloat16, 16> x3_beta = aie::mul(x3, vBeta);
aie::vector<bfloat16, 16> inner = aie::add(x, x3_beta);
auto inner1 = aie::mul(inner, vs2opi);
aie::vector<bfloat16, 32> x3_beta = aie::mul(x3, vBeta);
aie::vector<bfloat16, 32> inner = aie::add(x, x3_beta);
aie::vector<bfloat16, 32> inner1 = aie::mul(inner, vs2opi);

// tanh_out = tanh(inner)
aie::vector<bfloat16, 16> tanh_out = getTanhBf16(inner1.to_vector<bfloat16>());
// LUT-based tanh: split to 16-wide halves
aie::vector<bfloat16, 16> tanh_lo = getTanhBf16(inner1.extract<16>(0));
aie::vector<bfloat16, 16> tanh_hi = getTanhBf16(inner1.extract<16>(1));
aie::vector<bfloat16, 32> tanh_out = aie::concat(tanh_lo, tanh_hi);

// result = 0.5 * x * (1 + tanh_out)
aie::vector<bfloat16, 16> one_plus_tanh = aie::add(tanh_out, v1);
// Multiply by x and 0.5
aie::vector<bfloat16, 16> mul_v05 = aie::mul(v05, one_plus_tanh);
aie::vector<bfloat16, 32> one_plus_tanh = aie::add(tanh_out, v1);
aie::vector<bfloat16, 32> mul_v05 = aie::mul(v05, one_plus_tanh);
auto result = aie::mul(x, mul_v05);

*it_out++ = result.to_vector<bfloat16>();
Expand Down
2 changes: 1 addition & 1 deletion aie_kernels/aie2/relu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ void relu_vectorized_bf16(bfloat16 *restrict a, bfloat16 *restrict c, const int3
{
event0();

const int v_factor = 16;
const int v_factor = 32;
v32bfloat16 zeroes = broadcast_zero_to_v32bfloat16();
AIE_PREPARE_FOR_PIPELINING
AIE_LOOP_RANGE(16, 16)
Expand Down
4 changes: 2 additions & 2 deletions aie_kernels/aie2/rms_norm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,11 @@ void rms_norm_general(const T *restrict input, const T *restrict input2, T *rest
extern "C" {
void rms_norm_bf16_vector(bfloat16 *input, bfloat16 *output, int32_t size)
{
rms_norm_general<bfloat16, 16>(input, nullptr, output, size);
rms_norm_general<bfloat16, 32>(input, nullptr, output, size);
}

void weighted_rms_norm(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int32_t size)
{
rms_norm_general<bfloat16, 16>(a_in, b_in, c_out, size);
rms_norm_general<bfloat16, 32>(a_in, b_in, c_out, size);
}
}
29 changes: 16 additions & 13 deletions aie_kernels/aie2/sigmoid.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,24 +15,27 @@ void sigmoid_tanh_approx_bf16(bfloat16 *restrict input_vector,
{
event0();

auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);

aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
aie::vector<bfloat16, 32> register_0_5 = aie::broadcast<bfloat16, 32>(0.5f);
aie::vector<bfloat16, 32> register_1 = aie::broadcast<bfloat16, 32>(1.0f);
AIE_PREPARE_FOR_PIPELINING
AIE_LOOP_MIN_ITERATION_COUNT(64)
for (int i = 0; i < vector_size; i += 16) {
// Load input vector
aie::vector<bfloat16, 16> input = *it_in++;
for (int i = 0; i < vector_size; i += 32) {
auto input = *it_in++;

// Compute tanh approximation
aie::vector<bfloat16, 16> half_x = aie::mul(input, register_0_5);
aie::vector<bfloat16, 16> tanh_half_x = getTanhBf16(half_x);
auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
// Compute half_x = x * 0.5
aie::vector<bfloat16, 32> half_x = aie::mul(input, register_0_5);

// LUT-based tanh: split to 16-wide halves
aie::vector<bfloat16, 16> tanh_lo = getTanhBf16(half_x.extract<16>(0));
aie::vector<bfloat16, 16> tanh_hi = getTanhBf16(half_x.extract<16>(1));
aie::vector<bfloat16, 32> tanh_half_x = aie::concat(tanh_lo, tanh_hi);

auto one_plus = aie::add(tanh_half_x, register_1);
aie::vector<bfloat16, 32> sigmoid_approx = aie::mul(one_plus, register_0_5);

// Store output vector
*it_out++ = sigmoid_approx;
}

Expand Down
32 changes: 17 additions & 15 deletions aie_kernels/aie2/silu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,28 @@ void silu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict o
{
event0();

auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);

aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
aie::vector<bfloat16, 32> register_0_5 = aie::broadcast<bfloat16, 32>(0.5f);
aie::vector<bfloat16, 32> register_1 = aie::broadcast<bfloat16, 32>(1.0f);
AIE_PREPARE_FOR_PIPELINING
AIE_LOOP_MIN_ITERATION_COUNT(64)
for (int i = 0; i < vector_size; i += 16) {
// Load input vector
aie::vector<bfloat16, 16> input = *it_in++;

// Compute tanh approximation
aie::vector<bfloat16, 16> half_x = aie::mul(input, register_0_5);
aie::vector<bfloat16, 16> tanh_half_x = getTanhBf16(half_x);
auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
// Compute output: x * tanh_approx
for (int i = 0; i < vector_size; i += 32) {
auto input = *it_in++;

// Compute half_x = x * 0.5
aie::vector<bfloat16, 32> half_x = aie::mul(input, register_0_5);

// LUT-based tanh: split to 16-wide halves
aie::vector<bfloat16, 16> tanh_lo = getTanhBf16(half_x.extract<16>(0));
aie::vector<bfloat16, 16> tanh_hi = getTanhBf16(half_x.extract<16>(1));
aie::vector<bfloat16, 32> tanh_half_x = aie::concat(tanh_lo, tanh_hi);

auto one_plus = aie::add(tanh_half_x, register_1);
aie::vector<bfloat16, 32> sigmoid_approx = aie::mul(one_plus, register_0_5);
auto mul_output = aie::mul(input, sigmoid_approx);

// Store output vector
*it_out++ = mul_output.to_vector<bfloat16>();
}

Expand Down
17 changes: 8 additions & 9 deletions aie_kernels/aie2/tanh.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,20 +13,19 @@ void tanh_bf16_vectorized(bfloat16 *restrict input_vector, bfloat16 *restrict ou
{
event0();

auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);

AIE_PREPARE_FOR_PIPELINING
AIE_LOOP_MIN_ITERATION_COUNT(64)
for (int i = 0; i < vector_size; i += 16) {
// Load input vector
aie::vector<bfloat16, 16> input = *it_in++;
for (int i = 0; i < vector_size; i += 32) {
auto input = *it_in++;

// Compute tanh approximation
aie::vector<bfloat16, 16> tanh_x = getTanhBf16(input);
// LUT-based tanh: split to 16-wide halves
aie::vector<bfloat16, 16> tanh_lo = getTanhBf16(input.extract<16>(0));
aie::vector<bfloat16, 16> tanh_hi = getTanhBf16(input.extract<16>(1));

// Store output vector
*it_out++ = tanh_x;
*it_out++ = aie::concat(tanh_lo, tanh_hi);
}

event1();
Expand Down
39 changes: 19 additions & 20 deletions aie_kernels/aie2p/gelu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,44 +12,43 @@ void gelu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict o
{
event0();

auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);

aie::vector<bfloat16, 16> input;
auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);

// Constants
const bfloat16 k0_5 = 0.5f;
const bfloat16 k1 = 1.0f;
const bfloat16 sqrt_2_over_pi = 0.79788456f; // ≈ sqrt(2/π)
const bfloat16 kBeta = 0.044715f;

auto v05 = aie::broadcast<bfloat16, 16>(k0_5);
auto v1 = aie::broadcast<bfloat16, 16>(k1);
auto v05 = aie::broadcast<bfloat16, 32>(k0_5);
auto v1 = aie::broadcast<bfloat16, 32>(k1);
auto vs2opi = aie::broadcast<bfloat16, 16>(sqrt_2_over_pi);
auto vBeta = aie::broadcast<bfloat16, 16>(kBeta);
auto vBeta = aie::broadcast<bfloat16, 32>(kBeta);

AIE_PREPARE_FOR_PIPELINING
AIE_LOOP_MIN_ITERATION_COUNT(64)
for (int i = 0; i < vector_size; i += 16) {
input = *it_in++;
auto x = input;
for (int i = 0; i < vector_size; i += 32) {
auto x = *it_in++;

// Compute x^3
aie::vector<bfloat16, 16> x2 = aie::mul(x, x); // x^2
aie::vector<bfloat16, 16> x3 = aie::mul(x, x2); // x^3
aie::vector<bfloat16, 32> x2 = aie::mul(x, x); // x^2
aie::vector<bfloat16, 32> x3 = aie::mul(x, x2); // x^3

// inner = sqrt(2/pi) * (x + 0.044715 * x^3)
aie::vector<bfloat16, 16> x3_beta = aie::mul(x3, vBeta);
aie::vector<bfloat16, 16> inner = aie::add(x, x3_beta);
auto inner1 = aie::mul(inner, vs2opi);
aie::vector<bfloat16, 32> x3_beta = aie::mul(x3, vBeta);
aie::vector<bfloat16, 32> inner = aie::add(x, x3_beta);

// tanh_out = tanh(inner)
auto tanh_out = aie::tanh<bfloat16>(inner1.to_vector<float>());
// tanh operates on 16 float lanes; split to two halves
auto inner1_lo = aie::mul(inner.extract<16>(0), vs2opi);
auto inner1_hi = aie::mul(inner.extract<16>(1), vs2opi);
auto tanh_lo = aie::tanh<bfloat16>(inner1_lo.to_vector<float>());
auto tanh_hi = aie::tanh<bfloat16>(inner1_hi.to_vector<float>());
aie::vector<bfloat16, 32> tanh_out = aie::concat(tanh_lo, tanh_hi);

// result = 0.5 * x * (1 + tanh_out)
aie::vector<bfloat16, 16> one_plus_tanh = aie::add(tanh_out, v1);
// Multiply by x and 0.5
aie::vector<bfloat16, 16> mul_v05 = aie::mul(v05, one_plus_tanh);
aie::vector<bfloat16, 32> one_plus_tanh = aie::add(tanh_out, v1);
aie::vector<bfloat16, 32> mul_v05 = aie::mul(v05, one_plus_tanh);
auto result = aie::mul(x, mul_v05);

*it_out++ = result.to_vector<bfloat16>();
Expand Down
2 changes: 1 addition & 1 deletion aie_kernels/aie2p/layer_norm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,6 @@ extern "C" {
void layer_norm(bfloat16 *input, bfloat16 *output, int32_t cols)
{
::aie::set_rounding(aie::rounding_mode::conv_even);
layer_norm<bfloat16, 16>(input, output, cols);
layer_norm<bfloat16, 32>(input, output, cols);
}
}
4 changes: 2 additions & 2 deletions aie_kernels/aie2p/rms_norm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,11 @@ void rms_norm_general(const T *restrict input, const T *restrict input2, T *rest
extern "C" {
void rms_norm_bf16_vector(bfloat16 *input, bfloat16 *output, int32_t size)
{
rms_norm_general<bfloat16, 16>(input, nullptr, output, size);
rms_norm_general<bfloat16, 32>(input, nullptr, output, size);
}

void weighted_rms_norm(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int32_t size)
{
rms_norm_general<bfloat16, 16>(a_in, b_in, c_out, size);
rms_norm_general<bfloat16, 32>(a_in, b_in, c_out, size);
}
}
29 changes: 15 additions & 14 deletions aie_kernels/aie2p/sigmoid.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,27 @@ void sigmoid_tanh_approx_bf16(bfloat16 *restrict input_vector,
event0();

int num_elems = vector_size;
auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);

aie::vector<bfloat16, 16> input;
aie::vector<bfloat16, 16> output;
aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
aie::vector<bfloat16, 32> register_1 = aie::broadcast<bfloat16, 32>(1.0f);
aie::vector<bfloat16, 32> register_0_5_wide = aie::broadcast<bfloat16, 32>(0.5f);
AIE_PREPARE_FOR_PIPELINING
AIE_LOOP_MIN_ITERATION_COUNT(64)
for (int i = 0; i < num_elems; i += 16) {
// Load input vector
input = *it_in++;
for (int i = 0; i < num_elems; i += 32) {
auto input = *it_in++;

// Compute tanh approximation
auto half_x = aie::mul(input, register_0_5);
auto tanh_half_x = aie::tanh<bfloat16>(half_x.to_vector<float>());
auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
// tanh(x/2) split to two 16-wide halves
auto half_x_lo = aie::mul(input.extract<16>(0), register_0_5);
auto half_x_hi = aie::mul(input.extract<16>(1), register_0_5);
auto tanh_lo = aie::tanh<bfloat16>(half_x_lo.to_vector<float>());
auto tanh_hi = aie::tanh<bfloat16>(half_x_hi.to_vector<float>());
aie::vector<bfloat16, 32> tanh_half_x = aie::concat(tanh_lo, tanh_hi);

auto one_plus = aie::add(tanh_half_x, register_1);
aie::vector<bfloat16, 32> sigmoid_approx = aie::mul(one_plus, register_0_5_wide);

// Store output vector
*it_out++ = sigmoid_approx;
}

Expand Down
32 changes: 16 additions & 16 deletions aie_kernels/aie2p/silu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,28 @@ void silu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict o
event0();

int num_elems = vector_size;
auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);

aie::vector<bfloat16, 16> input;
aie::vector<bfloat16, 16> output;
aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
aie::vector<bfloat16, 32> register_1 = aie::broadcast<bfloat16, 32>(1.0f);
aie::vector<bfloat16, 32> register_0_5_wide = aie::broadcast<bfloat16, 32>(0.5f);
AIE_PREPARE_FOR_PIPELINING
AIE_LOOP_MIN_ITERATION_COUNT(64)
for (int i = 0; i < num_elems; i += 16) {
// Load input vector
input = *it_in++;

// Compute tanh approximation
auto half_x = aie::mul(input, register_0_5);
auto tanh_half_x = aie::tanh<bfloat16>(half_x.to_vector<float>());
auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
// Compute output: x * tanh_approx
for (int i = 0; i < num_elems; i += 32) {
auto input = *it_in++;

// tanh(x/2) split to two 16-wide halves
auto half_x_lo = aie::mul(input.extract<16>(0), register_0_5);
auto half_x_hi = aie::mul(input.extract<16>(1), register_0_5);
auto tanh_lo = aie::tanh<bfloat16>(half_x_lo.to_vector<float>());
auto tanh_hi = aie::tanh<bfloat16>(half_x_hi.to_vector<float>());
aie::vector<bfloat16, 32> tanh_half_x = aie::concat(tanh_lo, tanh_hi);

auto one_plus = aie::add(tanh_half_x, register_1);
aie::vector<bfloat16, 32> sigmoid_approx = aie::mul(one_plus, register_0_5_wide);
auto mul_output = aie::mul(input, sigmoid_approx);

// Store output vector
*it_out++ = mul_output.to_vector<bfloat16>();
}

Expand Down
Loading