amd · AngryLoki · Apr 26, 2026
@@ -13,44 +13,42 @@ void gelu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict o
 {
     event0();
 
-    auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
-    auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
-
-    aie::vector<bfloat16, 16> input;
+    auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
+    auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);
 
     // Constants
     const bfloat16 k0_5 = 0.5f;
     const bfloat16 k1 = 1.0f;
     const bfloat16 sqrt_2_over_pi = 0.79788456f; // ≈ sqrt(2/π)
     const bfloat16 kBeta = 0.044715f;
 
-    auto v05 = aie::broadcast<bfloat16, 16>(k0_5);
-    auto v1 = aie::broadcast<bfloat16, 16>(k1);
-    auto vs2opi = aie::broadcast<bfloat16, 16>(sqrt_2_over_pi);
-    auto vBeta = aie::broadcast<bfloat16, 16>(kBeta);
+    auto v05 = aie::broadcast<bfloat16, 32>(k0_5);
+    auto v1 = aie::broadcast<bfloat16, 32>(k1);
+    auto vs2opi = aie::broadcast<bfloat16, 32>(sqrt_2_over_pi);
+    auto vBeta = aie::broadcast<bfloat16, 32>(kBeta);
 
     AIE_PREPARE_FOR_PIPELINING
     AIE_LOOP_MIN_ITERATION_COUNT(64)
-    for (int i = 0; i < vector_size; i += 16) {
-        input = *it_in++;
-        auto x = input;
+    for (int i = 0; i < vector_size; i += 32) {
+        auto x = *it_in++;
 
         // Compute x^3
-        aie::vector<bfloat16, 16> x2 = aie::mul(x, x);  // x^2
-        aie::vector<bfloat16, 16> x3 = aie::mul(x, x2); // x^3
+        aie::vector<bfloat16, 32> x2 = aie::mul(x, x);  // x^2
+        aie::vector<bfloat16, 32> x3 = aie::mul(x, x2); // x^3
 
         // inner = sqrt(2/pi) * (x + 0.044715 * x^3)
-        aie::vector<bfloat16, 16> x3_beta = aie::mul(x3, vBeta);
-        aie::vector<bfloat16, 16> inner = aie::add(x, x3_beta);
-        auto inner1 = aie::mul(inner, vs2opi);
+        aie::vector<bfloat16, 32> x3_beta = aie::mul(x3, vBeta);
+        aie::vector<bfloat16, 32> inner = aie::add(x, x3_beta);
+        aie::vector<bfloat16, 32> inner1 = aie::mul(inner, vs2opi);
 
-        // tanh_out = tanh(inner)
-        aie::vector<bfloat16, 16> tanh_out = getTanhBf16(inner1.to_vector<bfloat16>());
+        // LUT-based tanh: split to 16-wide halves
+        aie::vector<bfloat16, 16> tanh_lo = getTanhBf16(inner1.extract<16>(0));
+        aie::vector<bfloat16, 16> tanh_hi = getTanhBf16(inner1.extract<16>(1));
+        aie::vector<bfloat16, 32> tanh_out = aie::concat(tanh_lo, tanh_hi);
 
         // result = 0.5 * x * (1 + tanh_out)
-        aie::vector<bfloat16, 16> one_plus_tanh = aie::add(tanh_out, v1);
-        // Multiply by x and 0.5
-        aie::vector<bfloat16, 16> mul_v05 = aie::mul(v05, one_plus_tanh);
+        aie::vector<bfloat16, 32> one_plus_tanh = aie::add(tanh_out, v1);
+        aie::vector<bfloat16, 32> mul_v05 = aie::mul(v05, one_plus_tanh);
         auto result = aie::mul(x, mul_v05);
 
         *it_out++ = result.to_vector<bfloat16>();

@@ -15,7 +15,7 @@ void relu_vectorized_bf16(bfloat16 *restrict a, bfloat16 *restrict c, const int3
 {
     event0();
 
-    const int v_factor = 16;
+    const int v_factor = 32;
     v32bfloat16 zeroes = broadcast_zero_to_v32bfloat16();
     AIE_PREPARE_FOR_PIPELINING
     AIE_LOOP_RANGE(16, 16)

@@ -69,11 +69,11 @@ void rms_norm_general(const T *restrict input, const T *restrict input2, T *rest
 extern "C" {
 void rms_norm_bf16_vector(bfloat16 *input, bfloat16 *output, int32_t size)
 {
-    rms_norm_general<bfloat16, 16>(input, nullptr, output, size);
+    rms_norm_general<bfloat16, 32>(input, nullptr, output, size); // input2 is nullptr here; 32 (was 16) is presumably the vector width -- confirm in rms_norm_general
 }
 
 void weighted_rms_norm(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int32_t size)
 {
-    rms_norm_general<bfloat16, 16>(a_in, b_in, c_out, size);
+    rms_norm_general<bfloat16, 32>(a_in, b_in, c_out, size); // b_in is forwarded as input2; 32 (was 16) is presumably the vector width -- confirm in rms_norm_general
 }
 }
@@ -15,24 +15,27 @@ void sigmoid_tanh_approx_bf16(bfloat16 *restrict input_vector,
 {
     event0();
 
-    auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
-    auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
+    auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
+    auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);
 
-    aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
-    aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
+    aie::vector<bfloat16, 32> register_0_5 = aie::broadcast<bfloat16, 32>(0.5f);
+    aie::vector<bfloat16, 32> register_1 = aie::broadcast<bfloat16, 32>(1.0f);
     AIE_PREPARE_FOR_PIPELINING
     AIE_LOOP_MIN_ITERATION_COUNT(64)
-    for (int i = 0; i < vector_size; i += 16) {
-        // Load input vector
-        aie::vector<bfloat16, 16> input = *it_in++;
+    for (int i = 0; i < vector_size; i += 32) {
+        auto input = *it_in++;
 
-        // Compute tanh approximation
-        aie::vector<bfloat16, 16> half_x = aie::mul(input, register_0_5);
-        aie::vector<bfloat16, 16> tanh_half_x = getTanhBf16(half_x);
-        auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
-        aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
+        // Compute half_x = x * 0.5
+        aie::vector<bfloat16, 32> half_x = aie::mul(input, register_0_5);
+
+        // LUT-based tanh: split to 16-wide halves
+        aie::vector<bfloat16, 16> tanh_lo = getTanhBf16(half_x.extract<16>(0));
+        aie::vector<bfloat16, 16> tanh_hi = getTanhBf16(half_x.extract<16>(1));
+        aie::vector<bfloat16, 32> tanh_half_x = aie::concat(tanh_lo, tanh_hi);
+
+        auto one_plus = aie::add(tanh_half_x, register_1);
+        aie::vector<bfloat16, 32> sigmoid_approx = aie::mul(one_plus, register_0_5);
 
-        // Store output vector
         *it_out++ = sigmoid_approx;
     }
 

@@ -13,26 +13,28 @@ void silu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict o
 {
     event0();
 
-    auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
-    auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
+    auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
+    auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);
 
-    aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
-    aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
+    aie::vector<bfloat16, 32> register_0_5 = aie::broadcast<bfloat16, 32>(0.5f);
+    aie::vector<bfloat16, 32> register_1 = aie::broadcast<bfloat16, 32>(1.0f);
     AIE_PREPARE_FOR_PIPELINING
     AIE_LOOP_MIN_ITERATION_COUNT(64)
-    for (int i = 0; i < vector_size; i += 16) {
-        // Load input vector
-        aie::vector<bfloat16, 16> input = *it_in++;
-
-        // Compute tanh approximation
-        aie::vector<bfloat16, 16> half_x = aie::mul(input, register_0_5);
-        aie::vector<bfloat16, 16> tanh_half_x = getTanhBf16(half_x);
-        auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
-        aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
-        // Compute output: x * tanh_approx
+    for (int i = 0; i < vector_size; i += 32) {
+        auto input = *it_in++;
+
+        // Compute half_x = x * 0.5
+        aie::vector<bfloat16, 32> half_x = aie::mul(input, register_0_5);
+
+        // LUT-based tanh: split to 16-wide halves
+        aie::vector<bfloat16, 16> tanh_lo = getTanhBf16(half_x.extract<16>(0));
+        aie::vector<bfloat16, 16> tanh_hi = getTanhBf16(half_x.extract<16>(1));
+        aie::vector<bfloat16, 32> tanh_half_x = aie::concat(tanh_lo, tanh_hi);
+
+        auto one_plus = aie::add(tanh_half_x, register_1);
+        aie::vector<bfloat16, 32> sigmoid_approx = aie::mul(one_plus, register_0_5);
         auto mul_output = aie::mul(input, sigmoid_approx);
 
-        // Store output vector
         *it_out++ = mul_output.to_vector<bfloat16>();
     }
 

@@ -13,20 +13,19 @@ void tanh_bf16_vectorized(bfloat16 *restrict input_vector, bfloat16 *restrict ou
 {
     event0();
 
-    auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
-    auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
+    auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
+    auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);
 
     AIE_PREPARE_FOR_PIPELINING
     AIE_LOOP_MIN_ITERATION_COUNT(64)
-    for (int i = 0; i < vector_size; i += 16) {
-        // Load input vector
-        aie::vector<bfloat16, 16> input = *it_in++;
+    for (int i = 0; i < vector_size; i += 32) {
+        auto input = *it_in++;
 
-        // Compute tanh approximation
-        aie::vector<bfloat16, 16> tanh_x = getTanhBf16(input);
+        // LUT-based tanh: split to 16-wide halves
+        aie::vector<bfloat16, 16> tanh_lo = getTanhBf16(input.extract<16>(0));
+        aie::vector<bfloat16, 16> tanh_hi = getTanhBf16(input.extract<16>(1));
 
-        // Store output vector
-        *it_out++ = tanh_x;
+        *it_out++ = aie::concat(tanh_lo, tanh_hi);
     }
 
     event1();

@@ -12,44 +12,43 @@ void gelu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict o
 {
     event0();
 
-    auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
-    auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
-
-    aie::vector<bfloat16, 16> input;
+    auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
+    auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);
 
     // Constants
     const bfloat16 k0_5 = 0.5f;
     const bfloat16 k1 = 1.0f;
     const bfloat16 sqrt_2_over_pi = 0.79788456f; // ≈ sqrt(2/π)
     const bfloat16 kBeta = 0.044715f;
 
-    auto v05 = aie::broadcast<bfloat16, 16>(k0_5);
-    auto v1 = aie::broadcast<bfloat16, 16>(k1);
+    auto v05 = aie::broadcast<bfloat16, 32>(k0_5);
+    auto v1 = aie::broadcast<bfloat16, 32>(k1);
     auto vs2opi = aie::broadcast<bfloat16, 16>(sqrt_2_over_pi);
-    auto vBeta = aie::broadcast<bfloat16, 16>(kBeta);
+    auto vBeta = aie::broadcast<bfloat16, 32>(kBeta);
 
     AIE_PREPARE_FOR_PIPELINING
     AIE_LOOP_MIN_ITERATION_COUNT(64)
-    for (int i = 0; i < vector_size; i += 16) {
-        input = *it_in++;
-        auto x = input;
+    for (int i = 0; i < vector_size; i += 32) {
+        auto x = *it_in++;
 
         // Compute x^3
-        aie::vector<bfloat16, 16> x2 = aie::mul(x, x);  // x^2
-        aie::vector<bfloat16, 16> x3 = aie::mul(x, x2); // x^3
+        aie::vector<bfloat16, 32> x2 = aie::mul(x, x);  // x^2
+        aie::vector<bfloat16, 32> x3 = aie::mul(x, x2); // x^3
 
         // inner = sqrt(2/pi) * (x + 0.044715 * x^3)
-        aie::vector<bfloat16, 16> x3_beta = aie::mul(x3, vBeta);
-        aie::vector<bfloat16, 16> inner = aie::add(x, x3_beta);
-        auto inner1 = aie::mul(inner, vs2opi);
+        aie::vector<bfloat16, 32> x3_beta = aie::mul(x3, vBeta);
+        aie::vector<bfloat16, 32> inner = aie::add(x, x3_beta);
 
-        // tanh_out = tanh(inner)
-        auto tanh_out = aie::tanh<bfloat16>(inner1.to_vector<float>());
+        // tanh operates on 16 float lanes; split to two halves
+        auto inner1_lo = aie::mul(inner.extract<16>(0), vs2opi);
+        auto inner1_hi = aie::mul(inner.extract<16>(1), vs2opi);
+        auto tanh_lo = aie::tanh<bfloat16>(inner1_lo.to_vector<float>());
+        auto tanh_hi = aie::tanh<bfloat16>(inner1_hi.to_vector<float>());
+        aie::vector<bfloat16, 32> tanh_out = aie::concat(tanh_lo, tanh_hi);
 
         // result = 0.5 * x * (1 + tanh_out)
-        aie::vector<bfloat16, 16> one_plus_tanh = aie::add(tanh_out, v1);
-        // Multiply by x and 0.5
-        aie::vector<bfloat16, 16> mul_v05 = aie::mul(v05, one_plus_tanh);
+        aie::vector<bfloat16, 32> one_plus_tanh = aie::add(tanh_out, v1);
+        aie::vector<bfloat16, 32> mul_v05 = aie::mul(v05, one_plus_tanh);
         auto result = aie::mul(x, mul_v05);
 
         *it_out++ = result.to_vector<bfloat16>();

@@ -103,6 +103,6 @@ extern "C" {
 void layer_norm(bfloat16 *input, bfloat16 *output, int32_t cols)
 {
     ::aie::set_rounding(aie::rounding_mode::conv_even);
-    layer_norm<bfloat16, 16>(input, output, cols);
+    layer_norm<bfloat16, 32>(input, output, cols); // forwards to the templated layer_norm; 32 (was 16) is presumably the vector width -- confirm against the template
 }
 }
@@ -67,11 +67,11 @@ void rms_norm_general(const T *restrict input, const T *restrict input2, T *rest
 extern "C" {
 void rms_norm_bf16_vector(bfloat16 *input, bfloat16 *output, int32_t size)
 {
-    rms_norm_general<bfloat16, 16>(input, nullptr, output, size);
+    rms_norm_general<bfloat16, 32>(input, nullptr, output, size); // input2 is nullptr here; 32 (was 16) is presumably the vector width -- confirm in rms_norm_general
 }
 
 void weighted_rms_norm(bfloat16 *a_in, bfloat16 *b_in, bfloat16 *c_out, int32_t size)
 {
-    rms_norm_general<bfloat16, 16>(a_in, b_in, c_out, size);
+    rms_norm_general<bfloat16, 32>(a_in, b_in, c_out, size); // b_in is forwarded as input2; 32 (was 16) is presumably the vector width -- confirm in rms_norm_general
 }
 }
@@ -15,26 +15,27 @@ void sigmoid_tanh_approx_bf16(bfloat16 *restrict input_vector,
     event0();
 
     int num_elems = vector_size;
-    auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
-    auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
+    auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
+    auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);
 
-    aie::vector<bfloat16, 16> input;
-    aie::vector<bfloat16, 16> output;
     aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
-    aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
+    aie::vector<bfloat16, 32> register_1 = aie::broadcast<bfloat16, 32>(1.0f);
+    aie::vector<bfloat16, 32> register_0_5_wide = aie::broadcast<bfloat16, 32>(0.5f);
     AIE_PREPARE_FOR_PIPELINING
     AIE_LOOP_MIN_ITERATION_COUNT(64)
-    for (int i = 0; i < num_elems; i += 16) {
-        // Load input vector
-        input = *it_in++;
+    for (int i = 0; i < num_elems; i += 32) {
+        auto input = *it_in++;
 
-        // Compute tanh approximation
-        auto half_x = aie::mul(input, register_0_5);
-        auto tanh_half_x = aie::tanh<bfloat16>(half_x.to_vector<float>());
-        auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
-        aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
+        // tanh(x/2) split to two 16-wide halves
+        auto half_x_lo = aie::mul(input.extract<16>(0), register_0_5);
+        auto half_x_hi = aie::mul(input.extract<16>(1), register_0_5);
+        auto tanh_lo = aie::tanh<bfloat16>(half_x_lo.to_vector<float>());
+        auto tanh_hi = aie::tanh<bfloat16>(half_x_hi.to_vector<float>());
+        aie::vector<bfloat16, 32> tanh_half_x = aie::concat(tanh_lo, tanh_hi);
+
+        auto one_plus = aie::add(tanh_half_x, register_1);
+        aie::vector<bfloat16, 32> sigmoid_approx = aie::mul(one_plus, register_0_5_wide);
 
-        // Store output vector
         *it_out++ = sigmoid_approx;
     }
 

@@ -13,28 +13,28 @@ void silu_tanh_approx_bf16(bfloat16 *restrict input_vector, bfloat16 *restrict o
     event0();
 
     int num_elems = vector_size;
-    auto it_in = aie::begin_restrict_vector<16>((bfloat16 *)input_vector);
-    auto it_out = aie::begin_restrict_vector<16>((bfloat16 *)output_vector);
+    auto it_in = aie::begin_restrict_vector<32>((bfloat16 *)input_vector);
+    auto it_out = aie::begin_restrict_vector<32>((bfloat16 *)output_vector);
 
-    aie::vector<bfloat16, 16> input;
-    aie::vector<bfloat16, 16> output;
     aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
-    aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
+    aie::vector<bfloat16, 32> register_1 = aie::broadcast<bfloat16, 32>(1.0f);
+    aie::vector<bfloat16, 32> register_0_5_wide = aie::broadcast<bfloat16, 32>(0.5f);
     AIE_PREPARE_FOR_PIPELINING
     AIE_LOOP_MIN_ITERATION_COUNT(64)
-    for (int i = 0; i < num_elems; i += 16) {
-        // Load input vector
-        input = *it_in++;
-
-        // Compute tanh approximation
-        auto half_x = aie::mul(input, register_0_5);
-        auto tanh_half_x = aie::tanh<bfloat16>(half_x.to_vector<float>());
-        auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
-        aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
-        // Compute output: x * tanh_approx
+    for (int i = 0; i < num_elems; i += 32) {
+        auto input = *it_in++;
+
+        // tanh(x/2) split to two 16-wide halves
+        auto half_x_lo = aie::mul(input.extract<16>(0), register_0_5);
+        auto half_x_hi = aie::mul(input.extract<16>(1), register_0_5);
+        auto tanh_lo = aie::tanh<bfloat16>(half_x_lo.to_vector<float>());
+        auto tanh_hi = aie::tanh<bfloat16>(half_x_hi.to_vector<float>());
+        aie::vector<bfloat16, 32> tanh_half_x = aie::concat(tanh_lo, tanh_hi);
+
+        auto one_plus = aie::add(tanh_half_x, register_1);
+        aie::vector<bfloat16, 32> sigmoid_approx = aie::mul(one_plus, register_0_5_wide);
         auto mul_output = aie::mul(input, sigmoid_approx);
 
-        // Store output vector
         *it_out++ = mul_output.to_vector<bfloat16>();
     }