@@ -4052,7 +4052,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
40524052
40534053 outptr += 128 ;
40544054 }
4055- #else // __AVX512F__
4055+ #else // __AVX512F__
40564056 for (; jj + 11 < max_jj; jj += 12 )
40574057 {
40584058 __m256 _sum0;
@@ -4851,10 +4851,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
48514851 if (broadcast_type_C == 4 )
48524852 {
48534853 __m512 _tmp = _mm512_loadu_ps (pC);
4854- _sum0 = _mm512_permutexvar_ps (_mm512_setr_epi32 (0 ,0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ), _tmp);
4855- _sum1 = _mm512_permutexvar_ps (_mm512_setr_epi32 (4 ,4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 ), _tmp);
4856- _sum2 = _mm512_permutexvar_ps (_mm512_setr_epi32 (8 ,8 , 8 , 8 , 9 , 9 , 9 , 9 , 10 ,10 ,10 ,10 ,11 ,11 ,11 ,11 ), _tmp);
4857- _sum3 = _mm512_permutexvar_ps (_mm512_setr_epi32 (12 ,12 ,12 ,12 ,13 ,13 ,13 ,13 ,14 ,14 ,14 ,14 ,15 ,15 ,15 ,15 ), _tmp);
4854+ _sum0 = _mm512_permutexvar_ps (_mm512_setr_epi32 (0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ), _tmp);
4855+ _sum1 = _mm512_permutexvar_ps (_mm512_setr_epi32 (4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 ), _tmp);
4856+ _sum2 = _mm512_permutexvar_ps (_mm512_setr_epi32 (8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 10 , 10 , 10 , 10 , 11 , 11 , 11 , 11 ), _tmp);
4857+ _sum3 = _mm512_permutexvar_ps (_mm512_setr_epi32 (12 , 12 , 12 , 12 , 13 , 13 , 13 , 13 , 14 , 14 , 14 , 14 , 15 , 15 , 15 , 15 ), _tmp);
48584858 pC += 16 ;
48594859 }
48604860 }
@@ -4876,10 +4876,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
48764876
48774877 __m512 _pAAAA = _mm512_broadcast_f32x4 (_pA);
48784878
4879- __m512 _pB0 = _mm512_permutexvar_ps (_mm512_setr_epi32 (0 ,0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ), _pB);
4880- __m512 _pB1 = _mm512_permutexvar_ps (_mm512_setr_epi32 (4 ,4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 ), _pB);
4881- __m512 _pB2 = _mm512_permutexvar_ps (_mm512_setr_epi32 (8 ,8 , 8 , 8 , 9 , 9 , 9 , 9 , 10 ,10 ,10 ,10 ,11 ,11 ,11 ,11 ), _pB);
4882- __m512 _pB3 = _mm512_permutexvar_ps (_mm512_setr_epi32 (12 ,12 ,12 ,12 ,13 ,13 ,13 ,13 ,14 ,14 ,14 ,14 ,15 ,15 ,15 ,15 ), _pB);
4879+ __m512 _pB0 = _mm512_permutexvar_ps (_mm512_setr_epi32 (0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ), _pB);
4880+ __m512 _pB1 = _mm512_permutexvar_ps (_mm512_setr_epi32 (4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 ), _pB);
4881+ __m512 _pB2 = _mm512_permutexvar_ps (_mm512_setr_epi32 (8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 10 , 10 , 10 , 10 , 11 , 11 , 11 , 11 ), _pB);
4882+ __m512 _pB3 = _mm512_permutexvar_ps (_mm512_setr_epi32 (12 , 12 , 12 , 12 , 13 , 13 , 13 , 13 , 14 , 14 , 14 , 14 , 15 , 15 , 15 , 15 ), _pB);
48834883
48844884 _sum0 = _mm512_fmadd_ps (_pAAAA, _pB0, _sum0);
48854885 _sum1 = _mm512_fmadd_ps (_pAAAA, _pB1, _sum1);
@@ -4939,7 +4939,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
49394939
49404940 outptr += 64 ;
49414941 }
4942- #else // __AVX512F__
4942+ #else // __AVX512F__
49434943 for (; jj + 11 < max_jj; jj += 12 )
49444944 {
49454945 __m128 _sum0;
@@ -5650,7 +5650,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
56505650
56515651 outptr += 32 ;
56525652 }
5653- #else // __AVX512F__
5653+ #else // __AVX512F__
56545654 for (; jj + 11 < max_jj; jj += 12 )
56555655 {
56565656 __m128 _sum00;
@@ -6208,7 +6208,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
62086208
62096209 outptr += 16 ;
62106210 }
6211- #else // __AVX512F__
6211+ #else // __AVX512F__
62126212 for (; jj + 11 < max_jj; jj += 12 )
62136213 {
62146214 __m128 _sum0;
0 commit comments