@@ -4363,7 +4363,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
43634363
43644364 outptr += 128 ;
43654365 }
4366- #else // __AVX512F__
4366+ #else // __AVX512F__
43674367 for (; jj + 11 < max_jj; jj += 12 )
43684368 {
43694369 __m256 _sum0;
@@ -5162,10 +5162,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
51625162 if (broadcast_type_C == 4 )
51635163 {
51645164 __m512 _tmp = _mm512_loadu_ps (pC);
5165- _sum0 = _mm512_permutexvar_ps (_mm512_setr_epi32 (0 ,0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ), _tmp);
5166- _sum1 = _mm512_permutexvar_ps (_mm512_setr_epi32 (4 ,4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 ), _tmp);
5167- _sum2 = _mm512_permutexvar_ps (_mm512_setr_epi32 (8 ,8 , 8 , 8 , 9 , 9 , 9 , 9 , 10 ,10 ,10 ,10 ,11 ,11 ,11 ,11 ), _tmp);
5168- _sum3 = _mm512_permutexvar_ps (_mm512_setr_epi32 (12 ,12 ,12 ,12 ,13 ,13 ,13 ,13 ,14 ,14 ,14 ,14 ,15 ,15 ,15 ,15 ), _tmp);
5165+ _sum0 = _mm512_permutexvar_ps (_mm512_setr_epi32 (0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ), _tmp);
5166+ _sum1 = _mm512_permutexvar_ps (_mm512_setr_epi32 (4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 ), _tmp);
5167+ _sum2 = _mm512_permutexvar_ps (_mm512_setr_epi32 (8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 10 , 10 , 10 , 10 , 11 , 11 , 11 , 11 ), _tmp);
5168+ _sum3 = _mm512_permutexvar_ps (_mm512_setr_epi32 (12 , 12 , 12 , 12 , 13 , 13 , 13 , 13 , 14 , 14 , 14 , 14 , 15 , 15 , 15 , 15 ), _tmp);
51695169 pC += 16 ;
51705170 }
51715171 }
@@ -5187,10 +5187,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
51875187
51885188 __m512 _pAAAA = _mm512_broadcast_f32x4 (_pA);
51895189
5190- __m512 _pB0 = _mm512_permutexvar_ps (_mm512_setr_epi32 (0 ,0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ), _pB);
5191- __m512 _pB1 = _mm512_permutexvar_ps (_mm512_setr_epi32 (4 ,4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 ), _pB);
5192- __m512 _pB2 = _mm512_permutexvar_ps (_mm512_setr_epi32 (8 ,8 , 8 , 8 , 9 , 9 , 9 , 9 , 10 ,10 ,10 ,10 ,11 ,11 ,11 ,11 ), _pB);
5193- __m512 _pB3 = _mm512_permutexvar_ps (_mm512_setr_epi32 (12 ,12 ,12 ,12 ,13 ,13 ,13 ,13 ,14 ,14 ,14 ,14 ,15 ,15 ,15 ,15 ), _pB);
5190+ __m512 _pB0 = _mm512_permutexvar_ps (_mm512_setr_epi32 (0 , 0 , 0 , 0 , 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , 3 , 3 , 3 , 3 ), _pB);
5191+ __m512 _pB1 = _mm512_permutexvar_ps (_mm512_setr_epi32 (4 , 4 , 4 , 4 , 5 , 5 , 5 , 5 , 6 , 6 , 6 , 6 , 7 , 7 , 7 , 7 ), _pB);
5192+ __m512 _pB2 = _mm512_permutexvar_ps (_mm512_setr_epi32 (8 , 8 , 8 , 8 , 9 , 9 , 9 , 9 , 10 , 10 , 10 , 10 , 11 , 11 , 11 , 11 ), _pB);
5193+ __m512 _pB3 = _mm512_permutexvar_ps (_mm512_setr_epi32 (12 , 12 , 12 , 12 , 13 , 13 , 13 , 13 , 14 , 14 , 14 , 14 , 15 , 15 , 15 , 15 ), _pB);
51945194
51955195 _sum0 = _mm512_fmadd_ps (_pAAAA, _pB0, _sum0);
51965196 _sum1 = _mm512_fmadd_ps (_pAAAA, _pB1, _sum1);
@@ -5250,7 +5250,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
52505250
52515251 outptr += 64 ;
52525252 }
5253- #else // __AVX512F__
5253+ #else // __AVX512F__
52545254 for (; jj + 11 < max_jj; jj += 12 )
52555255 {
52565256 __m128 _sum0;
@@ -5961,7 +5961,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
59615961
59625962 outptr += 32 ;
59635963 }
5964- #else // __AVX512F__
5964+ #else // __AVX512F__
59655965 for (; jj + 11 < max_jj; jj += 12 )
59665966 {
59675967 __m128 _sum00;
@@ -6519,7 +6519,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
65196519
65206520 outptr += 16 ;
65216521 }
6522- #else // __AVX512F__
6522+ #else // __AVX512F__
65236523 for (; jj + 11 < max_jj; jj += 12 )
65246524 {
65256525 __m128 _sum0;
0 commit comments