Skip to content

Commit 3d98511

Browse files
committed
Merge branch 'opt-x86-gemm-5' of github.com:nihui/ncnn into opt-x86-gemm-5
2 parents 75d1441 + e60ea74 commit 3d98511

File tree

1 file changed

+12
-12
lines changed

1 file changed

+12
-12
lines changed

src/layer/x86/gemm_x86.cpp

Lines changed: 12 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -4052,7 +4052,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
4052 4052

4053 4053
outptr += 128;
4054 4054
}
4055-
#else // __AVX512F__
4055+
#else // __AVX512F__
4056 4056
for (; jj + 11 < max_jj; jj += 12)
4057 4057
{
4058 4058
__m256 _sum0;
@@ -4851,10 +4851,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
4851 4851
if (broadcast_type_C == 4)
4852 4852
{
4853 4853
__m512 _tmp = _mm512_loadu_ps(pC);
4854-
_sum0 = _mm512_permutexvar_ps(_mm512_setr_epi32(0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3), _tmp);
4855-
_sum1 = _mm512_permutexvar_ps(_mm512_setr_epi32(4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7), _tmp);
4856-
_sum2 = _mm512_permutexvar_ps(_mm512_setr_epi32(8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11), _tmp);
4857-
_sum3 = _mm512_permutexvar_ps(_mm512_setr_epi32(12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15), _tmp);
4854+
_sum0 = _mm512_permutexvar_ps(_mm512_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3), _tmp);
4855+
_sum1 = _mm512_permutexvar_ps(_mm512_setr_epi32(4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7), _tmp);
4856+
_sum2 = _mm512_permutexvar_ps(_mm512_setr_epi32(8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11), _tmp);
4857+
_sum3 = _mm512_permutexvar_ps(_mm512_setr_epi32(12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15), _tmp);
4858 4858
pC += 16;
4859 4859
}
4860 4860
}
@@ -4876,10 +4876,10 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
4876 4876

4877 4877
__m512 _pAAAA = _mm512_broadcast_f32x4(_pA);
4878 4878

4879-
__m512 _pB0 = _mm512_permutexvar_ps(_mm512_setr_epi32(0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3), _pB);
4880-
__m512 _pB1 = _mm512_permutexvar_ps(_mm512_setr_epi32(4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7), _pB);
4881-
__m512 _pB2 = _mm512_permutexvar_ps(_mm512_setr_epi32(8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11), _pB);
4882-
__m512 _pB3 = _mm512_permutexvar_ps(_mm512_setr_epi32(12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15), _pB);
4879+
__m512 _pB0 = _mm512_permutexvar_ps(_mm512_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3), _pB);
4880+
__m512 _pB1 = _mm512_permutexvar_ps(_mm512_setr_epi32(4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7), _pB);
4881+
__m512 _pB2 = _mm512_permutexvar_ps(_mm512_setr_epi32(8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11), _pB);
4882+
__m512 _pB3 = _mm512_permutexvar_ps(_mm512_setr_epi32(12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15), _pB);
4883 4883

4884 4884
_sum0 = _mm512_fmadd_ps(_pAAAA, _pB0, _sum0);
4885 4885
_sum1 = _mm512_fmadd_ps(_pAAAA, _pB1, _sum1);
@@ -4939,7 +4939,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
4939 4939

4940 4940
outptr += 64;
4941 4941
}
4942-
#else // __AVX512F__
4942+
#else // __AVX512F__
4943 4943
for (; jj + 11 < max_jj; jj += 12)
4944 4944
{
4945 4945
__m128 _sum0;
@@ -5650,7 +5650,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
5650 5650

5651 5651
outptr += 32;
5652 5652
}
5653-
#else // __AVX512F__
5653+
#else // __AVX512F__
5654 5654
for (; jj + 11 < max_jj; jj += 12)
5655 5655
{
5656 5656
__m128 _sum00;
@@ -6208,7 +6208,7 @@ static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, cons
6208 6208

6209 6209
outptr += 16;
6210 6210
}
6211-
#else // __AVX512F__
6211+
#else // __AVX512F__
6212 6212
for (; jj + 11 < max_jj; jj += 12)
6213 6213
{
6214 6214
__m128 _sum0;

0 commit comments

Comments
 (0)