test++

nihui · nihui · commit 6b709bf7a7ee · 2025-12-01T20:10:13.000+08:00
diff --git a/src/layer/x86/gemm_x86.cpp b/src/layer/x86/gemm_x86.cpp
@@ -1421,17 +1421,12 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
     }
     for (; jj + 3 < max_jj; jj += 4)
     {
-#if __AVX__
+#if __AVX__ && !__AVX512F__
         if (elempack == 8)
         {
-#if __AVX512F__
-            // assert (j + jj) % 8 == 0
-            const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 8;
-#else
             const float* p0 = (const float*)B + (j + jj) / 8 * 8 * B_hstep + k * 8;
 
             if ((j + jj) % 8 == 0)
-#endif
             {
                 for (int kk = 0; kk < max_kk; kk++)
                 {
@@ -1440,7 +1435,6 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
                     p0 += 8;
                 }
             }
-#if !__AVX512F__
             if ((j + jj) % 8 == 4)
             {
                 for (int kk = 0; kk < max_kk; kk++)
@@ -1450,9 +1444,8 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max
                     p0 += 8;
                 }
             }
-#endif // !__AVX512F__
         }
-#endif // __AVX__
+#endif // __AVX__ && !__AVX512F__
         if (elempack == 4)
         {
             const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4;
diff --git a/tests/test_gemm_oom.cpp b/tests/test_gemm_oom.cpp
@@ -363,6 +363,71 @@ static int test_gemm_4(int M, int N, int K)
 }
 #endif // NCNN_INT8
 
+static int test_gemm_nt_oom(int M, int N, int K, int transA, int transB, int output_transpose, int constantA, int constantB)
+{ // Configures a "Gemm" layer from the arguments, runs it through test_layer_oom with TEST_LAYER_ENABLE_THREADING, and returns that status.
+    ncnn::ParamDict pd;
+    pd.set(2, transA);
+    pd.set(3, transB);
+    pd.set(4, constantA);
+    pd.set(5, constantB);
+    pd.set(6, 1); // NOTE(review): presumably constantC = 1 per the ncnn Gemm param ids - confirm
+    pd.set(7, M);
+    pd.set(8, N);
+    pd.set(9, K);
+    pd.set(10, -1); // NOTE(review): presumably the C broadcast type, -1 meaning no C term - confirm
+    pd.set(14, output_transpose);
+
+    std::vector<ncnn::Mat> weights; // operands flagged constant are handed over through this weights list
+    if (constantA) weights.push_back(transA ? ncnn::Mat(M, K) : ncnn::Mat(K, M));
+    if (constantB) weights.push_back(transB ? ncnn::Mat(K, N) : ncnn::Mat(N, K));
+
+    std::vector<ncnn::Mat> a; // operands not flagged constant go into this list instead
+    if (!constantA) a.push_back(transA ? ncnn::Mat(M, K) : ncnn::Mat(K, M));
+    if (!constantB) a.push_back(transB ? ncnn::Mat(K, N) : ncnn::Mat(N, K));
+
+    for (size_t i = 0; i < weights.size(); i++)
+    {
+        Randomize(weights[i]);
+    }
+
+    for (size_t i = 0; i < a.size(); i++)
+    {
+        Randomize(a[i]);
+    }
+
+    int ret = test_layer_oom("Gemm", pd, weights, a, 1, TEST_LAYER_ENABLE_THREADING); // this flag is what test_layer_oom_opt checks to use get_physical_big_cpu_count() threads instead of 1
+    if (ret != 0) // non-zero status: print the failing configuration to stderr
+    {
+        fprintf(stderr, "test_gemm_nt_oom failed M=%d N=%d K=%d transA=%d transB=%d output_transpose=%d constantA=%d constantB=%d\n", M, N, K, transA, transB, output_transpose, constantA, constantB);
+    }
+
+    return ret;
+}
+
+static int test_gemm_5(int M, int N, int K)
+{ // Runs test_gemm_nt_oom for all 16 (transA, transB, constantA, constantB) combinations; || stops at the first non-zero result.
+    return 0 // trailing argument order below: transA, transB, output_transpose, constantA, constantB
+           || test_gemm_nt_oom(M, N, K, 0, 0, 0, 0, 0) // group: constantA=0, constantB=0
+           || test_gemm_nt_oom(M, N, K, 0, 1, 0, 0, 0)
+           || test_gemm_nt_oom(M, N, K, 1, 0, 1, 0, 0)
+           || test_gemm_nt_oom(M, N, K, 1, 1, 1, 0, 0)
+
+           || test_gemm_nt_oom(M, N, K, 0, 0, 1, 1, 0) // group: constantA=1, constantB=0
+           || test_gemm_nt_oom(M, N, K, 0, 1, 1, 1, 0)
+           || test_gemm_nt_oom(M, N, K, 1, 0, 0, 1, 0)
+           || test_gemm_nt_oom(M, N, K, 1, 1, 0, 1, 0)
+
+           || test_gemm_nt_oom(M, N, K, 0, 0, 0, 0, 1) // group: constantA=0, constantB=1
+           || test_gemm_nt_oom(M, N, K, 0, 1, 1, 0, 1)
+           || test_gemm_nt_oom(M, N, K, 1, 0, 0, 0, 1)
+           || test_gemm_nt_oom(M, N, K, 1, 1, 1, 0, 1)
+
+           || test_gemm_nt_oom(M, N, K, 0, 0, 1, 1, 1) // group: constantA=1, constantB=1
+           || test_gemm_nt_oom(M, N, K, 0, 1, 0, 1, 1)
+           || test_gemm_nt_oom(M, N, K, 1, 0, 1, 1, 1)
+           || test_gemm_nt_oom(M, N, K, 1, 1, 0, 1, 1); // the || chain yields a bool, so the returned int is 1 on any failure and 0 otherwise
+}
+
 int main()
 {
     SRAND(7767517);
@@ -391,6 +456,10 @@ int main()
         if (ret2 != 0)
             return ret2;
 #endif
+
+        int ret3 = test_gemm_5(M, N, K); // status of the test_gemm_5 checks for this (M, N, K)
+        if (ret3 != 0)
+            return ret3; // return this check's own status, mirroring the ret2 handling above, so a failure is not masked
     }
 
     return 0;
diff --git a/tests/testutil.cpp b/tests/testutil.cpp
@@ -1819,9 +1819,13 @@ int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const
     op->load_model(mb);
 
     ncnn::Option opt = _opt;
-    opt.num_threads = 1;
     opt.use_vulkan_compute = false;
 
+    if (flag & TEST_LAYER_ENABLE_THREADING)
+        opt.num_threads = ncnn::get_physical_big_cpu_count();
+    else
+        opt.num_threads = 1;
+
     op->create_pipeline(opt);
 
     if (!op->support_packing && _opt.use_packing_layout)
@@ -1993,9 +1997,13 @@ int test_layer_oom_opt(const char* layer_type, const ncnn::ParamDict& pd, const
     op->load_model(mb);
 
     ncnn::Option opt = _opt;
-    opt.num_threads = 1;
     opt.use_vulkan_compute = false;
 
+    if (flag & TEST_LAYER_ENABLE_THREADING)
+        opt.num_threads = ncnn::get_physical_big_cpu_count();
+    else
+        opt.num_threads = 1;
+
     op->create_pipeline(opt);
 
     if (!op->support_packing && _opt.use_packing_layout)

Original file line number	Diff line number	Diff line change
`@@ -1421,17 +1421,12 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max`
`1421`	`1421`	`}`
`1422`	`1422`	`for (; jj + 3 < max_jj; jj += 4)`
`1423`	`1423`	`{`
`1424`		`-#if __AVX__`
	`1424`	`+#if __AVX__ && !__AVX512F__`
`1425`	`1425`	`if (elempack == 8)`
`1426`	`1426`	`{`
`1427`		`-#if __AVX512F__`
`1428`		`- // assert (j + jj) % 8 == 0`
`1429`		`- const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 8;`
`1430`		`-#else`
`1431`	`1427`	`const float* p0 = (const float*)B + (j + jj) / 8 * 8 * B_hstep + k * 8;`
`1432`	`1428`
`1433`	`1429`	`if ((j + jj) % 8 == 0)`
`1434`		`-#endif`
`1435`	`1430`	`{`
`1436`	`1431`	`for (int kk = 0; kk < max_kk; kk++)`
`1437`	`1432`	`{`
`@@ -1440,7 +1435,6 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max`
`1440`	`1435`	`p0 += 8;`
`1441`	`1436`	`}`
`1442`	`1437`	`}`
`1443`		`-#if !__AVX512F__`
`1444`	`1438`	`if ((j + jj) % 8 == 4)`
`1445`	`1439`	`{`
`1446`	`1440`	`for (int kk = 0; kk < max_kk; kk++)`
`@@ -1450,9 +1444,8 @@ static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max`
`1450`	`1444`	`p0 += 8;`
`1451`	`1445`	`}`
`1452`	`1446`	`}`
`1453`		`-#endif // !__AVX512F__`
`1454`	`1447`	`}`
`1455`		`-#endif // __AVX__`
	`1448`	`+#endif // __AVX__ && !__AVX512F__`
`1456`	`1449`	`if (elempack == 4)`
`1457`	`1450`	`{`
`1458`	`1451`	`const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4;`