diff --git a/cmake/deps.txt b/cmake/deps.txt index e1870bf2df0cf..f8e5fb7f8ede0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -47,7 +47,7 @@ protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/downlo psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2 pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.6.zip;f780292da9db273c8ef06ccf5fd4b623624143e9 -pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/877328f188a3c7d1fa855871a278eb48d530c4c0.zip;9152d4bf6b8bde9f19b116de3bd8a745097ed9df +pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 diff --git a/cmake/vcpkg-ports/cpuinfo/portfile.cmake b/cmake/vcpkg-ports/cpuinfo/portfile.cmake index 80192840ee9b0..5aa5aebe50f3d 100644 --- a/cmake/vcpkg-ports/cpuinfo/portfile.cmake +++ b/cmake/vcpkg-ports/cpuinfo/portfile.cmake @@ -6,8 +6,8 @@ endif() vcpkg_from_github( OUT_SOURCE_PATH SOURCE_PATH REPO pytorch/cpuinfo - REF 877328f188a3c7d1fa855871a278eb48d530c4c0 - SHA512 b6d5a9ce9996eee3b2f09f39115f7ae178fe4d4814cc35b049a59d04a82228e268aa52d073c307ccb56a427428622940e1c77f004c99851dfca0d3a5d803658b + REF 403d652dca4c1046e8145950b1c0997a9f748b57 + SHA512 f7cd6dc44bd1120af610cae1337ed4c0f557ba78d2de9c73fed350fa3dfe9512643a1619ae55f5a540c6316a87d641856cca27297bb8766e48f39b7b7a59da1f HEAD_REF master 
PATCHES patch_cpuinfo_h_for_arm64ec.patch diff --git a/cmake/vcpkg-ports/cpuinfo/vcpkg.json b/cmake/vcpkg-ports/cpuinfo/vcpkg.json index f1ccda72679b1..76486eceecf12 100644 --- a/cmake/vcpkg-ports/cpuinfo/vcpkg.json +++ b/cmake/vcpkg-ports/cpuinfo/vcpkg.json @@ -1,7 +1,7 @@ { "name": "cpuinfo", - "version-date": "2025-10-23", - "port-version": 4, + "version-date": "2025-11-18", + "port-version": 5, "description": "CPU INFOrmation library (x86/x86-64/ARM/ARM64, Linux/Windows/Android/macOS/iOS)", "homepage": "https://github.com/pytorch/cpuinfo", "license": "BSD-2-Clause", diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index ceb498372a6fc..2bba0adcd987c 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/common/cpuid_info.h" // for CPUIDInfo::GetCPUIDInfo().HasArm_SME2() #include "core/common/narrow.h" #include "core/common/safeint.h" #include "core/mlas/inc/mlas.h" @@ -213,9 +212,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase { } } - // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops. - // We check that here too before attempting to use them. 
- if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME2()) { + if (!MlasIsDynamicQGemmAvailable()) { can_use_dynamic_quant_mlas_ = false; } diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc index ab8ab0b326292..f5d3b93b395e2 100644 --- a/onnxruntime/core/common/cpuid_info.cc +++ b/onnxruntime/core/common/cpuid_info.cc @@ -237,9 +237,9 @@ void CPUIDInfo::ArmLinuxInit() { #elif defined(_WIN32) // ^ defined(__linux__) void CPUIDInfo::ArmWindowsInit() { - // Read MIDR and ID_AA64ISAR1_EL1 register values from Windows registry + // Read MIDR register values from Windows registry // There should be one per CPU - std::vector<uint64_t> midr_values{}, id_aa64isar1_el1_values{}; + std::vector<uint64_t> midr_values{}; // TODO!! Don't support multiple processor group yet!! constexpr int MAX_CORES = 64; @@ -272,17 +272,7 @@ void CPUIDInfo::ArmWindowsInit() { break; } - uint64_t id_aa64isar1_el1_value; - data_size = sizeof(id_aa64isar1_el1_value); - - // CP 4031 corresponds to ID_AA64ISAR1_EL1 register - if (::RegGetValueA(HKEY_LOCAL_MACHINE, processor_subkey, "CP 4031", RRF_RT_REG_QWORD, - nullptr, &id_aa64isar1_el1_value, &data_size) != ERROR_SUCCESS) { - break; - } - midr_values.push_back(midr_value); - id_aa64isar1_el1_values.push_back(id_aa64isar1_el1_value); } // process midr_values @@ -308,22 +298,31 @@ void CPUIDInfo::ArmWindowsInit() { } } - has_arm_neon_i8mm_ = std::all_of( - id_aa64isar1_el1_values.begin(), id_aa64isar1_el1_values.end(), - [](uint64_t id_aa64isar1_el1_value) { - // I8MM, bits [55:52] - return ((id_aa64isar1_el1_value >> 52) & 0xF) != 0; - }); - - has_arm_neon_dot_ = (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) != 0); - #if defined(CPUINFO_SUPPORTED) if (pytorch_cpuinfo_init_) { + has_arm_neon_dot_ = cpuinfo_has_arm_neon_dot(); has_fp16_ = cpuinfo_has_arm_neon_fp16_arith(); - // cpuinfo_has_arm_i8mm() doesn't work on Windows yet. See https://github.com/pytorch/cpuinfo/issues/279. 
- // has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); - has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && has_arm_neon_i8mm_; + + // Note: + // cpuinfo is using IsProcessorFeaturePresent(PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE): + // https://github.com/pytorch/cpuinfo/blob/403d652dca4c1046e8145950b1c0997a9f748b57/src/arm/windows/init.c#L224-L225 + // However, on some systems (notably, a Windows ARM64 CI build agent), cpuinfo_has_arm_neon_fp16_arith() started to + // return false in the newer cpuinfo version that uses IsProcessorFeaturePresent(). Perhaps the newer + // PF_ARM_V82_FP16_INSTRUCTIONS_AVAILABLE constant is not supported yet in the Windows version on those systems. + // To avoid regressing in fp16 instructions detection, we fall back to what cpuinfo used to do, i.e., use the + // detection of dot product instructions: + // https://github.com/pytorch/cpuinfo/blob/877328f188a3c7d1fa855871a278eb48d530c4c0/src/arm/windows/init.c#L206-L209 + // This workaround can be removed when cpuinfo_has_arm_neon_fp16_arith() works correctly on all the Windows + // versions that we want to support. 
+ if (!has_fp16_) { + has_fp16_ = has_arm_neon_dot_; + } + + has_arm_neon_i8mm_ = cpuinfo_has_arm_i8mm(); + has_arm_sve_i8mm_ = cpuinfo_has_arm_sve() && cpuinfo_has_arm_i8mm(); has_arm_neon_bf16_ = cpuinfo_has_arm_neon_bf16(); + has_arm_sme_ = cpuinfo_has_arm_sme(); + has_arm_sme2_ = cpuinfo_has_arm_sme2(); } #endif // defined(CPUINFO_SUPPORTED) } @@ -397,4 +396,4 @@ CPUIDInfo::CPUIDInfo() { #endif #endif // defined(CPUIDINFO_ARCH_ARM) } -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h index 9c40627b5cd1b..ca9315c7ef95d 100644 --- a/onnxruntime/core/common/cpuid_info.h +++ b/onnxruntime/core/common/cpuid_info.h @@ -171,4 +171,4 @@ class CPUIDInfo { uint32_t vendor_id_; }; -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 9d98a15d8457a..248c6d74e6cbd 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -634,6 +634,7 @@ MlasGemm( { MlasGemmBatch(Shape, &DataParams, 1, ThreadPool); } + /** * @brief Parameters that define the shape of a dynamically quantized GEMM operation. * @@ -646,6 +647,7 @@ struct MLAS_GEMM_DYN_QUANT_SHAPE_PARAMS { size_t N = 0; /**< Column size of matrix B */ size_t K = 0; /**< Column size of matrix A and Row size of matrix B */ }; + /** * @brief Parameters that define the data buffers and layout for a dynamic quant GEMM. * @@ -680,6 +682,14 @@ MlasDynamicQGemm ( MlasDynamicQGemmBatch(Shape, DataParams, 1, ThreadPool); } +/** + * @brief Determines whether a dynamic quantized GEMM implementation is available on the current platform. + * + * MlasDynamicQGemm() and MlasDynamicQGemmBatch() should only be called if this function returns true. + */ +bool +MLASCALL +MlasIsDynamicQGemmAvailable(); // // Symmetric QGEMM has limited buffer overrun. 
diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp index a1c2e467188f7..b3eb5893ba7e7 100644 --- a/onnxruntime/core/mlas/lib/qgemm.cpp +++ b/onnxruntime/core/mlas/lib/qgemm.cpp @@ -201,6 +201,17 @@ MlasGemmBatch( }); } +bool +MLASCALL +MlasIsDynamicQGemmAvailable() +{ +#if defined(USE_KLEIDIAI) && !defined(_MSC_VER) + return ArmKleidiAI::UseSME2; +#else + return false; +#endif +} + void MLASCALL MlasDynamicQGemmBatch ( @@ -211,7 +222,7 @@ MlasDynamicQGemmBatch ( ) { #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback and putting in guards. This implementation is SME2 specific. - if(ArmKleidiAI::UseSME2){ + if (ArmKleidiAI::UseSME2) { ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool); } #endif @@ -336,7 +347,7 @@ MlasDynamicQgemmPackBSize( #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback available //TODO: Insert Override - if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){//Still require this since no override + if (ArmKleidiAI::UseSME2) { //Still require this since no override bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K); } #endif @@ -407,7 +418,7 @@ Return Value: ~(BufferAlignment - 1); // If this gemm B argument is used in a dynamically quantization gemm operation we can optimize for // this use case. Concat both packed representations for later decision. 
This allows for cases later - // where we still have the prepack at the cost of some memory otherwise we can use the qgemm quantization + // where we still have the prepack at the cost of some memory otherwise we can use the qgemm quantization // for better performance return AlignedBytesRequired + MlasDynamicQgemmPackBSize(N, K); } @@ -425,7 +436,7 @@ MlasDynamicQgemmPackB( { #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) //No fallback - if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){//Still require this since no override + if (ArmKleidiAI::UseSME2) { //Still require this since no override ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB); } #endif diff --git a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp index bebff37ad8460..8a9de636c835f 100644 --- a/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_dynamic_qgemm.cpp @@ -7,8 +7,8 @@ // Currently this test only applies to KleidiAI Guard against it running in any other situation #if defined(USE_KLEIDIAI) && !defined(_MSC_VER) +#include "mlas.h" #include "test_util.h" -#include "core/mlas/lib/mlasi.h" // for MLAS_CPUIDINFO class MlasDynamicQgemmTest { private: @@ -20,9 +20,8 @@ class MlasDynamicQgemmTest { public: void Test(size_t M, size_t N, size_t K, size_t BatchSize) { - // Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops. - if (!MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) { - GTEST_SKIP() << "MlasDynamicQGemmBatch() requires ARM64 SME2 but it was not detected. Skipping test."; + if (!MlasIsDynamicQGemmAvailable()) { + GTEST_SKIP() << "MlasDynamicQGemmBatch() is not supported on this platform. Skipping test."; } // Setup buffers for holding various data