microsoft
diff --git a/.github/workflows/react_native.yml
Lines changed: 23 additions & 70 deletions b/.github/workflows/react_native.yml
Lines changed: 23 additions & 70 deletions
diff --git a/.github/workflows/reusable_linux_build.yml
Lines changed: 1 addition & 0 deletions b/.github/workflows/reusable_linux_build.yml
Lines changed: 1 addition & 0 deletions
diff --git a/.github/workflows/windows_x86.yml
Lines changed: 1 addition & 1 deletion b/.github/workflows/windows_x86.yml
Lines changed: 1 addition & 1 deletion
diff --git a/cmake/CMakeLists.txt
Lines changed: 6 additions & 1 deletion b/cmake/CMakeLists.txt
Lines changed: 6 additions & 1 deletion
diff --git a/cmake/deps.txt
Lines changed: 1 addition & 1 deletion b/cmake/deps.txt
Lines changed: 1 addition & 1 deletion
diff --git a/cmake/onnxruntime_mlas.cmake
Lines changed: 10 additions & 0 deletions b/cmake/onnxruntime_mlas.cmake
Lines changed: 10 additions & 0 deletions
diff --git a/cmake/onnxruntime_unittests.cmake
Lines changed: 4 additions & 0 deletions b/cmake/onnxruntime_unittests.cmake
Lines changed: 4 additions & 0 deletions
diff --git a/docs/ContribOperators.md
Lines changed: 17 additions & 2 deletions b/docs/ContribOperators.md
Lines changed: 17 additions & 2 deletions
diff --git a/docs/OperatorKernels.md
Lines changed: 2 additions & 2 deletions b/docs/OperatorKernels.md
Lines changed: 2 additions & 2 deletions
diff --git a/docs/c_cxx/Doxyfile
Lines changed: 1 addition & 0 deletions b/docs/c_cxx/Doxyfile
Lines changed: 1 addition & 0 deletions
@@ -53,14 +53,11 @@ jobs:
           cp tools/ci_build/github/js/react_native_e2e_full_aar_build_settings.json ${{ runner.temp }}/.build_settings/build_settings.json
 
           python3 -m pip install --user -r ${{ github.workspace }}/tools/ci_build/requirements/pybind/requirements.txt
-          
-          python3 ${{ github.workspace }}/tools/ci_build/github/android/build_aar_package.py --build_dir ${{ runner.temp }} --config Release --android_sdk_path $ANDROID_SDK_ROOT --android_ndk_path $ANDROID_NDK_ROOT ${{ runner.temp }}/.build_settings/build_settings.json      
+
+          python3 ${{ github.workspace }}/tools/ci_build/github/android/build_aar_package.py --build_dir ${{ runner.temp }} --config Release --android_sdk_path $ANDROID_SDK_ROOT --android_ndk_path $ANDROID_NDK_ROOT ${{ runner.temp }}/.build_settings/build_settings.json
 
           # Copy the built artifacts to the given folder for publishing
-          BASE_PATH=${{ runner.temp }}/aar_out/Release/com/microsoft/onnxruntime/onnxruntime-android/${OnnxRuntimeVersion}
-          cp ${BASE_PATH}/*.jar  ${{ runner.temp }}/artifacts
-          cp ${BASE_PATH}/*.aar          ${{ runner.temp }}/artifacts
-          cp ${BASE_PATH}/*.pom          ${{ runner.temp }}/artifacts          
+          cp -r ${{ runner.temp }}/aar_out/Release/com ${{ runner.temp }}/artifacts
 
       - name: Upload Android AAR Artifact
         uses: actions/upload-artifact@v5
@@ -109,10 +106,8 @@ jobs:
 
       - name: Copy AAR to React Native and E2E directories
         run: |
-          mkdir -p ${{ github.workspace }}/js/react_native/android/libs
-          cp ${{ runner.temp }}/android-full-aar/*.aar ${{ github.workspace }}/js/react_native/android/libs
           mkdir -p ${{ github.workspace }}/js/react_native/e2e/android/app/libs
-          cp ${{ runner.temp }}/android-full-aar/*.aar ${{ github.workspace }}/js/react_native/e2e/android/app/libs
+          cp -r ${{ runner.temp }}/android-full-aar/com ${{ github.workspace }}/js/react_native/e2e/android/app/libs
 
       - name: Install dependencies and bootstrap
         run: |
@@ -141,10 +136,6 @@ jobs:
         with:
           ndk-version: 28.0.13004108
 
-      - name: Run React Native Android Instrumented Tests
-        run: ./gradlew connectedDebugAndroidTest --stacktrace
-        working-directory: ${{ github.workspace }}/js/react_native/android
-
       - name: Run React Native Detox Android e2e Tests
         run: |
           JEST_JUNIT_OUTPUT_FILE=${{ github.workspace }}/js/react_native/e2e/android-test-results.xml \
@@ -169,6 +160,15 @@ jobs:
             echo "Emulator PID file was expected to exist but does not."
           fi
 
+      - name: Upload Android Test Results
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: android-test-results
+          path: |
+            ${{ github.workspace }}/js/react_native/e2e/android-test-results.xml
+            ${{ github.workspace }}/js/react_native/e2e/artifacts
+
   react_native_ci_ios_build:
     name: React Native CI iOS Build
     runs-on: macos-14
@@ -211,62 +211,6 @@ jobs:
           name: ios_pod
           path: ${{ runner.temp }}/ios_pod
 
-  react_native_ci_ios_unit_tests:
-    name: React Native CI iOS Unit Tests
-    needs: react_native_ci_ios_build
-    runs-on: macos-14
-    timeout-minutes: 90
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v5
-
-      - name: Download iOS pod artifact
-        uses: actions/download-artifact@v6
-        with:
-          name: ios_pod
-          path: ${{ runner.temp }}/ios_pod
-
-      - name: Use Xcode 15.3.0
-        run: sudo xcode-select --switch /Applications/Xcode_15.3.0.app/Contents/Developer
-
-      - name: Use Node.js 22.x
-        uses: actions/setup-node@v6
-        with:
-          node-version: '22.x'
-          
-      - uses: microsoft/onnxruntime-github-actions/setup-build-tools@v0.0.9
-        with:
-          vcpkg-version: '2025.06.13'
-          vcpkg-hash: 735923258c5187966698f98ce0f1393b8adc6f84d44fd8829dda7db52828639331764ecf41f50c8e881e497b569f463dbd02dcb027ee9d9ede0711102de256cc
-          cmake-version: '3.31.8'
-          cmake-hash: 99cc9c63ae49f21253efb5921de2ba84ce136018abf08632c92c060ba91d552e0f6acc214e9ba8123dee0cf6d1cf089ca389e321879fd9d719a60d975bcffcc8
-          add-cmake-to-path: 'true'
-          disable-terrapin: 'true'
-
-      - name: Install dependencies and bootstrap
-        run: |
-          npm ci
-        working-directory: ${{ github.workspace }}/js
-      - run: npm ci
-        working-directory: ${{ github.workspace }}/js/common
-      - run: |
-          set -e -x
-          npm ci
-          npm run bootstrap-no-pods
-        working-directory: ${{ github.workspace }}/js/react_native
-
-      - name: Pod install
-        run: |
-          set -e -x
-          ls ${{ runner.temp }}/ios_pod/onnxruntime-c
-          ORT_C_LOCAL_POD_PATH=${{ runner.temp }}/ios_pod/onnxruntime-c pod install --verbose
-        working-directory: ${{ github.workspace }}/js/react_native/ios
-
-      - name: Run React Native iOS Instrumented Tests
-        run: |
-          /usr/bin/xcodebuild -sdk iphonesimulator -configuration Debug -workspace ${{ github.workspace }}/js/react_native/ios/OnnxruntimeModule.xcworkspace -scheme OnnxruntimeModuleTest -destination 'platform=iOS Simulator,name=iPhone 15,OS=17.4' test CODE_SIGNING_ALLOWED=NO
-        working-directory: ${{ github.workspace }}/js/react_native/ios
-
   react_native_ci_ios_e2e_tests:
     name: React Native CI iOS E2E Tests
     needs: react_native_ci_ios_build
@@ -314,7 +258,7 @@ jobs:
           npm ci
           npm run bootstrap-no-pods
         working-directory: ${{ github.workspace }}/js/react_native
-      
+
       - name: Pod install for e2e tests
         run: |
           set -e -x
@@ -331,3 +275,12 @@ jobs:
                          --loglevel verbose \
                          --take-screenshots failing
         working-directory: ${{ github.workspace }}/js/react_native/e2e
+
+      - name: Upload iOS Test Results
+        if: always()
+        uses: actions/upload-artifact@v5
+        with:
+          name: ios-test-results
+          path: |
+            ${{ github.workspace }}/js/react_native/e2e/ios-test-results.xml
+            ${{ github.workspace }}/js/react_native/e2e/artifacts
@@ -78,6 +78,7 @@ jobs:
         uses: actions/checkout@v5
 
       - name: Set up Python ${{ inputs.python_version }}
+        if: inputs.architecture != 'arm64'
         uses: actions/setup-python@v6
         with:
           python-version: ${{ inputs.python_version }}
 
@@ -61,7 +61,7 @@ jobs:
         working-directory: ${{ github.workspace }}
 
       - name: Use .NET 8.x
-        uses: actions/setup-dotnet@v5
+        uses: actions/setup-dotnet@v3
         with:
           dotnet-version: '8.x'
         env:
 
@@ -1454,7 +1454,12 @@ if (onnxruntime_USE_CUDA)
     message(STATUS "CUDA Toolkit version is greater or equal than 12.8, enable -DENABLE_FP4 flag")
   endif()
 
-  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
+  if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all -compress-mode=size")
+  else()
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
+  endif()
+
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch")
 
 
@@ -56,5 +56,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
 cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
 dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
-kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.10.0.tar.gz;11b62149cb2514b3b9069cc435c3aa7a4e82b97a
+kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.15.0.tar.gz;62ccd24ab60bcef68766440fb42d79071ac2a5d2
 duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794
@@ -5,6 +5,7 @@ set(MLAS_ROOT ${ONNXRUNTIME_ROOT}/core/mlas)
 set(MLAS_SRC_DIR ${MLAS_ROOT}/lib)
 set(MLAS_INC_DIR ${MLAS_ROOT}/inc)
 
+
 # mlas_private_compile_definitions contains compile definitions that are private to onnxruntime_mlas and targets which
 # use internal MLAS headers like mlasi.h.
 set(mlas_private_compile_definitions)
@@ -285,6 +286,15 @@ function(setup_kleidiai)
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
   set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)
 
+  # Enabling onnxruntime_KLEIDIAI_DEBUG_LOGGING implies both DEBUG and KERNEL messages.
+  if(onnxruntime_KLEIDIAI_DEBUG_LOGGING)
+    target_compile_definitions(onnxruntime_mlas PRIVATE KLEIDIAI_DEBUG=1)
+    target_compile_definitions(onnxruntime_mlas PRIVATE KLEIDIAI_KERNEL=1)
+  endif()
+  if(onnxruntime_KLEIDIAI_KERNEL_LOGGING)
+    target_compile_definitions(onnxruntime_mlas PRIVATE KLEIDIAI_KERNEL=1)
+  endif()
+
   if (NOT onnxruntime_BUILD_SHARED_LIB)
     install(TARGETS kleidiai EXPORT ${PROJECT_NAME}Targets
     ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
 
@@ -1616,6 +1616,10 @@ endif()
       target_compile_definitions(onnxruntime_shared_lib_test PRIVATE USE_DUMMY_EXA_DEMANGLE=1)
     endif()
 
+    if (CMAKE_SYSTEM_NAME MATCHES "AIX" AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+      set_target_properties(onnxruntime_shared_lib_test PROPERTIES ENABLE_EXPORTS 1)
+    endif()
+
     if (IOS)
       add_custom_command(
         TARGET onnxruntime_shared_lib_test POST_BUILD
 
@@ -4534,10 +4534,19 @@ This version of the operator has been available since version 1 of the 'com.micr
 
   Quantized mixture of experts (MoE).
 
-        Only weights are quantized with symmetric quantization.
         The quantized weights are stored in column major order per expert.
         The quantization block size can be specified. If not provided, column wise quantization is used.
 
+        The formula of linear dequantization of the quantized weights using scale and (optionally) zero-point is:
+          dequantized_weight = (quantized_weight - zero_point) * scale
+        When zero_point is not provided, the default value is 2^(bits-1): 8 for 4 bits, 128 for 8 bits.
+  
+        If block_size is provided, both hidden_size and inter_size must be divisible by the block size, and
+        the dequantization is performed per block of size block_size along the K (input feature) dimension.
+  
+        If block_size and zero_point are provided, both hidden_size and inter_size must be divisible by block_size * pack_size,
+        where pack_size = 8 / expert_weight_bits.
+  
         The SwiGLU (Swish-Gated Linear Unit) activation function is like:
            g = xW + b
            l = xV + c
@@ -4579,7 +4588,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>Whether to use sparse mixer</dd>
 </dl>
 
-#### Inputs (7 - 11)
+#### Inputs (7 - 14)
 
 <dl>
 <dt><tt>input</tt> : T</dt>
@@ -4604,6 +4613,12 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>2D optional tensor with shape (num_experts, inter_size), or 3D optional tensor with shape (num_experts, inter_size, hidden_size / block_size) when block_size is provided.</dd>
 <dt><tt>fc3_experts_bias</tt> (optional) : T</dt>
 <dd>2D optional tensor with shape (num_experts, inter_size)</dd>
+<dt><tt>fc1_zero_points</tt> (optional) : T1</dt>
+<dd>2D optional tensor with shape (num_experts, fusion_size * inter_size / pack_size), or 3D optional tensor with shape (num_experts, fusion_size * inter_size, hidden_size / block_size / pack_size) when block_size is provided.</dd>
+<dt><tt>fc2_zero_points</tt> (optional) : T1</dt>
+<dd>2D optional tensor with shape (num_experts, hidden_size / pack_size), or 3D optional tensor with shape (num_experts, hidden_size, inter_size / block_size / pack_size) when block_size is provided.</dd>
+<dt><tt>fc3_zero_points</tt> (optional) : T1</dt>
+<dd>2D optional tensor with shape (num_experts, inter_size / pack_size), or 3D optional tensor with shape (num_experts, inter_size, hidden_size / block_size / pack_size) when block_size is provided.</dd>
 </dl>
 
 #### Outputs
 
@@ -584,7 +584,7 @@ Do not modify directly.*
 |QLinearSigmoid|*in* X:**T**<br> *in* X_scale:**tensor(float)**<br> *in* X_zero_point:**T**<br> *in* Y_scale:**tensor(float)**<br> *in* Y_zero_point:**T**<br> *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
 |QLinearSoftmax|*in* X:**T**<br> *in* X_scale:**tensor(float)**<br> *in* x_zero_point:**T**<br> *in* y_scale:**tensor(float)**<br> *in* y_zero_point:**T**<br> *out* Y:**T**|1+|**T** = tensor(int8), tensor(uint8)|
 |QLinearWhere|*in* condition:**B**<br> *in* X:**T**<br> *in* x_scale:**TF**<br> *in* x_zero_point:**T**<br> *in* Y:**T**<br> *in* y_scale:**TF**<br> *in* y_zero_point:**T**<br> *in* z_scale:**TF**<br> *in* z_zero_point:**T**<br> *out* Z:**T**|1+|**T** = tensor(int8), tensor(uint8)|
-|QMoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T1**<br> *in* fc1_scales:**T2**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_weights:**T1**<br> *in* fc2_scales:**T2**<br> *in* fc2_experts_bias:**T**<br> *in* fc3_experts_weights:**T1**<br> *in* fc3_scales:**T2**<br> *in* fc3_experts_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)<br/> **T1** = tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
+|QMoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T1**<br> *in* fc1_scales:**T2**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_weights:**T1**<br> *in* fc2_scales:**T2**<br> *in* fc2_experts_bias:**T**<br> *in* fc3_experts_weights:**T1**<br> *in* fc3_scales:**T2**<br> *in* fc3_experts_bias:**T**<br> *in* fc1_zero_points:**T1**<br> *in* fc2_zero_points:**T1**<br> *in* fc3_zero_points:**T1**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)<br/> **T1** = tensor(uint8)<br/> **T2** = tensor(float), tensor(float16)|
 |QuantizeLinear|*in* x:**T1**<br> *in* y_scale:**T1**<br> *in* y_zero_point:**T2**<br> *out* y:**T2**|1+|**T1** = tensor(float)<br/> **T2** = tensor(int16), tensor(int4), tensor(int8), tensor(uint16), tensor(uint4), tensor(uint8)|
 |QuickGelu|*in* X:**T**<br> *out* Y:**T**|1+|**T** = tensor(float)|
 |Range|*in* start:**T**<br> *in* limit:**T**<br> *in* delta:**T**<br> *out* Y:**T**|1+|**T** = tensor(double), tensor(float), tensor(int16), tensor(int32), tensor(int64)|
@@ -999,7 +999,7 @@ Do not modify directly.*
 |PackedMultiHeadAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* bias:**T**<br> *in* token_offset:**M**<br> *in* cumulative_sequence_length:**M**<br> *in* attention_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(float), tensor(float16)|
 |PagedAttention|*in* query:**T**<br> *in* key:**T**<br> *in* value:**T**<br> *in* key_cache:**T**<br> *in* value_cache:**T**<br> *in* cumulative_sequence_length:**S**<br> *in* past_seqlens:**S**<br> *in* block_table:**S**<br> *in* cos_cache:**T**<br> *in* sin_cache:**T**<br> *out* output:**T**<br> *out* key_cache_out:**T**<br> *out* value_cache_out:**T**|1+|**S** = tensor(int32)<br/> **T** = tensor(bfloat16), tensor(float16)|
 |QAttention|*in* input:**T1**<br> *in* weight:**T2**<br> *in* bias:**T3**<br> *in* input_scale:**T3**<br> *in* weight_scale:**T3**<br> *in* mask_index:**T4**<br> *in* input_zero_point:**T1**<br> *in* weight_zero_point:**T2**<br> *in* past:**T3**<br> *out* output:**T3**<br> *out* present:**T3**|1+|**T1** = tensor(int8)<br/> **T2** = tensor(int8)<br/> **T3** = tensor(float), tensor(float16)<br/> **T4** = tensor(int32)|
-|QMoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T1**<br> *in* fc1_scales:**T2**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_weights:**T1**<br> *in* fc2_scales:**T2**<br> *in* fc2_experts_bias:**T**<br> *in* fc3_experts_weights:**T1**<br> *in* fc3_scales:**T2**<br> *in* fc3_experts_bias:**T**<br> *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(float16)<br/> **T1** = tensor(uint8)<br/> **T2** = tensor(bfloat16), tensor(float16)|
+|QMoE|*in* input:**T**<br> *in* router_probs:**T**<br> *in* fc1_experts_weights:**T1**<br> *in* fc1_scales:**T2**<br> *in* fc1_experts_bias:**T**<br> *in* fc2_experts_weights:**T1**<br> *in* fc2_scales:**T2**<br> *in* fc2_experts_bias:**T**<br> *in* fc3_experts_weights:**T1**<br> *in* fc3_scales:**T2**<br> *in* fc3_experts_bias:**T**<br> *in* fc1_zero_points:**T1**<br> *in* fc2_zero_points:**T1**<br> *in* fc3_zero_points:**T1**<br> *out* output:**T**|1+|**T** = tensor(bfloat16), tensor(float16)<br/> **T1** = tensor(uint8)<br/> **T2** = tensor(bfloat16), tensor(float16)|
 |QOrderedAttention|*in* input:**Q**<br> *in* scale_input:**S**<br> *in* scale_Q_gemm:**S**<br> *in* scale_K_gemm:**S**<br> *in* scale_V_gemm:**S**<br> *in* Q_weight:**Q**<br> *in* K_weight:**Q**<br> *in* V_weight:**Q**<br> *in* scale_Q_weight:**S**<br> *in* scale_K_weight:**S**<br> *in* scale_V_weight:**S**<br> *in* Q_bias:**S**<br> *in* K_bias:**S**<br> *in* V_bias:**S**<br> *in* scale_QKT_gemm:**S**<br> *in* scale_QKT_softmax:**S**<br> *in* scale_values_gemm:**S**<br> *in* mask_index:**G**<br> *in* past:**Q**<br> *in* attention_bias:**S**<br> *out* output:**Q**|1+|**G** = tensor(int32)<br/> **Q** = tensor(int8)<br/> **S** = tensor(float)|
 |QOrderedGelu|*in* X:**Q**<br> *in* scale_X:**S**<br> *in* scale_Y:**S**<br> *out* Y:**Q**|1+|**Q** = tensor(int8)<br/> **S** = tensor(float)|
 |QOrderedLayerNormalization|*in* X:**Q**<br> *in* scale_X:**S**<br> *in* scale:**F**<br> *in* B:**F**<br> *in* scale_Y:**S**<br> *out* Y:**Q**|1+|**F** = tensor(float), tensor(float16)<br/> **Q** = tensor(int8)<br/> **S** = tensor(float)|
 
@@ -945,6 +945,7 @@ WARN_LOGFILE           =
 
 INPUT                  = ../../include/onnxruntime/core/session/onnxruntime_c_api.h \
                          ../../include/onnxruntime/core/session/onnxruntime_cxx_api.h \
+                         ../../include/onnxruntime/core/session/onnxruntime_ep_c_api.h \
                          ../../orttraining/orttraining/training_api/include/onnxruntime_training_c_api.h \
                          ../../orttraining/orttraining/training_api/include/onnxruntime_training_cxx_api.h