From 1f2ae46eb877e4591d977df8f20f0edd79d13194 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Mon, 28 Jul 2025 20:13:42 +0200
Subject: [PATCH 01/24] Add ROCm backend

---
 cpp/CMakeLists.txt             |   98 +-
 cpp/main.cpp                   |    9 +
 cpp/neuralnet/rocmbackend.cpp  | 3077 ++++++++++++++++++++++++++++++++
 cpp/neuralnet/rocmerrorcheck.h |   59 +
 cpp/neuralnet/rocmhelpers.h    |   60 +
 cpp/neuralnet/rocmhelpers.hip  | 1905 ++++++++++++++++++++
 cpp/neuralnet/rocmincludes.h   |   15 +
 cpp/neuralnet/rocmutils.cpp    |  170 ++
 cpp/neuralnet/rocmutils.h      |   21 +
 9 files changed, 5413 insertions(+), 1 deletion(-)
 create mode 100644 cpp/neuralnet/rocmbackend.cpp
 create mode 100644 cpp/neuralnet/rocmerrorcheck.h
 create mode 100644 cpp/neuralnet/rocmhelpers.h
 create mode 100644 cpp/neuralnet/rocmhelpers.hip
 create mode 100644 cpp/neuralnet/rocmincludes.h
 create mode 100644 cpp/neuralnet/rocmutils.cpp
 create mode 100644 cpp/neuralnet/rocmutils.h

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2b6da407f..e12b7e41b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -32,7 +32,8 @@ endif()
 set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training")
 set(USE_BACKEND CACHE STRING "Neural net backend")
 string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
-set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN)
+# set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN)
+set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN ROCM)
 
 set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
 set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe")
@@ -139,6 +140,42 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
   set(NEURALNET_BACKEND_SOURCES
     neuralnet/eigenbackend.cpp
     )
+# --------------------------- ROCM 后端（AMD GPU / HIP  MIOpen） ---------------------------
+elseif(USE_BACKEND STREQUAL "ROCM")
+  message(STATUS "-DUSE_BACKEND=ROCM, using AMD ROCm backend.")
+
+  # 1) 启用 HIP 语言（.hip / .cpp 均可）并指定 C++17
+  enable_language(HIP)
+  set(CMAKE_HIP_STANDARD 17)
+
+  if(CMAKE_PREFIX_PATH STREQUAL "" OR NOT DEFINED CMAKE_PREFIX_PATH)
+    if(DEFINED ENV{HIP_PATH})
+      # Windows HIP‑SDK 或自定义安装
+      list(APPEND CMAKE_PREFIX_PATH $ENV{HIP_PATH})
+      message(STATUS "Auto‑detected HIP_PATH=$ENV{HIP_PATH} → CMAKE_PREFIX_PATH")
+    elseif(EXISTS "/opt/rocm")
+      # Linux 默认路径
+      list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
+      message(STATUS "CMAKE_PREFIX_PATH not given; defaulting to /opt/rocm")
+    endif()
+  endif()
+
+  # 可让用户用 -DCMAKE_HIP_ARCHITECTURES=gfx90a;gfx942 手动指定 GFX 架构
+  if(NOT DEFINED CMAKE_HIP_ARCHITECTURES)
+    # 默认同时编译常见 MI200 / RDNA3 卡，可按需精简
+    set(CMAKE_HIP_ARCHITECTURES 90a 942 908 1100 1101 1200 1201 CACHE STRING "AMD GPU targets")
+  endif()
+
+  # 2) 指定后端源码。rocmhelpers.hip 里是 GPU‑kernel，别漏了
+  set(NEURALNET_BACKEND_SOURCES
+    neuralnet/rocmbackend.cpp
+    neuralnet/rocmutils.cpp
+    neuralnet/rocmhelpers.hip
+  )
+
+  # 可选：启用 model-size‑based autotuning等额外宏
+  # add_compile_definitions(HIP_SUPPORTS_FP16)
+
 elseif(USE_BACKEND STREQUAL "")
   message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}")
   set(NEURALNET_BACKEND_SOURCES neuralnet/dummybackend.cpp)
@@ -418,6 +455,65 @@ elseif(USE_BACKEND STREQUAL "OPENCL")
     link_directories(${OpenCL_LIBRARY})
     target_link_libraries(katago ${OpenCL_LIBRARY})
   endif()
+# --------------------------- ROCM 链接阶段 ---------------------------
+elseif(USE_BACKEND STREQUAL "ROCM")
+  # 宏：源代码里用 #ifdef USE_ROCM_BACKEND 判断
+  target_compile_definitions(katago PRIVATE USE_ROCM_BACKEND)
+  target_compile_definitions(katago PRIVATE HIP_TARGET_VERSION=${CMAKE_HIP_COMPILER_VERSION})
+
+  string(TOLOWER "${CMAKE_HIP_ARCHITECTURES}" _gfxlist)  # e.g. "90a;942"
+  if(_gfxlist MATCHES "803|900|90a|94[0-9]|110[0-9]|120[0-9]")
+    target_compile_definitions(katago PRIVATE HIP_SUPPORTS_FP16)
+    message(STATUS "Detected FP16‑capable GFX arch (${CMAKE_HIP_ARCHITECTURES}); defining HIP_SUPPORTS_FP16")
+  endif()
+
+  # 3) 找到 ROCm 运行时 & 库。自 ROCm 6.x 起都带 CMake config‑mode 包
+  #    如若找不到，加 -DCMAKE_PREFIX_PATH=/opt/rocm
+  find_package(hip        QUIET CONFIG)   # 导出 hip::device / hip::host
+  find_package(hipblas    QUIET CONFIG)   # 导出 roc::hipblas
+  find_package(miopen     QUIET CONFIG)   # 导出 roc::miopen
+  # ---------- fallback：HIP 运行时 ----------
+  if(NOT hip_FOUND)
+    find_path(HIP_INCLUDE_DIR hip/hip_runtime.h
+              HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
+              PATH_SUFFIXES include)
+    find_library(HIP_RUNTIME_LIB amdhip64
+                 HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
+                 PATH_SUFFIXES lib lib64)
+    if(NOT HIP_INCLUDE_DIR OR NOT HIP_RUNTIME_LIB)
+      message(FATAL_ERROR "HIP headers or runtime NOT found; install ROCm or set CMAKE_PREFIX_PATH.")
+    endif()
+    add_library(hip::device UNKNOWN IMPORTED)
+    set_target_properties(hip::device PROPERTIES
+      IMPORTED_LOCATION "${HIP_RUNTIME_LIB}"
+      INTERFACE_INCLUDE_DIRECTORIES "${HIP_INCLUDE_DIR}")
+    target_include_directories(katago SYSTEM PRIVATE ${HIP_INCLUDE_DIR})
+  endif()
+
+  # ---------- fallback：hipBLAS / MIOpen ----------
+  foreach(_pkg hipblas miopen)
+    if(NOT ${_pkg}_FOUND)
+      find_library(${_pkg}_LIB ${_pkg}
+                   HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
+                   PATH_SUFFIXES lib lib64)
+      if(${_pkg}_LIB)
+        add_library(roc::${_pkg} UNKNOWN IMPORTED)
+        set_target_properties(roc::${_pkg} PROPERTIES
+          IMPORTED_LOCATION "${${_pkg}_LIB}")
+        target_include_directories(katago SYSTEM PRIVATE ${HIP_INCLUDE_DIR})
+      else()
+        message(FATAL_ERROR "Required ROCm component ${_pkg} not found – install it or set CMAKE_PREFIX_PATH.")
+      endif()
+    endif()
+  endforeach()
+
+  # 4) 头文件路径已由 config‑mode target 解决，无需硬编码
+  target_link_libraries(katago
+    hip::device          # HIP runtime & kernel offload
+    roc::hipblas         # BLAS
+    MIOpen
+    roc::miopen          # DNN primitives
+  )
 elseif(USE_BACKEND STREQUAL "EIGEN")
   target_compile_definitions(katago PRIVATE USE_EIGEN_BACKEND)
   if(NOT (MSVC))
diff --git a/cpp/main.cpp b/cpp/main.cpp
index f86a44a27..24259f984 100644
--- a/cpp/main.cpp
+++ b/cpp/main.cpp
@@ -239,6 +239,13 @@ string Version::getKataGoVersionFullInfo() {
   out << "Using Metal backend" << endl;
 #elif defined(USE_OPENCL_BACKEND)
   out << "Using OpenCL backend" << endl;
+#elif defined(USE_ROCM_BACKEND)
+  out << "Using ROCm backend" << endl;
+#if defined(HIP_TARGET_VERSION)
+#define STRINGIFY(x) #x
+#define STRINGIFY2(x) STRINGIFY(x)
+  out << "Compiled with HIP runtime version " << STRINGIFY2(HIP_TARGET_VERSION) << endl;
+#endif
 #elif defined(USE_EIGEN_BACKEND)
   out << "Using Eigen(CPU) backend" << endl;
 #else
@@ -271,6 +278,8 @@ string Version::getGitRevisionWithBackend() {
   s += "-cuda";
 #elif defined(USE_TENSORRT_BACKEND)
   s += "-trt";
+#elif defined(USE_ROCM_BACKEND)
+  s += "-rocm";
 #elif defined(USE_METAL_BACKEND)
   s += "-metal";
 #elif defined(USE_OPENCL_BACKEND)
diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
new file mode 100644
index 000000000..11489e85a
--- /dev/null
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -0,0 +1,3077 @@
+#include "hip/hip_runtime.h"
+// #ifdef USE_ROCM_BACKEND
+#include <map>
+#include <string>
+#include <vector>
+#include <cassert>
+
+#include "../neuralnet/rocmerrorcheck.h"
+#include "../neuralnet/rocmincludes.h"
+#include "../neuralnet/rocmhelpers.h"
+#include "../neuralnet/rocmutils.h"
+
+#include "../neuralnet/modelversion.h"
+#include "../neuralnet/nninterface.h"
+#include "../neuralnet/nninputs.h"
+#include "../neuralnet/sgfmetadata.h"
+#include "../neuralnet/nneval.h"
+#include "../neuralnet/desc.h"
+
+#include "../core/simpleallocator.h"
+#include "../core/test.h"
+
+#include "../external/half-2.2.0/include/half.hpp"
+
+//------------------------
+#include "../core/using.h"
+//------------------------
+
+using half_t = half_float::half;
+
+//Define this to print out some of the intermediate values of the neural net
+//#define DEBUG_INTERMEDIATE_VALUES
+
+void NeuralNet::globalInitialize() {
+  //Empty for cudnn backend
+}
+
+void NeuralNet::globalCleanup() {
+  hipDeviceReset();
+}
+
+struct CudaHandles {
+  hipblasHandle_t cublas;
+  miopenStatus_t cudnn;
+  const int majorComputeCapability;
+  const int minorComputeCapability;
+
+  CudaHandles(int major, int minor)
+    : majorComputeCapability(major),
+      minorComputeCapability(minor)
+  {
+    CUBLAS_ERR("CudaHandles",hipblasCreate(&cublas));
+    CUDNN_ERR("CudaHandles",miopenCreate(&cudnn));
+  }
+
+  ~CudaHandles() {
+    hipblasDestroy(cublas);
+    miopenDestroy(cudnn);
+  }
+
+  static CudaHandles* cudaHandlesTesting() {
+    const int gpuIdxForThisThread = 0;
+    hipDeviceProp_t prop;
+    hipGetDeviceProperties(&prop,gpuIdxForThisThread);
+    return new CudaHandles(prop.major, prop.minor);
+  }
+
+  CudaHandles(const CudaHandles&) = delete;
+  CudaHandles& operator=(const CudaHandles&) = delete;
+};
+
+//---------------------------------------------------------------------------------
+
+template<typename T>
+struct ByBatchSize {
+  const int maxBatchSize;
+  T* data;
+  miopenStatus_t (*destroyFunc)(T);
+
+  ByBatchSize()
+    : maxBatchSize(0), data(nullptr), destroyFunc(nullptr)
+  {}
+
+  ByBatchSize(
+    int maxBatchSize_
+  ) : maxBatchSize(maxBatchSize_), data(nullptr), destroyFunc(nullptr) {
+    data = new T[maxBatchSize];
+  }
+
+  ByBatchSize(const ByBatchSize&) = delete;
+  ByBatchSize& operator=(const ByBatchSize&) = delete;
+
+  ~ByBatchSize() {
+    if(destroyFunc != nullptr && data != nullptr) {
+      for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
+        (*destroyFunc)(data[batchSize-1]);
+      }
+    }
+    if(data != nullptr) {
+      delete[] data;
+      data = nullptr;
+    }
+  }
+  T& operator[](int batchSize) {
+    return data[batchSize-1];
+  }
+  const T& operator[](int batchSize) const {
+    return data[batchSize-1];
+  }
+};
+
+template<typename T>
+struct ByBatchSizeView {
+  int maxBatchSize;
+  T* data;
+
+  ByBatchSizeView()
+    : maxBatchSize(0), data(nullptr)
+  {}
+
+  ByBatchSizeView(const ByBatchSize<T>& toView)
+    : maxBatchSize(toView.maxBatchSize), data(toView.data)
+  {}
+  ByBatchSizeView& operator=(const ByBatchSize<T>& toView) {
+    maxBatchSize = toView.maxBatchSize;
+    data = toView.data;
+  }
+
+  ~ByBatchSizeView() {
+  }
+  T& operator[](int batchSize) {
+    return data[batchSize-1];
+  }
+  const T& operator[](int batchSize) const {
+    return data[batchSize-1];
+  }
+};
+
+//---------------------------------------------------------------------------------
+
+
+//channels, useFP16, useNHWC
+typedef std::tuple<int, bool, bool> CudnnTensorDesc4DKey;
+
+struct CudnnTensorDesc4DKey {
+  int channels;
+  bool useFP16;
+  bool useNHWC;
+  bool operator<(const CudnnTensorDesc4DKey& other) const {
+    return std::tie(channels, useFP16, useNHWC) <
+           std::tie(other.channels, other.useFP16, other.useNHWC);
+  }
+};
+
+template <typename T>
+struct ByBatchSize {
+  explicit ByBatchSize(int max)
+      : data(max + 1), destroyFunc(nullptr) {}
+  ~ByBatchSize() {
+    if (destroyFunc) {
+      for (auto& d : data) {
+        if (d) destroyFunc(d);
+      }
+    }
+  }
+  T& operator[](int idx) { return data[idx]; }
+  std::vector<T> data;
+  miopenStatus_t (*destroyFunc)(T) = nullptr;
+};
+
+template <typename T>
+struct ByBatchSizeView {
+  explicit ByBatchSizeView(ByBatchSize<T>& ref) : ref(ref) {}
+  T& operator[](int idx) { return ref[idx]; }
+  ByBatchSize<T>& ref;
+};
+
+// -----------------------------------------------------------------------------
+//                                CudnnManager
+// -----------------------------------------------------------------------------
+struct CudnnManager {
+  const std::string name;
+  const int maxBatchSize;
+  const int nnXLen;
+  const int nnYLen;
+  std::map<CudnnTensorDesc4DKey, ByBatchSize<miopenTensorDescriptor_t>*>
+      tensorDesc4DByBatchSizeByKey;
+
+  CudnnManager(std::string name_, int maxBatchSize_, int nnXLen_, int nnYLen_)
+      : name(std::move(name_)),
+        maxBatchSize(maxBatchSize_),
+        nnXLen(nnXLen_),
+        nnYLen(nnYLen_),
+        tensorDesc4DByBatchSizeByKey() {}
+
+  ~CudnnManager() {
+    for (auto& iter : tensorDesc4DByBatchSizeByKey) {
+      delete iter.second;
+    }
+  }
+
+  ByBatchSizeView<miopenTensorDescriptor_t> getTensorDesc4DByBatchSize(
+      int channels, bool useFP16, bool useNHWC) {
+    auto iter = tensorDesc4DByBatchSizeByKey.find({channels, useFP16, useNHWC});
+    if (iter != tensorDesc4DByBatchSizeByKey.end()) {
+      return ByBatchSizeView<miopenTensorDescriptor_t>(*(iter->second));
+    }
+
+    auto* descs = new ByBatchSize<miopenTensorDescriptor_t>(maxBatchSize);
+
+    for (int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
+      miopenTensorDescriptor_t& desc = (*descs)[batchSize];
+      // Create descriptor
+      CUDNN_ERR(name.c_str(), miopenCreateTensorDescriptor(&desc));
+
+      const miopenDataType_t dtype = useFP16 ? miopenHalf : miopenFloat;
+
+      if (!useNHWC) {
+        // Fully‑supported NCHW fast‑path
+        CUDNN_ERR(name.c_str(),
+                  miopenSet4dTensorDescriptor(desc, dtype, batchSize, channels,
+                                              nnYLen, nnXLen));
+      } else {
+        // NHWC path via generic Nd descriptor + explicit strides
+        int dims[4] = {batchSize, nnYLen, nnXLen, channels};  // N H W C
+        int strides[4];
+        strides[3] = 1;                             // C stride
+        strides[2] = strides[3] * channels;         // W stride
+        strides[1] = strides[2] * nnXLen;           // H stride
+        strides[0] = strides[1] * nnYLen;           // N stride
+
+        CUDNN_ERR(name.c_str(),
+                  miopenSetTensorDescriptor(desc, dtype, 4, dims, strides));
+      }
+    }
+
+    descs->destroyFunc = miopenDestroyTensorDescriptor;
+    tensorDesc4DByBatchSizeByKey[{channels, useFP16, useNHWC}] = descs;
+    return ByBatchSizeView<miopenTensorDescriptor_t>(*descs);
+  }
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct ScratchBuffers {
+
+  const size_t batchXYFloatBytes;
+  const size_t batchFloatBytes;
+  const size_t batchXYBytes;
+  const size_t batchBytes;
+
+  SimpleAllocator<void*>* allocator;
+
+  // Not scratch, but convenient to have here
+  void* zeroBuf;
+  void* oneBuf;
+
+  ScratchBuffers() = delete;
+  ScratchBuffers(const ScratchBuffers&) = delete;
+  ScratchBuffers& operator=(const ScratchBuffers&) = delete;
+
+  ScratchBuffers(int maxBatchSize, int nnXLen, int nnYLen, bool useFP16)
+    : batchXYFloatBytes((size_t)maxBatchSize * nnXLen * nnYLen * sizeof(float)),
+      batchFloatBytes((size_t)maxBatchSize * sizeof(float)),
+      batchXYBytes((size_t)maxBatchSize * nnXLen * nnYLen * (useFP16 ? sizeof(half_t) : sizeof(float))),
+      batchBytes((size_t)maxBatchSize * (useFP16 ? sizeof(half_t) : sizeof(float)))
+  {
+    std::function<void*(size_t)> allocateFunc = [](size_t size) {
+      void* buf;
+      CUDA_ERR("ScratchBuffers",hipMalloc(&buf, size));
+      return buf;
+    };
+    std::function<void(void*)> releaseFunc = [](void* buf) {
+      hipFree(buf);
+    };
+
+    allocator = new SimpleAllocator<void*>(allocateFunc, releaseFunc);
+
+    CudaUtils::hostMallocZeroOneBufs(zeroBuf, oneBuf, useFP16);
+  }
+  ~ScratchBuffers() {
+    delete allocator;
+    free(zeroBuf);
+    free(oneBuf);
+  }
+
+  size_t getBufSizeXY(int channels) const {
+    return channels * batchXYBytes;
+  }
+  size_t getBufSizeXYFloat(int channels) const {
+    return channels * batchXYFloatBytes;
+  }
+  size_t getBufSizeFloat(int channels) const {
+    return channels * batchFloatBytes;
+  }
+  size_t getBufSize(int channels) const {
+    return channels * batchBytes;
+  }
+
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct ConvLayer {
+  const string name;
+  const int inChannels;
+  const int outChannels;
+  ByBatchSizeView<miopenTensorDescriptor_t> inputDescriptors;
+  ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
+  miopenTensorDescriptor_t filterDescriptor;
+  miopenConvolutionDescriptor_t convolutionDescriptor;
+  ByBatchSize<miopenConvFwdAlgorithm_t >* convolutionAlgorithms; //array of one for each batch size
+  void* filterBuf;
+
+  ConvLayer() = delete;
+  ConvLayer(const ConvLayer&) = delete;
+  ConvLayer& operator=(const ConvLayer&) = delete;
+
+  ConvLayer(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const ConvLayerDesc* desc,
+    bool useFP16,
+    bool useNHWC
+  ) : ConvLayer(cudaHandles, manager, desc, useFP16, useNHWC, useNHWC)
+  {}
+
+  ConvLayer(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const ConvLayerDesc* desc,
+    bool useFP16,
+    bool useNHWCIn,
+    bool useNHWCOut
+  ) :
+    name(desc->name),
+    inChannels(desc->inChannels),
+    outChannels(desc->outChannels)
+  {
+    int convYSize = desc->convYSize;
+    int convXSize = desc->convXSize;
+    int dilationY = desc->dilationY;
+    int dilationX = desc->dilationX;
+    int paddingX = (convXSize / 2) * dilationX;
+    int paddingY = (convYSize / 2) * dilationY;
+
+    assert(convXSize % 2 == 1);
+    assert(convYSize % 2 == 1);
+
+    inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
+    outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
+    int maxBatchSize = manager->maxBatchSize;
+
+    bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
+
+    CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&filterDescriptor));
+    CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
+      filterDescriptor,
+      (useFP16 ? miopenHalf : miopenFloat),
+      outChannels,
+      inChannels,
+      convYSize,
+      convXSize
+    ));
+
+    int yStride = 1;
+    int xStride = 1;
+
+    bool tensorCoresSupported = true;
+
+    CUDNN_ERR(name.c_str(),miopenCreateConvolutionDescriptor(&convolutionDescriptor));
+    CUDNN_ERR(name.c_str(),miopenInitConvolutionDescriptor(
+      convolutionDescriptor,
+      miopenConvolution,
+      paddingY,
+      paddingX,
+      yStride,
+      xStride,
+      dilationY,
+      dilationX
+    ));
+    if(useFP16) {
+      int alt = 1; // non‑zero enables alt‑impl on MI2xx+ GPUs
+      miopenSetConvolutionAttribute(convolutionDescriptor,
+                                    MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,
+                                    alt);
+    }
+
+    convolutionAlgorithms = new ByBatchSize<miopenConvFwdAlgorithm_t >(maxBatchSize);
+
+        for(int batchSize = 1; batchSize <= maxBatchSize; ++batchSize) {
+      if(useFP16 && dilationX <= 1 && dilationY <= 1) {
+        (*convolutionAlgorithms)[batchSize] = miopenConvolutionFwdAlgoImplicitGEMM;
+      }
+      else {
+        (*convolutionAlgorithms)[batchSize] = miopenConvolutionFwdAlgoDirect;
+        // If desired, call miopenFindConvolutionForwardAlgorithm() here once you
+        // have real device buffers to auto‑tune. See porting notes.
+      }
+    }
+
+    assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
+
+    if(filterNHWC) {
+      vector<float> weightsTransposed(desc->weights.size());
+      for(int y = 0; y < convYSize; y++) {
+        for(int x = 0; x < convXSize; x++) {
+          for(int ic = 0; ic < inChannels; ic++) {
+            for(int oc = 0; oc < outChannels; oc++) {
+              weightsTransposed[((oc*convYSize + y)*convXSize + x)*inChannels + ic] =
+                desc->weights[((oc*inChannels + ic)*convYSize + y)*convXSize + x];
+            }
+          }
+        }
+      }
+      CudaUtils::mallocAndCopyToDevice(name,weightsTransposed,filterBuf,useFP16);
+      hipDeviceSynchronize();
+    }
+    else
+      CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+  }
+
+  ~ConvLayer() {
+    hipFree(filterBuf);
+    miopenDestroyTensorDescriptor(filterDescriptor);
+    miopenDestroyConvolutionDescriptor(convolutionDescriptor);
+    delete convolutionAlgorithms;
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t workspaceBytes = 0;
+    CUDNN_ERR(name.c_str(), miopenConvolutionForwardGetWorkSpaceSize(
+                              cudaHandles->cudnn,
+                              filterDescriptor,
+                              inputDescriptors[batchSize],
+                              convolutionDescriptor,
+                              outputDescriptors[batchSize],
+                              &workspaceBytes));
+    return workspaceBytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    int          batchSize,
+    bool         accumulate,        // if true, beta = 1 (unsupported by MIOpen fwd)
+    void*        inputBuf,
+    void*        outputBuf,
+    void*        workspaceBuf,
+    size_t       workspaceBytes) const
+{
+  const float alpha = 1.0f;
+  const float beta  = accumulate ? 1.0f : 0.0f;
+
+  // New MIOpen API order: ... algo, beta, yDesc, y, workSpace, workSpaceSize
+  CUDNN_ERR(name.c_str(), miopenConvolutionForward(
+                cudaHandles->cudnn,
+                &alpha,
+                inputDescriptors[batchSize],
+                inputBuf,
+                filterDescriptor,
+                filterBuf,
+                convolutionDescriptor,
+                (*convolutionAlgorithms)[batchSize],
+                &beta,
+                outputDescriptors[batchSize],
+                outputBuf,
+                workspaceBuf,
+                workspaceBytes));
+  }
+
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct BatchNormLayer {
+  const string name;
+  const int numChannels;
+  const float epsilon;
+  const int activation;
+  const int nnXLen;
+  const int nnYLen;
+
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  void* mergedScaleBuf;
+  void* mergedBiasBuf;
+
+  BatchNormLayer() = delete;
+  BatchNormLayer(const BatchNormLayer&) = delete;
+  BatchNormLayer& operator=(const BatchNormLayer&) = delete;
+
+  BatchNormLayer(
+    CudaHandles* cudaHandles,
+    const BatchNormLayerDesc* desc,
+    const ActivationLayerDesc* actDesc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    numChannels(desc->numChannels),
+    epsilon(desc->epsilon),
+    activation(actDesc->activation),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC)
+  {
+    (void)cudaHandles;
+
+    assert(desc->mean.size() == numChannels);
+    assert(desc->variance.size() == numChannels);
+    assert(desc->scale.size() == numChannels);
+    assert(desc->bias.size() == numChannels);
+    assert(desc->mergedScale.size() == numChannels);
+    assert(desc->mergedBias.size() == numChannels);
+    CudaUtils::mallocAndCopyToDevice(name,desc->mergedScale,mergedScaleBuf,useFP16);
+    CudaUtils::mallocAndCopyToDevice(name,desc->mergedBias,mergedBiasBuf,useFP16);
+  }
+  ~BatchNormLayer() {
+    hipFree(mergedScaleBuf);
+    hipFree(mergedBiasBuf);
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    int batchSize,
+    void* inputBuf,
+    const void* maskBuf, //ok to be null
+    void* outputBuf
+  ) const {
+    (void)cudaHandles;
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaApplyCScaleBiasNCHW((const float*)inputBuf,(float*)outputBuf,(const float*)mergedScaleBuf,(const float*)mergedBiasBuf,
+                                      (const float*)maskBuf,
+                                      batchSize,numChannels,nnXLen*nnYLen,activation);
+      else
+        customCudaApplyCScaleBiasNHWC((const float*)inputBuf,(float*)outputBuf,(const float*)mergedScaleBuf,(const float*)mergedBiasBuf,
+                                      (const float*)maskBuf,
+                                      batchSize,nnXLen*nnYLen,numChannels,activation);
+    }
+    else {
+      if(!usingNHWC)
+        customCudaApplyCScaleBiasNCHW((const half*)inputBuf,(half*)outputBuf,(const half*)mergedScaleBuf,(const half*)mergedBiasBuf,
+                                      (const half*)maskBuf,
+                                      batchSize,numChannels,nnXLen*nnYLen,activation);
+      else
+        customCudaApplyCScaleBiasNHWC((const half*)inputBuf,(half*)outputBuf,(const half*)mergedScaleBuf,(const half*)mergedBiasBuf,
+                                      (const half*)maskBuf,
+                                      batchSize,nnXLen*nnYLen,numChannels,activation);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+
+  }
+
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct MatMulLayer {
+  const string name;
+  const int inChannels;
+  const int outChannels;
+  const bool usingFP16;
+  void* matBuf;
+
+  MatMulLayer() = delete;
+  MatMulLayer(const MatMulLayer&) = delete;
+  MatMulLayer& operator=(const MatMulLayer&) = delete;
+
+  MatMulLayer(
+    CudaHandles* cudaHandles,
+    const MatMulLayerDesc* desc,
+    bool useFP16
+  ) :
+    name(desc->name),
+    inChannels(desc->inChannels),
+    outChannels(desc->outChannels),
+    usingFP16(useFP16)
+  {
+    (void)cudaHandles;
+
+    if(inChannels > 0 && outChannels > 0) {
+      assert(desc->weights.size() == inChannels * outChannels);
+      CudaUtils::mallocAndCopyToDevice(name,desc->weights,matBuf,useFP16);
+    }
+    else {
+      matBuf = NULL;
+    }
+  }
+
+  ~MatMulLayer() {
+    if(inChannels > 0 && outChannels > 0)
+      hipFree(matBuf);
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles
+  ) const {
+    (void)cudaHandles;
+    size_t workspaceBytes = 0;
+    return workspaceBytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* inputBuf,
+    void* outputBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    (void)workspaceBuf;
+    (void)workspaceBytes;
+    assert(inChannels > 0 && outChannels > 0);
+
+    if(!usingFP16) {
+      const float alpha = 1.0f;
+      const float beta = 0.0f;
+      CUBLAS_ERR(name.c_str(),hipblasSgemm(
+        cudaHandles->cublas,
+        HIPBLAS_OP_N,
+        HIPBLAS_OP_N,
+        outChannels,
+        batchSize,
+        inChannels,
+        &alpha,
+        (const float*)matBuf,outChannels,
+        (const float*)inputBuf,inChannels,
+        &beta,
+        (float*)outputBuf,outChannels
+      ));
+    }
+    else {
+      const half* alpha = (const half*)scratch->oneBuf;
+      const half* beta = (const half*)scratch->zeroBuf;
+      CUBLAS_ERR(name.c_str(),hipblasHgemm(
+        cudaHandles->cublas,
+        HIPBLAS_OP_N,
+        HIPBLAS_OP_N,
+        outChannels,
+        batchSize,
+        inChannels,
+        alpha,
+        (const half*)matBuf,outChannels,
+        (const half*)inputBuf,inChannels,
+        beta,
+        (half*)outputBuf,outChannels
+      ));
+    }
+
+  }
+
+};
+
+//---------------------------------------------------------------------------------
+
+struct MatBiasLayer {
+  const string name;
+  const int numChannels;
+  const bool usingFP16;
+  const int activation;
+
+  void* biasBuf;
+
+  MatBiasLayer() = delete;
+  MatBiasLayer(const MatBiasLayer&) = delete;
+  MatBiasLayer& operator=(const MatBiasLayer&) = delete;
+
+  MatBiasLayer(
+    CudaHandles* cudaHandles,
+    const MatBiasLayerDesc* desc,
+    bool useFP16,
+    int activation_
+  ) :
+    name(desc->name),
+    numChannels(desc->numChannels),
+    usingFP16(useFP16),
+    activation(activation_)
+  {
+    (void)cudaHandles;
+    if(numChannels > 0) {
+      assert(desc->weights.size() == numChannels);
+      CudaUtils::mallocAndCopyToDevice(name,desc->weights,biasBuf,useFP16);
+    }
+    else
+      biasBuf = NULL;
+  }
+
+  ~MatBiasLayer() {
+    if(numChannels > 0)
+      hipFree(biasBuf);
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    int batchSize,
+    void* matBuf
+  ) const {
+    (void)cudaHandles;
+    assert(numChannels > 0);
+    if(!usingFP16) {
+      customCudaAddCBiasInplaceNC((float*)matBuf,(const float*)biasBuf,batchSize,numChannels,activation);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+    else {
+      customCudaAddCBiasInplaceNC((half*)matBuf,(const half*)biasBuf,batchSize,numChannels,activation);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+  }
+
+};
+
+//---------------------------------------------------------------------------------
+
+struct NormActConv {
+  const BatchNormLayer norm;
+  const ConvLayer conv;
+
+  const int inChannels;
+  const int outChannels;
+  const int nnXLen;
+  const int nnYLen;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  NormActConv() = delete;
+  NormActConv(const NormActConv&) = delete;
+  NormActConv& operator=(const NormActConv&) = delete;
+
+  NormActConv(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const BatchNormLayerDesc* normDesc,
+    const ActivationLayerDesc* actDesc,
+    const ConvLayerDesc* convDesc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ): norm(cudaHandles,normDesc,actDesc,nnX,nnY,useFP16,useNHWC),
+     conv(cudaHandles,manager,convDesc,useFP16,useNHWC),
+     inChannels(norm.numChannels),
+     outChannels(conv.outChannels),
+     nnXLen(nnX),
+     nnYLen(nnY),
+     usingFP16(useFP16),
+     usingNHWC(useNHWC)
+  {
+    assert(norm.numChannels == conv.inChannels);
+  }
+
+  ~NormActConv()
+  {}
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+    b = conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    int batchSize,
+    bool accumulate,
+    void* inBuf,
+    void* inScratchBuf,
+    void* outBuf,
+    void* maskBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    norm.apply(cudaHandles,batchSize,inBuf,maskBuf,inScratchBuf);
+#ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("AFTER NORM "), inScratchBuf, batchSize, inChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
+#endif
+    conv.apply(cudaHandles,batchSize,accumulate,inScratchBuf,outBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct ResidualBlock {
+  const string name;
+  const NormActConv normActConv1;
+  const NormActConv normActConv2;
+
+  ResidualBlock() = delete;
+  ResidualBlock(const ResidualBlock&) = delete;
+  ResidualBlock& operator=(const ResidualBlock&) = delete;
+
+  ResidualBlock(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const ResidualBlockDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ): name(desc->name),
+     normActConv1(cudaHandles,manager,&desc->preBN,&desc->preActivation,&desc->regularConv,nnX,nnY,useFP16,useNHWC),
+     normActConv2(cudaHandles,manager,&desc->midBN,&desc->midActivation,&desc->finalConv,nnX,nnY,useFP16,useNHWC)
+  {
+  }
+
+  ~ResidualBlock()
+  {}
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+    b = normActConv1.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* trunkBuf,
+    void* trunkScratchBuf,
+    void* maskBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> midIn(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
+    SizedBuf<void*> midScratch(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
+    normActConv1.apply(cudaHandles,batchSize,false,trunkBuf,trunkScratchBuf,midIn.buf,maskBuf,workspaceBuf,workspaceBytes);
+    normActConv2.apply(cudaHandles,batchSize,true,midIn.buf,midScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+
+//----------------------------------------------------------------------------
+
+
+struct GlobalPoolingResidualBlock {
+  const string name;
+  const BatchNormLayer preBN;
+  const ConvLayer regularConv;
+  const ConvLayer gpoolConv;
+  const BatchNormLayer gpoolBN;
+  const MatMulLayer gpoolToBiasMul;
+  const NormActConv normActConv2;
+
+  const int nnXLen;
+  const int nnYLen;
+  const int regularChannels;
+  const int gpoolChannels;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  GlobalPoolingResidualBlock() = delete;
+  GlobalPoolingResidualBlock(const GlobalPoolingResidualBlock&) = delete;
+  GlobalPoolingResidualBlock& operator=(const GlobalPoolingResidualBlock&) = delete;
+
+  GlobalPoolingResidualBlock(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const GlobalPoolingResidualBlockDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ): name(desc->name),
+     preBN(cudaHandles,&desc->preBN,&desc->preActivation,nnX,nnY,useFP16,useNHWC),
+     regularConv(cudaHandles,manager,&desc->regularConv,useFP16,useNHWC),
+     gpoolConv(cudaHandles,manager,&desc->gpoolConv,useFP16,useNHWC),
+     gpoolBN(cudaHandles,&desc->gpoolBN,&desc->gpoolActivation,nnX,nnY,useFP16,useNHWC),
+     gpoolToBiasMul(cudaHandles,&desc->gpoolToBiasMul,useFP16),
+     normActConv2(cudaHandles,manager,&desc->midBN,&desc->midActivation,&desc->finalConv,nnX,nnY,useFP16,useNHWC),
+     nnXLen(nnX),
+     nnYLen(nnY),
+     regularChannels(desc->regularConv.outChannels),
+     gpoolChannels(desc->gpoolConv.outChannels),
+     usingFP16(useFP16),
+     usingNHWC(useNHWC)
+  {
+  }
+
+  ~GlobalPoolingResidualBlock() {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+    b = regularConv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = gpoolConv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = gpoolToBiasMul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = sizeof(float)*batchSize*gpoolChannels*nnXLen*nnYLen;
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* trunkBuf,
+    void* trunkScratchBuf,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> regularOut(scratch->allocator, scratch->getBufSizeXY(regularChannels));
+    SizedBuf<void*> regularScratch(scratch->allocator, scratch->getBufSizeXY(regularChannels));
+    SizedBuf<void*> gpoolOut(scratch->allocator, scratch->getBufSizeXY(gpoolChannels));
+    SizedBuf<void*> gpoolOut2(scratch->allocator, scratch->getBufSizeXY(gpoolChannels));
+    SizedBuf<void*> gpoolConcat(scratch->allocator, scratch->getBufSize(gpoolChannels*3));
+    SizedBuf<void*> gpoolBias(scratch->allocator, scratch->getBufSize(regularChannels));
+
+    preBN.apply(cudaHandles,batchSize,trunkBuf,maskBuf,trunkScratchBuf);
+    regularConv.apply(cudaHandles,batchSize,false,trunkScratchBuf,regularOut.buf,workspaceBuf,workspaceBytes);
+    gpoolConv.apply(cudaHandles,batchSize,false,trunkScratchBuf,gpoolOut.buf,workspaceBuf,workspaceBytes);
+    gpoolBN.apply(cudaHandles,batchSize,gpoolOut.buf,maskBuf,gpoolOut2.buf);
+
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaPoolRowsGPoolNCHW((const float*)gpoolOut2.buf,(float*)gpoolConcat.buf,batchSize,gpoolChannels,nnXLen*nnYLen,(const float*)maskBuf,maskSumBuf);
+      else
+        customCudaPoolRowsGPoolNHWC((const float*)gpoolOut2.buf,(float*)gpoolConcat.buf,batchSize,nnXLen*nnYLen,gpoolChannels,(const float*)maskBuf,maskSumBuf);
+    }
+    else {
+      if(!usingNHWC)
+        customCudaPoolRowsGPoolNCHW((const half*)gpoolOut2.buf,(half*)gpoolConcat.buf,batchSize,gpoolChannels,nnXLen*nnYLen,(const half*)maskBuf,maskSumBuf);
+      else
+        customCudaPoolRowsGPoolNHWC((const half*)gpoolOut2.buf,(half*)gpoolConcat.buf,batchSize,nnXLen*nnYLen,gpoolChannels,(const half*)maskBuf,maskSumBuf);
+    }
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    gpoolToBiasMul.apply(cudaHandles,scratch,batchSize,gpoolConcat.buf,gpoolBias.buf,workspaceBuf,workspaceBytes);
+
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaAddNCBiasInplaceNCHW((float*)regularOut.buf,(const float*)gpoolBias.buf,batchSize,regularChannels,nnXLen*nnYLen);
+      else
+        customCudaAddNCBiasInplaceNHWC((float*)regularOut.buf,(const float*)gpoolBias.buf,batchSize,nnXLen*nnYLen,regularChannels);
+    }
+    else {
+      if(!usingNHWC)
+        customCudaAddNCBiasInplaceNCHW((half*)regularOut.buf,(const half*)gpoolBias.buf,batchSize,regularChannels,nnXLen*nnYLen);
+      else
+        customCudaAddNCBiasInplaceNHWC((half*)regularOut.buf,(const half*)gpoolBias.buf,batchSize,nnXLen*nnYLen,regularChannels);
+    }
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    normActConv2.apply(cudaHandles,batchSize,true,regularOut.buf,regularScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+struct BlockStack {
+  const int numBlocks;
+  const int trunkNumChannels;
+  const int nnXLen;
+  const int nnYLen;
+  const bool usingFP16;
+  const bool usingNHWC;
+  vector<pair<int,unique_ptr_void>> blocks;
+
+  BlockStack() = delete;
+  BlockStack(const BlockStack&) = delete;
+  BlockStack& operator=(const BlockStack&) = delete;
+
+  BlockStack(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    int nBlocks,
+    int trunkChannels,
+    const std::vector<std::pair<int, unique_ptr_void>>& descBlocks,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  );
+  ~BlockStack();
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const;
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* trunkBuf,
+    void* trunkScratchBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const;
+
+};
+
+//------------------------------------------------------------------------------
+
+struct NestedBottleneckResidualBlock {
+  const string name;
+  const NormActConv normActConv1;
+  const BlockStack blocks;
+  const NormActConv normActConv2;
+
+  NestedBottleneckResidualBlock() = delete;
+  NestedBottleneckResidualBlock(const NestedBottleneckResidualBlock&) = delete;
+  NestedBottleneckResidualBlock& operator=(const NestedBottleneckResidualBlock&) = delete;
+
+  NestedBottleneckResidualBlock(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const NestedBottleneckResidualBlockDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ): name(desc->name),
+     normActConv1(cudaHandles,manager,&desc->preBN,&desc->preActivation,&desc->preConv,nnX,nnY,useFP16,useNHWC),
+     blocks(cudaHandles,manager,desc->numBlocks,desc->preConv.outChannels,desc->blocks,nnX,nnY,useFP16,useNHWC),
+     normActConv2(cudaHandles,manager,&desc->postBN,&desc->postActivation,&desc->postConv,nnX,nnY,useFP16,useNHWC)
+  {
+  }
+
+  ~NestedBottleneckResidualBlock()
+  {}
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+    b = normActConv1.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = blocks.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* trunkBuf,
+    void* trunkScratchBuf,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> mid(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
+    SizedBuf<void*> midScratch(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
+    assert(normActConv1.outChannels == normActConv2.inChannels);
+    normActConv1.apply(cudaHandles,batchSize,false,trunkBuf,trunkScratchBuf,mid.buf,maskBuf,workspaceBuf,workspaceBytes);
+    blocks.apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      maskBuf,
+      maskSumBuf,
+      mid.buf,
+      midScratch.buf,
+      workspaceBuf,
+      workspaceBytes
+    );
+    normActConv2.apply(cudaHandles,batchSize,true,mid.buf,midScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+BlockStack::BlockStack(
+  CudaHandles* cudaHandles,
+  CudnnManager* manager,
+  int nBlocks,
+  int trunkChannels,
+  const std::vector<std::pair<int, unique_ptr_void>>& descBlocks,
+  int nnX,
+  int nnY,
+  bool useFP16,
+  bool useNHWC
+) :
+  numBlocks(nBlocks),
+  trunkNumChannels(trunkChannels),
+  nnXLen(nnX),
+  nnYLen(nnY),
+  usingFP16(useFP16),
+  usingNHWC(useNHWC)
+{
+  assert(numBlocks == descBlocks.size());
+  for(int i = 0; i<numBlocks; i++) {
+    if(descBlocks[i].first == ORDINARY_BLOCK_KIND) {
+      ResidualBlockDesc* blockDesc = (ResidualBlockDesc*)descBlocks[i].second.get();
+      unique_ptr_void blockPtr = make_unique_void(
+        new ResidualBlock(
+          cudaHandles,
+          manager,
+          blockDesc,
+          nnXLen,
+          nnYLen,
+          useFP16,
+          useNHWC
+        )
+      );
+      blocks.push_back(make_pair(ORDINARY_BLOCK_KIND,std::move(blockPtr)));
+    }
+    else if(descBlocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
+      GlobalPoolingResidualBlockDesc* blockDesc = (GlobalPoolingResidualBlockDesc*)descBlocks[i].second.get();
+      unique_ptr_void blockPtr = make_unique_void(
+        new GlobalPoolingResidualBlock(
+          cudaHandles,
+          manager,
+          blockDesc,
+          nnXLen,
+          nnYLen,
+          useFP16,
+          useNHWC
+        )
+      );
+      blocks.push_back(make_pair(GLOBAL_POOLING_BLOCK_KIND,std::move(blockPtr)));
+    }
+    else if(descBlocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
+      NestedBottleneckResidualBlockDesc* blockDesc = (NestedBottleneckResidualBlockDesc*)descBlocks[i].second.get();
+      unique_ptr_void blockPtr = make_unique_void(
+        new NestedBottleneckResidualBlock(
+          cudaHandles,
+          manager,
+          blockDesc,
+          nnXLen,
+          nnYLen,
+          useFP16,
+          useNHWC
+        )
+      );
+      blocks.push_back(make_pair(NESTED_BOTTLENECK_BLOCK_KIND,std::move(blockPtr)));
+    }
+    else {
+      ASSERT_UNREACHABLE;
+    }
+  }
+}
+BlockStack::~BlockStack() {
+}
+
+size_t BlockStack::requiredWorkspaceBytes(
+  CudaHandles* cudaHandles,
+  int batchSize
+) const {
+  size_t bytes = 0;
+  size_t b;
+
+  for(int i = 0; i<blocks.size(); i++) {
+    if(blocks[i].first == ORDINARY_BLOCK_KIND) {
+      ResidualBlock* block = (ResidualBlock*)blocks[i].second.get();
+      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+    else if(blocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
+      GlobalPoolingResidualBlock* block = (GlobalPoolingResidualBlock*)blocks[i].second.get();
+      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+    else if(blocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
+      NestedBottleneckResidualBlock* block = (NestedBottleneckResidualBlock*)blocks[i].second.get();
+      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+    else {
+      ASSERT_UNREACHABLE;
+    }
+  }
+  return bytes;
+}
+
+void BlockStack::apply(
+  CudaHandles* cudaHandles,
+  ScratchBuffers* scratch,
+  int batchSize,
+  void* maskBuf,
+  float* maskSumBuf,
+  void* trunkBuf,
+  void* trunkScratchBuf,
+  void* workspaceBuf,
+  size_t workspaceBytes
+) const {
+
+  for(int i = 0; i<blocks.size(); i++) {
+#ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("Blockstack before block " + Global::intToString(i)), trunkBuf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
+#endif
+
+    if(blocks[i].first == ORDINARY_BLOCK_KIND) {
+      ResidualBlock* block = (ResidualBlock*)blocks[i].second.get();
+      block->apply(
+        cudaHandles,
+        scratch,
+        batchSize,
+        trunkBuf,
+        trunkScratchBuf,
+        maskBuf,
+        workspaceBuf,
+        workspaceBytes
+      );
+    }
+    else if(blocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
+      GlobalPoolingResidualBlock* block = (GlobalPoolingResidualBlock*)blocks[i].second.get();
+      block->apply(
+        cudaHandles,
+        scratch,
+        batchSize,
+        trunkBuf,
+        trunkScratchBuf,
+        maskBuf,
+        maskSumBuf,
+        workspaceBuf,
+        workspaceBytes
+      );
+    }
+    else if(blocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
+      NestedBottleneckResidualBlock* block = (NestedBottleneckResidualBlock*)blocks[i].second.get();
+      block->apply(
+        cudaHandles,
+        scratch,
+        batchSize,
+        trunkBuf,
+        trunkScratchBuf,
+        maskBuf,
+        maskSumBuf,
+        workspaceBuf,
+        workspaceBytes
+      );
+    }
+    else {
+      ASSERT_UNREACHABLE;
+    }
+  }
+}
+//------------------------------------------------------------------------------
+
+struct SGFMetadataEncoder {
+  const string name;
+
+  const bool usingFP16;
+
+  const MatMulLayer mul1;
+  const MatBiasLayer bias1;
+  const MatMulLayer mul2;
+  const MatBiasLayer bias2;
+  const MatMulLayer mul3;
+
+  SGFMetadataEncoder() = delete;
+  SGFMetadataEncoder(const SGFMetadataEncoder&) = delete;
+  SGFMetadataEncoder& operator=(const SGFMetadataEncoder&) = delete;
+
+  SGFMetadataEncoder(
+    CudaHandles* cudaHandles,
+    const SGFMetadataEncoderDesc* desc,
+    bool useFP16
+  ) :
+    name(desc->name),
+    usingFP16(useFP16),
+    mul1(cudaHandles,&desc->mul1,useFP16),
+    bias1(cudaHandles,&desc->bias1,useFP16,desc->act1.activation),
+    mul2(cudaHandles,&desc->mul2,useFP16),
+    bias2(cudaHandles,&desc->bias2,useFP16,desc->act2.activation),
+    mul3(cudaHandles,&desc->mul3,useFP16)
+  {
+  }
+
+  ~SGFMetadataEncoder()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    (void)batchSize;
+    size_t bytes = 0;
+    size_t b;
+
+    b = mul1.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = mul2.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = mul3.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* inputBuf,
+    void* outputBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> internalBuf1(scratch->allocator, scratch->getBufSizeFloat(std::max(mul1.outChannels,mul2.outChannels)));
+    SizedBuf<void*> internalBuf2(scratch->allocator, scratch->getBufSizeFloat(std::max(mul1.outChannels,mul2.outChannels)));
+
+    mul1.apply(cudaHandles,scratch,batchSize,inputBuf,internalBuf1.buf,workspaceBuf,workspaceBytes);
+    bias1.apply(cudaHandles,batchSize,internalBuf1.buf);
+    mul2.apply(cudaHandles,scratch,batchSize,internalBuf1.buf,internalBuf2.buf,workspaceBuf,workspaceBytes);
+    bias2.apply(cudaHandles,batchSize,internalBuf2.buf);
+    mul3.apply(cudaHandles,scratch,batchSize,internalBuf2.buf,outputBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+
+//----------------------------------------------------------------------------
+
+struct Trunk {
+  const string name;
+  const int modelVersion;
+  const int numBlocks;
+  const int trunkNumChannels;
+
+  const int nnXLen;
+  const int nnYLen;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  std::unique_ptr<ConvLayer> initialConv;
+  std::unique_ptr<MatMulLayer> initialMatMul;
+  std::unique_ptr<SGFMetadataEncoder> sgfMetadataEncoder;
+  const BlockStack blocks;
+  std::unique_ptr<BatchNormLayer> trunkTipBN;
+
+  Trunk() = delete;
+  Trunk(const Trunk&) = delete;
+  Trunk& operator=(const Trunk&) = delete;
+
+  Trunk(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const TrunkDesc* desc,
+    int nnX,
+    int nnY,
+    bool inputsUseNHWC,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    modelVersion(desc->modelVersion),
+    numBlocks(desc->numBlocks),
+    trunkNumChannels(desc->trunkNumChannels),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC),
+    blocks(cudaHandles,manager,desc->numBlocks,desc->trunkNumChannels,desc->blocks,nnX,nnY,useFP16,useNHWC)
+  {
+    int midNumChannels = desc->midNumChannels;
+    int regularNumChannels = desc->regularNumChannels;
+    int gpoolNumChannels = desc->gpoolNumChannels;
+
+    int maxBatchSize = manager->maxBatchSize;
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,trunkNumChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,midNumChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,regularNumChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,gpoolNumChannels);
+
+    initialConv = std::make_unique<ConvLayer>(cudaHandles,manager,&desc->initialConv,useFP16,inputsUseNHWC,useNHWC);
+    initialMatMul = std::make_unique<MatMulLayer>(cudaHandles,&desc->initialMatMul,useFP16);
+    if(desc->metaEncoderVersion > 0) {
+      sgfMetadataEncoder = std::make_unique<SGFMetadataEncoder>(cudaHandles,&desc->sgfMetadataEncoder,useFP16);
+      testAssert(sgfMetadataEncoder->mul3.outChannels == initialMatMul->outChannels);
+    }
+
+    trunkTipBN = std::make_unique<BatchNormLayer>(cudaHandles,&desc->trunkTipBN,&desc->trunkTipActivation,nnXLen,nnYLen,useFP16,useNHWC);
+    assert(desc->blocks.size() == numBlocks);
+  }
+
+  ~Trunk()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+
+    b = initialConv->requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+
+    b = initialMatMul->requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+
+    if(sgfMetadataEncoder != nullptr) {
+      b = sgfMetadataEncoder->requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+
+    b = blocks.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* inputBuf,
+    void* inputGlobalBuf,
+    void* inputMetaBuf,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* trunkBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+
+    SizedBuf<void*> trunkScratch(scratch->allocator, scratch->getBufSizeXY(trunkNumChannels));
+
+    //Feed the conv into trunkScratch.buf, not trunkBuf
+    initialConv->apply(cudaHandles,batchSize,false,inputBuf,trunkScratch.buf,workspaceBuf,workspaceBytes);
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("After initial conv"), trunkScratch.buf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    #endif
+
+    //Feed the matmul into trunkBuf
+    initialMatMul->apply(cudaHandles,scratch,batchSize,inputGlobalBuf,trunkBuf,workspaceBuf,workspaceBytes);
+    //Then accumulate it into trunkScratch.buf, broadcasting during the process
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaAddNCBiasInplaceNCHW((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
+      else
+        customCudaAddNCBiasInplaceNHWC((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
+    }
+    else {
+      if(!usingNHWC)
+        customCudaAddNCBiasInplaceNCHW((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
+      else
+        customCudaAddNCBiasInplaceNHWC((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
+    }
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    if(sgfMetadataEncoder != nullptr) {
+      testAssert(inputMetaBuf != NULL);
+      //Feed the result into trunkBuf
+      sgfMetadataEncoder->apply(cudaHandles,scratch,batchSize,inputMetaBuf,trunkBuf,workspaceBuf,workspaceBytes);
+      //Then accumulate it into trunkScratch.buf, broadcasting during the process
+      if(!usingFP16) {
+        if(!usingNHWC)
+          customCudaAddNCBiasInplaceNCHW((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
+        else
+          customCudaAddNCBiasInplaceNHWC((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
+      }
+      else {
+        if(!usingNHWC)
+          customCudaAddNCBiasInplaceNCHW((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
+        else
+          customCudaAddNCBiasInplaceNHWC((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
+      }
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+    else {
+      testAssert(inputMetaBuf == NULL);
+    }
+
+    //Flip trunkBuf and trunkScratch.buf so that the result gets accumulated in trunkScratch.buf
+    blocks.apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      maskBuf,
+      maskSumBuf,
+      trunkScratch.buf,
+      trunkBuf,
+      workspaceBuf,
+      workspaceBytes
+    );
+
+    //And now with the final BN port it from trunkScratch.buf to trunkBuf.
+    trunkTipBN->apply(cudaHandles,batchSize,trunkScratch.buf,maskBuf,trunkBuf);
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("Trunk tip"), trunkBuf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    #endif
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+static void fillMaskFloatBufAndMaskSumBuf(void* maskBuf, float*& maskFloatBuf, float*& maskSumBuf, bool usingFP16, int batchSize, int nnXLen, int nnYLen) {
+  if(!usingFP16) {
+    maskFloatBuf = (float*)maskBuf;
+    customCudaPoolRowsSumNCHW((const float*)maskFloatBuf,maskSumBuf,batchSize,1,nnXLen*nnYLen,1.0);
+    CUDA_ERR("sumMask",hipPeekAtLastError());
+  }
+  else {
+    customCudaCopyFromHalf((const half*)maskBuf,maskFloatBuf,batchSize*nnXLen*nnYLen);
+    CUDA_ERR("copyMaskFromHalf",hipPeekAtLastError());
+    customCudaPoolRowsSumNCHW((const float*)maskFloatBuf,maskSumBuf,batchSize,1,nnXLen*nnYLen,1.0);
+    CUDA_ERR("sumMask",hipPeekAtLastError());
+  }
+}
+
+
+//------------------------------------------------------------------------------
+
+struct PolicyHead {
+  const string name;
+  const int modelVersion;
+  const int nnXLen;
+  const int nnYLen;
+  const int p1Channels;
+  const int g1Channels;
+  const int p2Channels;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  const ConvLayer p1Conv;
+  const ConvLayer g1Conv;
+  const BatchNormLayer g1BN;
+  const MatMulLayer gpoolToBiasMul;
+  const BatchNormLayer p1BN;
+  const ConvLayer p2Conv;
+  const MatMulLayer gpoolToPassMul;
+  const MatBiasLayer gpoolToPassBias;
+  const MatMulLayer gpoolToPassMul2;
+
+  PolicyHead() = delete;
+  PolicyHead(const PolicyHead&) = delete;
+  PolicyHead& operator=(const PolicyHead&) = delete;
+
+  PolicyHead(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const PolicyHeadDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    modelVersion(desc->modelVersion),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    p1Channels(desc->p1Conv.outChannels),
+    g1Channels(desc->g1Conv.outChannels),
+    p2Channels(desc->p2Conv.outChannels),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC),
+    p1Conv(cudaHandles,manager,&desc->p1Conv,useFP16,useNHWC),
+    g1Conv(cudaHandles,manager,&desc->g1Conv,useFP16,useNHWC),
+    g1BN(cudaHandles,&desc->g1BN,&desc->g1Activation,nnX,nnY,useFP16,useNHWC),
+    gpoolToBiasMul(cudaHandles,&desc->gpoolToBiasMul,false),
+    p1BN(cudaHandles,&desc->p1BN,&desc->p1Activation,nnX,nnY,false,useNHWC),
+    p2Conv(cudaHandles,manager,&desc->p2Conv,false,useNHWC),
+    gpoolToPassMul(cudaHandles,&desc->gpoolToPassMul,false),
+    gpoolToPassBias(cudaHandles,&desc->gpoolToPassBias,false,desc->passActivation.activation),
+    gpoolToPassMul2(cudaHandles,&desc->gpoolToPassMul2,false)
+  {
+  }
+
+  ~PolicyHead()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+
+    b = p1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = g1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = gpoolToBiasMul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = p2Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = gpoolToPassMul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = gpoolToPassMul2.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = sizeof(float)*batchSize*g1Channels*nnXLen*nnYLen;
+    bytes = std::max(bytes,b);
+
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* maskBuf,
+    float* maskFloatBuf,
+    float* maskSumBuf,
+    void* trunkBuf,
+    float* policyPassBuf,
+    float* policyBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+
+    SizedBuf<void*> p1Out(scratch->allocator, scratch->getBufSizeXYFloat(p1Channels)); //Need to hold floats, not just halfs
+    SizedBuf<void*> p1Out2(scratch->allocator, scratch->getBufSizeXYFloat(p1Channels)); //Need to hold floats, not just halfs
+    SizedBuf<void*> g1Out(scratch->allocator, scratch->getBufSizeXY(g1Channels));
+    SizedBuf<void*> g1Out2(scratch->allocator, scratch->getBufSizeXY(g1Channels));
+    SizedBuf<void*> g1Concat(scratch->allocator, scratch->getBufSizeFloat(g1Channels*3));
+    SizedBuf<void*> g1Bias(scratch->allocator, scratch->getBufSizeFloat(p1Channels));
+    SizedBuf<void*> p1Pass(scratch->allocator, scratch->getBufSizeFloat(p1Channels));
+
+    p1Conv.apply(cudaHandles,batchSize,false,trunkBuf,p1Out.buf,workspaceBuf,workspaceBytes);
+    g1Conv.apply(cudaHandles,batchSize,false,trunkBuf,g1Out.buf,workspaceBuf,workspaceBytes);
+    g1BN.apply(cudaHandles,batchSize,g1Out.buf,maskBuf,g1Out2.buf);
+
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaPoolRowsGPoolNCHW((const float*)g1Out2.buf,(float*)g1Concat.buf,batchSize,g1Channels,nnXLen*nnYLen,maskFloatBuf,maskSumBuf);
+      else
+        customCudaPoolRowsGPoolNHWC((const float*)g1Out2.buf,(float*)g1Concat.buf,batchSize,nnXLen*nnYLen,g1Channels,maskFloatBuf,maskSumBuf);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+    else {
+      customCudaCopyFromHalf((const half*)g1Out2.buf,(float*)workspaceBuf,batchSize*g1Channels*nnXLen*nnYLen);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+      if(!usingNHWC)
+        customCudaPoolRowsGPoolNCHW((const float*)workspaceBuf,(float*)g1Concat.buf,batchSize,g1Channels,nnXLen*nnYLen,maskFloatBuf,maskSumBuf);
+      else
+        customCudaPoolRowsGPoolNHWC((const float*)workspaceBuf,(float*)g1Concat.buf,batchSize,nnXLen*nnYLen,g1Channels,maskFloatBuf,maskSumBuf);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+
+    gpoolToBiasMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,g1Bias.buf,workspaceBuf,workspaceBytes);
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("p1 pre-gpool-sum"), p1Out.buf, batchSize, p1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    CudaUtils::debugPrint4D(string("g1 pre-gpool"), g1Out.buf, batchSize, g1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    CudaUtils::debugPrint2D(string("g1 pooled"), g1Concat.buf, batchSize, g1Channels*3, false);
+    CudaUtils::debugPrint2D(string("g1 biases"), g1Bias.buf, batchSize, p1Channels, false);
+    #endif
+
+    float* p1OutBufA;
+    float* p1OutBufB;
+    if(!usingFP16) {
+      p1OutBufA = (float*)p1Out.buf;
+      p1OutBufB = (float*)p1Out2.buf;
+    }
+    else {
+      customCudaCopyFromHalf((const half*)p1Out.buf,(float*)p1Out2.buf,batchSize*p1Channels*nnXLen*nnYLen);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+      p1OutBufA = (float*)p1Out2.buf;
+      p1OutBufB = (float*)p1Out.buf;
+    }
+
+    if(!usingNHWC)
+      customCudaAddNCBiasInplaceNCHW(p1OutBufA,(float*)g1Bias.buf,batchSize,p1Channels,nnXLen*nnYLen);
+    else
+      customCudaAddNCBiasInplaceNHWC(p1OutBufA,(float*)g1Bias.buf,batchSize,nnXLen*nnYLen,p1Channels);
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    p1BN.apply(cudaHandles,batchSize,p1OutBufA,maskFloatBuf,p1OutBufB);
+    p2Conv.apply(cudaHandles,batchSize,false,p1OutBufB,(float*)policyBuf,workspaceBuf,workspaceBytes);
+
+    if(modelVersion >= 15) {
+      gpoolToPassMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,p1Pass.buf,workspaceBuf,workspaceBytes);
+      gpoolToPassBias.apply(cudaHandles,batchSize,p1Pass.buf);
+      gpoolToPassMul2.apply(cudaHandles,scratch,batchSize,p1Pass.buf,policyPassBuf,workspaceBuf,workspaceBytes);
+    }
+    else {
+      gpoolToPassMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,policyPassBuf,workspaceBuf,workspaceBytes);
+    }
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("p1 after-gpool-sum"), p1OutBufA, batchSize, p1Channels, nnXLen, nnYLen, usingNHWC, false);
+    CudaUtils::debugPrint2D(string("policypass"), policyPassBuf, batchSize, 1, false);
+    CudaUtils::debugPrint4D(string("policy"), policyBuf, batchSize, p2Channels, nnXLen, nnYLen, usingNHWC, false);
+    #endif
+
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+struct ValueHead {
+  const string name;
+  const int modelVersion;
+  const int nnXLen;
+  const int nnYLen;
+  const int v1Channels;
+  const int v2Channels;
+  const int valueChannels;
+  const int scoreValueChannels;
+  const int ownershipChannels;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  const ConvLayer v1Conv;
+  const BatchNormLayer v1BN;
+  const MatMulLayer v2Mul;
+  const MatBiasLayer v2Bias;
+  const MatMulLayer v3Mul;
+  const MatBiasLayer v3Bias;
+  const MatMulLayer sv3Mul;
+  const MatBiasLayer sv3Bias;
+  const ConvLayer vOwnershipConv;
+
+  ValueHead() = delete;
+  ValueHead(const ValueHead&) = delete;
+  ValueHead& operator=(const ValueHead&) = delete;
+
+  ValueHead(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const ValueHeadDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    modelVersion(desc->modelVersion),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    v1Channels(desc->v1Conv.outChannels),
+    v2Channels(desc->v2Mul.outChannels),
+    valueChannels(desc->v3Mul.outChannels),
+    scoreValueChannels(desc->sv3Mul.outChannels),
+    ownershipChannels(desc->vOwnershipConv.outChannels),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC),
+    v1Conv(cudaHandles,manager,&desc->v1Conv,useFP16,useNHWC),
+    v1BN(cudaHandles,&desc->v1BN,&desc->v1Activation,nnX,nnY,useFP16,useNHWC),
+    v2Mul(cudaHandles,&desc->v2Mul,false),
+    v2Bias(cudaHandles,&desc->v2Bias,false,desc->v2Activation.activation),
+    v3Mul(cudaHandles,&desc->v3Mul,false),
+    v3Bias(cudaHandles,&desc->v3Bias,false,ACTIVATION_IDENTITY),
+    sv3Mul(cudaHandles,&desc->sv3Mul,false),
+    sv3Bias(cudaHandles,&desc->sv3Bias,false,ACTIVATION_IDENTITY),
+    vOwnershipConv(cudaHandles,manager,&desc->vOwnershipConv,useFP16,useNHWC)
+  {
+  }
+
+  ~ValueHead()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+
+    b = v1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = v2Mul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = v3Mul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = sizeof(float)*batchSize*v1Channels*nnXLen*nnYLen;
+    bytes = std::max(bytes,b);
+
+    b = sv3Mul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = vOwnershipConv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = sizeof(float)*batchSize*ownershipChannels*nnXLen*nnYLen;
+    bytes = std::max(bytes,b);
+
+    return bytes;
+  }
+
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* trunkBuf,
+    float* valueBuf,
+    float* scoreValueBuf,
+    void* ownershipBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> v1Out(scratch->allocator, scratch->getBufSizeXY(v1Channels));
+    SizedBuf<void*> v1Out2(scratch->allocator, scratch->getBufSizeXY(v1Channels));
+    SizedBuf<void*> v1Mean(scratch->allocator, scratch->getBufSizeFloat(v1Channels*3));
+    SizedBuf<void*> v2Out(scratch->allocator, scratch->getBufSizeFloat(v2Channels));
+    SizedBuf<void*> ownershipScratch(scratch->allocator, scratch->getBufSizeXYFloat(ownershipChannels));
+
+    v1Conv.apply(cudaHandles,batchSize,false,trunkBuf,v1Out.buf,workspaceBuf,workspaceBytes);
+    v1BN.apply(cudaHandles,batchSize,v1Out.buf,maskBuf,v1Out2.buf);
+
+    void* bufToBePooled = v1Out2.buf;
+    if(usingFP16) {
+      customCudaCopyFromHalf((const half*)v1Out2.buf,(float*)workspaceBuf,batchSize*v1Channels*nnXLen*nnYLen);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+      bufToBePooled = workspaceBuf;
+    }
+
+    if(!usingNHWC)
+      customCudaValueHeadPoolNCHW((float*)bufToBePooled,(float*)v1Mean.buf,batchSize,v1Channels,nnXLen*nnYLen,maskSumBuf);
+    else
+      customCudaValueHeadPoolNHWC((const float*)bufToBePooled,(float*)v1Mean.buf,batchSize,nnXLen*nnYLen,v1Channels,maskSumBuf);
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    v2Mul.apply(cudaHandles,scratch,batchSize,v1Mean.buf,v2Out.buf,workspaceBuf,workspaceBytes);
+    v2Bias.apply(cudaHandles,batchSize,v2Out.buf);
+    v3Mul.apply(cudaHandles,scratch,batchSize,v2Out.buf,valueBuf,workspaceBuf,workspaceBytes);
+    v3Bias.apply(cudaHandles,batchSize,valueBuf);
+
+    sv3Mul.apply(cudaHandles,scratch,batchSize,v2Out.buf,scoreValueBuf,workspaceBuf,workspaceBytes);
+    sv3Bias.apply(cudaHandles,batchSize,scoreValueBuf);
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("v1"), v1Out.buf, batchSize, v1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    CudaUtils::debugPrint2D(string("v1 pooled"), v1Mean.buf, batchSize, v1Channels, false);
+    CudaUtils::debugPrint2D(string("v2"), v2Out.buf, batchSize, v1Channels, false);
+    #endif
+
+    if(!usingFP16) {
+      vOwnershipConv.apply(cudaHandles,batchSize,false,v1Out2.buf,ownershipBuf,workspaceBuf,workspaceBytes);
+    }
+    else {
+      vOwnershipConv.apply(cudaHandles,batchSize,false,v1Out2.buf,ownershipScratch.buf,workspaceBuf,workspaceBytes);
+      customCudaCopyFromHalf((const half*)ownershipScratch.buf,(float*)ownershipBuf,batchSize*ownershipChannels*nnXLen*nnYLen);
+      CUDA_ERR("vOwnership copy",hipPeekAtLastError());
+    }
+
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+struct Model {
+  const string name;
+  const int modelVersion;
+  const int maxBatchSize;
+  const int nnXLen;
+  const int nnYLen;
+  const int numInputChannels;
+  const int numInputGlobalChannels;
+  const int numInputMetaChannels;
+  const int numPolicyChannels;
+  const int numValueChannels;
+  const int numScoreValueChannels;
+  const int numOwnershipChannels;
+  const bool usingFP16;
+  const bool usingNHWC;
+  const bool inputsUsingNHWC;
+
+  std::unique_ptr<Trunk> trunk;
+  std::unique_ptr<PolicyHead> policyHead;
+  std::unique_ptr<ValueHead> valueHead;
+  std::unique_ptr<CudnnManager> manager;
+
+  Model() = delete;
+  Model(const Model&) = delete;
+  Model& operator=(const Model&) = delete;
+
+  Model(
+    CudaHandles* cudaHandles,
+    const ModelDesc* desc,
+    int maxBatchSz,
+    int nnX,
+    int nnY,
+    bool inputsUseNHWC,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    modelVersion(desc->modelVersion),
+    maxBatchSize(maxBatchSz),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    numInputChannels(desc->numInputChannels),
+    numInputGlobalChannels(desc->numInputGlobalChannels),
+    numInputMetaChannels(desc->numInputMetaChannels),
+    numPolicyChannels(desc->numPolicyChannels),
+    numValueChannels(desc->numValueChannels),
+    numScoreValueChannels(desc->numScoreValueChannels),
+    numOwnershipChannels(desc->numOwnershipChannels),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC),
+    inputsUsingNHWC(inputsUseNHWC)
+  {
+    if(nnXLen > NNPos::MAX_BOARD_LEN)
+      throw StringError(Global::strprintf("nnXLen (%d) is greater than NNPos::MAX_BOARD_LEN (%d)",
+        nnXLen, NNPos::MAX_BOARD_LEN
+      ));
+    if(nnYLen > NNPos::MAX_BOARD_LEN)
+      throw StringError(Global::strprintf("nnYLen (%d) is greater than NNPos::MAX_BOARD_LEN (%d)",
+        nnYLen, NNPos::MAX_BOARD_LEN
+      ));
+
+    int numFeatures = NNModelVersion::getNumSpatialFeatures(modelVersion);
+    if(numInputChannels != numFeatures)
+      throw StringError(Global::strprintf("Neural net numInputChannels (%d) was not the expected number based on version (%d)",
+        numInputChannels, numFeatures
+      ));
+    int numGlobalFeatures = NNModelVersion::getNumGlobalFeatures(modelVersion);
+    if(numInputGlobalChannels != numGlobalFeatures)
+      throw StringError(Global::strprintf("Neural net numInputGlobalChannels (%d) was not the expected number based on version (%d)",
+        numInputGlobalChannels, numGlobalFeatures
+      ));
+    if(numInputMetaChannels > 0) {
+      if(numInputMetaChannels != SGFMetadata::METADATA_INPUT_NUM_CHANNELS)
+        throw StringError(Global::strprintf("Neural net numInputMetaChannels (%d) was not the expected number (%d)",
+          numInputMetaChannels, SGFMetadata::METADATA_INPUT_NUM_CHANNELS
+        ));
+    }
+
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputGlobalChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputMetaChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numPolicyChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numValueChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numScoreValueChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numOwnershipChannels);
+
+    manager = std::make_unique<CudnnManager>(name, maxBatchSize, nnXLen, nnYLen);
+    trunk = std::make_unique<Trunk>(cudaHandles,manager.get(),&desc->trunk,nnXLen,nnYLen,inputsUseNHWC,useFP16,useNHWC);
+    policyHead = std::make_unique<PolicyHead>(cudaHandles,manager.get(),&desc->policyHead,nnXLen,nnYLen,useFP16,useNHWC);
+    valueHead = std::make_unique<ValueHead>(cudaHandles,manager.get(),&desc->valueHead,nnXLen,nnYLen,useFP16,useNHWC);
+  }
+
+  ~Model()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+
+    b = trunk->requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = policyHead->requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = valueHead->requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    bool requireExactNNLen,
+
+    void* inputBuf,
+    void* inputGlobalBuf,
+    void* inputMetaBuf,
+
+    float* policyPassBuf,
+    float* policyBuf,
+
+    float* valueBuf,
+    float* scoreValueBuf,
+    void* ownershipBuf,
+
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> mask(scratch->allocator, scratch->getBufSizeXY(1));
+    SizedBuf<void*> maskFloat(scratch->allocator, scratch->getBufSizeXYFloat(1));
+    SizedBuf<void*> maskSum(scratch->allocator, scratch->getBufSizeFloat(1));
+
+    void* maskBuf = mask.buf;
+    float* maskFloatBuf = (float*)maskFloat.buf;
+    float* maskSumBuf = (float*)maskSum.buf;
+
+    if(!usingFP16) {
+      if(inputsUsingNHWC)
+        customCudaChannel0ExtractNHWC((const float*)inputBuf, (float*)maskBuf, batchSize, nnXLen*nnYLen, numInputChannels);
+      else
+        customCudaChannel0ExtractNCHW((const float*)inputBuf, (float*)maskBuf, batchSize, numInputChannels, nnXLen*nnYLen);
+      CUDA_ERR("modelExtractMask",hipPeekAtLastError());
+    }
+    else {
+      if(inputsUsingNHWC)
+        customCudaChannel0ExtractNHWC((const half*)inputBuf, (half*)maskBuf, batchSize, nnXLen*nnYLen, numInputChannels);
+      else
+        customCudaChannel0ExtractNCHW((const half*)inputBuf, (half*)maskBuf, batchSize, numInputChannels, nnXLen*nnYLen);
+      CUDA_ERR("modelExtractMask",hipPeekAtLastError());
+    }
+
+    fillMaskFloatBufAndMaskSumBuf(maskBuf,maskFloatBuf,maskSumBuf,usingFP16,batchSize,nnXLen,nnYLen);
+
+    //Don't do any masking if we know the board is exactly the desired size
+    if(requireExactNNLen) {
+      //Set to NULL to signal downstream that this buf doesn't need to be used
+      maskBuf = NULL;
+      maskFloatBuf = NULL;
+      //The global pooling structures need this no matter what, for normalizing based on this and its sqrt.
+      //maskSumBuf = NULL;
+    }
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("Initial bin features"), inputBuf, batchSize, trunk->initialConv->inChannels, nnXLen, nnYLen, inputsUsingNHWC, usingFP16);
+    CudaUtils::debugPrint2D(string("Initial global features"), inputGlobalBuf, batchSize, trunk->initialMatMul->inChannels, usingFP16);
+    if(trunk->sgfMetadataEncoder != nullptr) {
+      assert(inputMetaBuf != NULL);
+      CudaUtils::debugPrint2D(string("Initial meta features"), inputMetaBuf, batchSize, trunk->sgfMetadataEncoder->mul1.inChannels, usingFP16);
+    }
+    #endif
+
+    SizedBuf<void*> trunkBuf(scratch->allocator, scratch->getBufSizeXY(trunk->trunkNumChannels));
+
+    trunk->apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      inputBuf,
+      inputGlobalBuf,
+      inputMetaBuf,
+      maskBuf,
+      maskSumBuf,
+      trunkBuf.buf,
+      workspaceBuf,
+      workspaceBytes
+    );
+    policyHead->apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      maskBuf,
+      maskFloatBuf,
+      maskSumBuf,
+      trunkBuf.buf,
+      policyPassBuf,
+      policyBuf,
+      workspaceBuf,
+      workspaceBytes
+    );
+    valueHead->apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      maskBuf,
+      maskSumBuf,
+      trunkBuf.buf,
+      valueBuf,
+      scoreValueBuf,
+      ownershipBuf,
+      workspaceBuf,
+      workspaceBytes
+    );
+  }
+
+};
+
+
+//------------------------------------------------------------------------------
+
+struct LoadedModel {
+  ModelDesc modelDesc;
+
+  LoadedModel(const string& fileName, const string& expectedSha256) {
+    ModelDesc::loadFromFileMaybeGZipped(fileName,modelDesc,expectedSha256);
+    modelDesc.applyScale8ToReduceActivations();
+  }
+
+  LoadedModel() = delete;
+  LoadedModel(const LoadedModel&) = delete;
+  LoadedModel& operator=(const LoadedModel&) = delete;
+};
+
+LoadedModel* NeuralNet::loadModelFile(const string& file, const string& expectedSha256) {
+  LoadedModel* loadedModel = new LoadedModel(file,expectedSha256);
+  return loadedModel;
+}
+
+void NeuralNet::freeLoadedModel(LoadedModel* loadedModel) {
+  delete loadedModel;
+}
+
+const ModelDesc& NeuralNet::getModelDesc(const LoadedModel* loadedModel) {
+  return loadedModel->modelDesc;
+}
+
+//------------------------------------------------------------------------------
+
+struct Buffers {
+  //All of these are device pointers
+
+  float* inputBufFloat;
+  void* inputBuf;
+  float* inputGlobalBufFloat;
+  void* inputGlobalBuf;
+  float* inputMetaBufFloat;
+  void* inputMetaBuf;
+  size_t inputBufBytesFloat;
+  size_t inputBufBytes;
+  size_t inputGlobalBufBytesFloat;
+  size_t inputGlobalBufBytes;
+  size_t inputMetaBufBytesFloat;
+  size_t inputMetaBufBytes;
+
+  float* policyPassBuf;
+  size_t policyPassBufBytes;
+  float* policyBuf;
+  size_t policyBufBytes;
+
+  float* valueBuf;
+  size_t valueBufBytes;
+  float* scoreValueBuf;
+  size_t scoreValueBufBytes;
+  void* ownershipBuf;
+  size_t ownershipBufBytes;
+
+  void* workspaceBuf;
+  size_t workspaceBytes;
+
+  Buffers() = delete;
+  Buffers(const Buffers&) = delete;
+  Buffers& operator=(const Buffers&) = delete;
+
+  Buffers(CudaHandles* cudaHandles, const Model& m, const ScratchBuffers& scratch) {
+    size_t batchXYFloatBytes = (size_t)scratch.batchXYFloatBytes;
+    size_t batchFloatBytes = (size_t)scratch.batchFloatBytes;
+    size_t batchXYBytes = (size_t)scratch.batchXYBytes;
+    size_t batchBytes = (size_t)scratch.batchBytes;
+
+    inputBufBytesFloat = m.numInputChannels * batchXYFloatBytes;
+    inputBufBytes = m.numInputChannels * batchXYBytes;
+    inputGlobalBufBytesFloat = m.numInputGlobalChannels * batchFloatBytes;
+    inputGlobalBufBytes = m.numInputGlobalChannels * batchBytes;
+    inputMetaBufBytesFloat = m.numInputMetaChannels * batchFloatBytes;
+    inputMetaBufBytes = m.numInputMetaChannels * batchBytes;
+
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputBufFloat), inputBufBytesFloat));
+    CUDA_ERR("Buffers",hipMalloc(&inputBuf, inputBufBytes));
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputGlobalBufFloat), inputGlobalBufBytesFloat));
+    CUDA_ERR("Buffers",hipMalloc(&inputGlobalBuf, inputGlobalBufBytes));
+    if(m.numInputMetaChannels > 0) {
+      CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputMetaBufFloat), inputMetaBufBytesFloat));
+      CUDA_ERR("Buffers",hipMalloc(&inputMetaBuf, inputMetaBufBytes));
+    }
+    else {
+      inputMetaBufFloat = NULL;
+      inputMetaBuf = NULL;
+    }
+
+    if(m.modelVersion >= 16)
+      testAssert(m.policyHead->p2Channels == 4);
+    else if(m.modelVersion >= 12)
+      testAssert(m.policyHead->p2Channels == 2);
+    else
+      testAssert(m.policyHead->p2Channels == 1);
+
+    policyPassBufBytes = m.policyHead->p2Channels * batchFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&policyPassBuf), policyPassBufBytes));
+    policyBufBytes = m.policyHead->p2Channels * batchXYFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&policyBuf), policyBufBytes));
+
+    valueBufBytes = m.valueHead->valueChannels * batchFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&valueBuf), valueBufBytes));
+
+    scoreValueBufBytes = m.valueHead->scoreValueChannels * batchFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&scoreValueBuf), scoreValueBufBytes));
+
+    //This buf is used for both an intermdiate fp16 result in fp16 mode, and ALSO the final fp32 output, so always must be fp32-sized
+    ownershipBufBytes = m.valueHead->ownershipChannels * batchXYFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(&ownershipBuf, ownershipBufBytes));
+
+    //In theory the requiredWorkspaceBytes calls could give us values non-monotone in batch size
+    //such as if the convolution algorithm changes between batch size 1 and larger.
+    //So we call it for all the batch sizes.
+    size_t bytes = 0;
+    size_t b;
+    for(int batchSize = 1; batchSize <= m.maxBatchSize; batchSize++) {
+      b = m.requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+
+    CUDA_ERR("Buffers",hipMalloc(&workspaceBuf, bytes));
+    workspaceBytes = bytes;
+  }
+
+  ~Buffers() {
+    hipFree(inputBufFloat);
+    hipFree(inputBuf);
+    hipFree(inputGlobalBufFloat);
+    hipFree(inputGlobalBuf);
+    if(inputMetaBufFloat != NULL)
+      hipFree(inputMetaBufFloat);
+    if(inputMetaBuf != NULL)
+      hipFree(inputMetaBuf);
+
+    hipFree(policyPassBuf);
+    hipFree(policyBuf);
+
+    hipFree(valueBuf);
+    hipFree(scoreValueBuf);
+    hipFree(ownershipBuf);
+
+    hipFree(workspaceBuf);
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+struct ComputeContext {
+  int nnXLen;
+  int nnYLen;
+  enabled_t useFP16Mode;
+  enabled_t useNHWCMode;
+};
+
+ComputeContext* NeuralNet::createComputeContext(
+  const std::vector<int>& gpuIdxs,
+  Logger* logger,
+  int nnXLen,
+  int nnYLen,
+  const string& openCLTunerFile,
+  const string& homeDataDirOverride,
+  bool openCLReTunePerBoardSize,
+  enabled_t useFP16Mode,
+  enabled_t useNHWCMode,
+  const LoadedModel* loadedModel
+) {
+  (void)gpuIdxs;
+  (void)logger;
+  (void)openCLTunerFile;
+  (void)homeDataDirOverride;
+  (void)openCLReTunePerBoardSize;
+  (void)loadedModel;
+
+  ComputeContext* context = new ComputeContext();
+  context->nnXLen = nnXLen;
+  context->nnYLen = nnYLen;
+  context->useFP16Mode = useFP16Mode;
+  context->useNHWCMode = useNHWCMode;
+  return context;
+}
+
+void NeuralNet::freeComputeContext(ComputeContext* computeContext) {
+  delete computeContext;
+}
+
+//------------------------------------------------------------------------------
+
+struct ComputeHandle {
+  std::unique_ptr<CudaHandles> cudaHandles;
+  std::unique_ptr<Model> model;
+  std::unique_ptr<ScratchBuffers> scratch;
+  std::unique_ptr<Buffers> buffers;
+  const bool usingFP16;
+  const int nnXLen;
+  const int nnYLen;
+  const bool requireExactNNLen;
+  const bool inputsUseNHWC;
+  const bool usingNHWC;
+
+  ComputeHandle(
+    const ComputeContext* context,
+    const LoadedModel* loadedModel,
+    int majorComputeCapability,
+    int minorComputeCapability,
+    int maxBatchSize,
+    bool requireExactNNLen_,
+    bool inputsUseNHWC_,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    usingFP16(useFP16),
+    nnXLen(context->nnXLen),
+    nnYLen(context->nnYLen),
+    requireExactNNLen(requireExactNNLen_),
+    inputsUseNHWC(inputsUseNHWC_),
+    usingNHWC(useNHWC)
+  {
+    cudaHandles = std::make_unique<CudaHandles>(majorComputeCapability,minorComputeCapability);
+    model = std::make_unique<Model>(
+      cudaHandles.get(), &(loadedModel->modelDesc), maxBatchSize,
+      nnXLen, nnYLen, inputsUseNHWC, useFP16, useNHWC
+    );
+    scratch = std::make_unique<ScratchBuffers>(maxBatchSize, nnXLen, nnYLen, useFP16);
+    buffers = std::make_unique<Buffers>(cudaHandles.get(), *model, *scratch);
+
+    //Synchronize after creating buffers and copying all the weights, just in case
+    CUDA_ERR("ComputeHandle", hipDeviceSynchronize());
+  }
+  ~ComputeHandle() {
+  }
+
+  ComputeHandle() = delete;
+  ComputeHandle(const ComputeHandle&) = delete;
+  ComputeHandle& operator=(const ComputeHandle&) = delete;
+};
+
+ComputeHandle* NeuralNet::createComputeHandle(
+  ComputeContext* context,
+  const LoadedModel* loadedModel,
+  Logger* logger,
+  int maxBatchSize,
+  bool requireExactNNLen,
+  bool inputsUseNHWC,
+  int gpuIdxForThisThread,
+  int serverThreadIdx
+) {
+  //Use whatever CUDA believes GPU 0 to be.
+  if(gpuIdxForThisThread == -1)
+    gpuIdxForThisThread = 0;
+
+  CUDA_ERR("createComputeHandle",hipSetDevice(gpuIdxForThisThread));
+
+  hipDeviceProp_t prop;
+  hipGetDeviceProperties(&prop,gpuIdxForThisThread);
+
+  bool useFP16 = false;
+  bool useNHWC = false;
+  //Old GPUs - use FP32 and explicitly fail if FP16 enabled
+  if(prop.major < 5 || (prop.major == 5 && prop.minor < 3)) {
+    if(context->useFP16Mode == enabled_t::True)
+      throw StringError("Cuda device versions below 5.3 do not support useFP16=true");
+    if(context->useNHWCMode == enabled_t::True)
+      useNHWC = true;
+  }
+  //In theory these GPUs support FP16, so allow if the user wants.
+  else if(prop.major < 6) {
+    if(context->useFP16Mode == enabled_t::True)
+      useFP16 = true;
+    if(context->useNHWCMode == enabled_t::True)
+      useNHWC = true;
+  }
+  //On Pascal architecture, default to using FP16 operations
+  //Actually, just use FP32 - there's a risk that on certain cards this might just be a lot worse.
+  //A user manually fine-tuning for performance can just enable it themselves if they know how.
+  else if(prop.major < 7) {
+    if(context->useFP16Mode == enabled_t::True)
+      useFP16 = true;
+    if(context->useNHWCMode == enabled_t::True)
+      useNHWC = true;
+  }
+  //On Volta and higher, use FP16 and NHWC together because we have tensor cores.
+  else {
+    if(context->useFP16Mode == enabled_t::True || context->useFP16Mode == enabled_t::Auto)
+      useFP16 = true;
+    if(context->useNHWCMode == enabled_t::True || (context->useNHWCMode == enabled_t::Auto && useFP16))
+      useNHWC = true;
+  }
+
+  if(logger != NULL) {
+    logger->write(
+      "Cuda backend thread " + Global::intToString(serverThreadIdx) + ": Found GPU " + string(prop.name)
+      + " memory " + Global::uint64ToString(prop.totalGlobalMem)
+      + " compute capability major " + Global::intToString(prop.major)
+      + " minor " + Global::intToString(prop.minor)
+    );
+    logger->write(
+      "Cuda backend thread " + Global::intToString(serverThreadIdx) + ": Model version " + Global::intToString(loadedModel->modelDesc.modelVersion) +
+      " useFP16 = " + Global::boolToString(useFP16) +
+      " useNHWC = " + Global::boolToString(useNHWC)
+    );
+    logger->write(
+      "Cuda backend thread " + Global::intToString(serverThreadIdx) + ": Model name: " + loadedModel->modelDesc.name
+    );
+  }
+
+  ComputeHandle* gpuHandle = new ComputeHandle(
+    context,loadedModel,prop.major,prop.minor,maxBatchSize,requireExactNNLen,inputsUseNHWC,useFP16,useNHWC
+  );
+  return gpuHandle;
+}
+
+void NeuralNet::freeComputeHandle(ComputeHandle* gpuHandle) {
+  delete gpuHandle;
+}
+
+bool NeuralNet::isUsingFP16(const ComputeHandle* handle) {
+  return handle->usingFP16;
+}
+
+//------------------------------------------------------------------------------
+
+void NeuralNet::printDevices() {
+  int numDevices = 0;
+  hipGetDeviceCount(&numDevices);
+  for(int i = 0; i<numDevices; i++) {
+    hipDeviceProp_t prop;
+    hipGetDeviceProperties(&prop, i);
+    cout << "Found CUDA device " << i << ": " << prop.name << endl;
+  }
+}
+
+
+//------------------------------------------------------------------------------
+
+struct InputBuffers {
+  int maxBatchSize;
+
+  size_t singleInputElts;
+  size_t singleInputBytes;
+  size_t singleInputGlobalElts;
+  size_t singleInputGlobalBytes;
+  size_t singleInputMetaElts;
+  size_t singleInputMetaBytes;
+  size_t singlePolicyPassResultElts;
+  size_t singlePolicyPassResultBytes;
+  size_t singlePolicyResultElts;
+  size_t singlePolicyResultBytes;
+  size_t singleValueResultElts;
+  size_t singleValueResultBytes;
+  size_t singleScoreValueResultElts;
+  size_t singleScoreValueResultBytes;
+  size_t singleOwnershipResultElts;
+  size_t singleOwnershipResultBytes;
+
+  size_t userInputBufferBytes;
+  size_t userInputGlobalBufferBytes;
+  size_t userInputMetaBufferBytes;
+  size_t policyPassResultBufferBytes;
+  size_t policyResultBufferBytes;
+  size_t valueResultBufferBytes;
+  size_t scoreValueResultBufferBytes;
+  size_t ownershipResultBufferBytes;
+
+  float* userInputBuffer; //Host pointer
+  float* userInputGlobalBuffer; //Host pointer
+  float* userInputMetaBuffer; //Host pointer
+
+  float* policyPassResults; //Host pointer
+  float* policyResults; //Host pointer
+  float* valueResults; //Host pointer
+  float* scoreValueResults; //Host pointer
+  float* ownershipResults; //Host pointer
+
+  InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen) {
+    const ModelDesc& m = loadedModel->modelDesc;
+
+    maxBatchSize = maxBatchSz;
+    singleInputElts = (size_t)m.numInputChannels * nnXLen * nnYLen;
+    singleInputBytes = (size_t)m.numInputChannels * nnXLen * nnYLen * sizeof(float);
+    singleInputGlobalElts = (size_t)m.numInputGlobalChannels;
+    singleInputGlobalBytes = (size_t)m.numInputGlobalChannels * sizeof(float);
+    singleInputMetaElts = (size_t)m.numInputMetaChannels;
+    singleInputMetaBytes = (size_t)m.numInputMetaChannels * sizeof(float);
+    singlePolicyPassResultElts = (size_t)(m.numPolicyChannels);
+    singlePolicyPassResultBytes = (size_t)(m.numPolicyChannels) * sizeof(float);
+    singlePolicyResultElts = (size_t)(m.numPolicyChannels * nnXLen * nnYLen);
+    singlePolicyResultBytes = (size_t)(m.numPolicyChannels * nnXLen * nnYLen) * sizeof(float);
+    singleValueResultElts = (size_t)m.numValueChannels;
+    singleValueResultBytes = (size_t)m.numValueChannels * sizeof(float);
+    singleScoreValueResultElts = (size_t)m.numScoreValueChannels;
+    singleScoreValueResultBytes = (size_t)m.numScoreValueChannels * sizeof(float);
+    singleOwnershipResultElts = (size_t)m.numOwnershipChannels * nnXLen * nnYLen;
+    singleOwnershipResultBytes = (size_t)m.numOwnershipChannels * nnXLen * nnYLen * sizeof(float);
+
+    assert(NNModelVersion::getNumSpatialFeatures(m.modelVersion) == m.numInputChannels);
+    assert(NNModelVersion::getNumGlobalFeatures(m.modelVersion) == m.numInputGlobalChannels);
+    if(m.numInputMetaChannels > 0) {
+      assert(SGFMetadata::METADATA_INPUT_NUM_CHANNELS == m.numInputMetaChannels);
+    }
+
+    userInputBufferBytes = (size_t)m.numInputChannels * maxBatchSize * nnXLen * nnYLen * sizeof(float);
+    userInputGlobalBufferBytes = (size_t)m.numInputGlobalChannels * maxBatchSize * sizeof(float);
+    userInputMetaBufferBytes = (size_t)m.numInputMetaChannels * maxBatchSize * sizeof(float);
+    policyPassResultBufferBytes = (size_t)maxBatchSize * m.numPolicyChannels * sizeof(float);
+    policyResultBufferBytes = (size_t)maxBatchSize * m.numPolicyChannels * nnXLen * nnYLen * sizeof(float);
+    valueResultBufferBytes = (size_t)maxBatchSize * m.numValueChannels * sizeof(float);
+    scoreValueResultBufferBytes = (size_t)maxBatchSize * m.numScoreValueChannels * sizeof(float);
+    ownershipResultBufferBytes = (size_t)maxBatchSize * nnXLen * nnYLen * m.numOwnershipChannels * sizeof(float);
+
+    userInputBuffer = new float[(size_t)m.numInputChannels * maxBatchSize * nnXLen * nnYLen];
+    userInputGlobalBuffer = new float[(size_t)m.numInputGlobalChannels * maxBatchSize];
+    if(m.numInputMetaChannels > 0)
+      userInputMetaBuffer = new float[(size_t)m.numInputMetaChannels * maxBatchSize];
+    else
+      userInputMetaBuffer = NULL;
+
+    policyPassResults = new float[(size_t)maxBatchSize * m.numPolicyChannels];
+    policyResults = new float[(size_t)maxBatchSize * m.numPolicyChannels * nnXLen * nnYLen];
+    valueResults = new float[(size_t)maxBatchSize * m.numValueChannels];
+
+    scoreValueResults = new float[(size_t)maxBatchSize * m.numScoreValueChannels];
+    ownershipResults = new float[(size_t)maxBatchSize * nnXLen * nnYLen * m.numOwnershipChannels];
+  }
+
+  ~InputBuffers() {
+    delete[] userInputBuffer;
+    delete[] userInputGlobalBuffer;
+    if(userInputMetaBuffer != NULL)
+      delete[] userInputMetaBuffer;
+    delete[] policyPassResults;
+    delete[] policyResults;
+    delete[] valueResults;
+    delete[] scoreValueResults;
+    delete[] ownershipResults;
+  }
+
+  InputBuffers() = delete;
+  InputBuffers(const InputBuffers&) = delete;
+  InputBuffers& operator=(const InputBuffers&) = delete;
+
+};
+
+InputBuffers* NeuralNet::createInputBuffers(const LoadedModel* loadedModel, int maxBatchSize, int nnXLen, int nnYLen) {
+  return new InputBuffers(loadedModel,maxBatchSize,nnXLen,nnYLen);
+}
+void NeuralNet::freeInputBuffers(InputBuffers* inputBuffers) {
+  delete inputBuffers;
+}
+
+//---------------------------------------------------------------------------------------
+
+
+void NeuralNet::getOutput(
+  ComputeHandle* gpuHandle,
+  InputBuffers* inputBuffers,
+  int numBatchEltsFilled,
+  NNResultBuf** inputBufs,
+  vector<NNOutput*>& outputs
+) {
+  assert(numBatchEltsFilled <= inputBuffers->maxBatchSize);
+  assert(numBatchEltsFilled > 0);
+  const int batchSize = numBatchEltsFilled;
+  const int nnXLen = gpuHandle->nnXLen;
+  const int nnYLen = gpuHandle->nnYLen;
+  const int modelVersion = gpuHandle->model->modelVersion;
+
+  const int numSpatialFeatures = NNModelVersion::getNumSpatialFeatures(modelVersion);
+  const int numGlobalFeatures = NNModelVersion::getNumGlobalFeatures(modelVersion);
+  const int numMetaFeatures = inputBuffers->singleInputMetaElts;
+  assert(numSpatialFeatures == gpuHandle->model->numInputChannels);
+  assert(numSpatialFeatures * nnXLen * nnYLen == inputBuffers->singleInputElts);
+  assert(numGlobalFeatures == inputBuffers->singleInputGlobalElts);
+  const int numPolicyChannels = gpuHandle->model->numPolicyChannels;
+
+  for(int nIdx = 0; nIdx<batchSize; nIdx++) {
+    float* rowSpatialInput = inputBuffers->userInputBuffer + (inputBuffers->singleInputElts * nIdx);
+    float* rowGlobalInput = inputBuffers->userInputGlobalBuffer + (inputBuffers->singleInputGlobalElts * nIdx);
+    float* rowMetaInput = inputBuffers->userInputMetaBuffer + (inputBuffers->singleInputMetaElts * nIdx);
+
+    const float* rowGlobal = inputBufs[nIdx]->rowGlobalBuf.data();
+    const float* rowSpatial = inputBufs[nIdx]->rowSpatialBuf.data();
+    const float* rowMeta = inputBufs[nIdx]->rowMetaBuf.data();
+    bool hasRowMeta = inputBufs[nIdx]->hasRowMeta;
+    std::copy(rowGlobal,rowGlobal+numGlobalFeatures,rowGlobalInput);
+    if(numMetaFeatures > 0) {
+      testAssert(rowMeta != NULL);
+      testAssert(hasRowMeta);
+      std::copy(rowMeta,rowMeta+numMetaFeatures,rowMetaInput);
+    }
+    else {
+      testAssert(!hasRowMeta);
+    }
+    SymmetryHelpers::copyInputsWithSymmetry(rowSpatial, rowSpatialInput, 1, nnYLen, nnXLen, numSpatialFeatures, gpuHandle->inputsUseNHWC, inputBufs[nIdx]->symmetry);
+  }
+
+  Buffers* buffers = gpuHandle->buffers.get();
+  ScratchBuffers* scratch = gpuHandle->scratch.get();
+
+  if(!gpuHandle->usingFP16) {
+    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytes);
+    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytes);
+    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytes);
+    assert(inputBuffers->policyPassResultBufferBytes == buffers->policyPassBufBytes);
+    assert(inputBuffers->policyResultBufferBytes == buffers->policyBufBytes);
+    assert(inputBuffers->valueResultBufferBytes == buffers->valueBufBytes);
+    assert(inputBuffers->singleInputBytes == inputBuffers->singleInputElts*4);
+    assert(inputBuffers->singleInputGlobalBytes == inputBuffers->singleInputGlobalElts*4);
+    assert(inputBuffers->singleInputMetaBytes == inputBuffers->singleInputMetaElts*4);
+    assert(inputBuffers->singlePolicyPassResultElts == numPolicyChannels);
+    assert(inputBuffers->singlePolicyPassResultBytes == numPolicyChannels * sizeof(float));
+    assert(inputBuffers->singlePolicyResultElts == numPolicyChannels*nnXLen*nnYLen);
+    assert(inputBuffers->singlePolicyResultBytes == numPolicyChannels*nnXLen*nnYLen * sizeof(float));
+    assert(inputBuffers->scoreValueResultBufferBytes == buffers->scoreValueBufBytes);
+    assert(inputBuffers->ownershipResultBufferBytes == buffers->ownershipBufBytes);
+    assert(inputBuffers->singleOwnershipResultElts == nnXLen*nnYLen);
+    assert(inputBuffers->singleOwnershipResultBytes == nnXLen*nnYLen * sizeof(float));
+
+    CUDA_ERR("getOutput",hipMemcpy(buffers->inputBuf, inputBuffers->userInputBuffer, inputBuffers->singleInputBytes*batchSize, hipMemcpyHostToDevice));
+    CUDA_ERR("getOutput",hipMemcpy(buffers->inputGlobalBuf, inputBuffers->userInputGlobalBuffer, inputBuffers->singleInputGlobalBytes*batchSize, hipMemcpyHostToDevice));
+    if(numMetaFeatures > 0) {
+      CUDA_ERR("getOutput",hipMemcpy(buffers->inputMetaBuf, inputBuffers->userInputMetaBuffer, inputBuffers->singleInputMetaBytes*batchSize, hipMemcpyHostToDevice));
+    }
+  }
+  else {
+    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytesFloat);
+    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytesFloat);
+    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytesFloat);
+    assert(inputBuffers->policyResultBufferBytes == buffers->policyBufBytes);
+    assert(inputBuffers->valueResultBufferBytes == buffers->valueBufBytes);
+    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytes*2);
+    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytes*2);
+    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytes*2);
+    assert(inputBuffers->singleInputBytes == inputBuffers->singleInputElts*4);
+    assert(inputBuffers->singleInputGlobalBytes == inputBuffers->singleInputGlobalElts*4);
+    assert(inputBuffers->singleInputMetaBytes == inputBuffers->singleInputMetaElts*4);
+    assert(inputBuffers->singlePolicyPassResultElts == numPolicyChannels);
+    assert(inputBuffers->singlePolicyPassResultBytes == numPolicyChannels * sizeof(float));
+    assert(inputBuffers->singlePolicyResultElts == numPolicyChannels*nnXLen*nnYLen);
+    assert(inputBuffers->singlePolicyResultBytes == numPolicyChannels*nnXLen*nnYLen * sizeof(float));
+    assert(inputBuffers->scoreValueResultBufferBytes == buffers->scoreValueBufBytes);
+    assert(inputBuffers->ownershipResultBufferBytes == buffers->ownershipBufBytes);
+    assert(inputBuffers->singleOwnershipResultElts == nnXLen*nnYLen);
+    assert(inputBuffers->singleOwnershipResultBytes == nnXLen*nnYLen * sizeof(float));
+
+    CUDA_ERR("getOutput",hipMemcpy(buffers->inputBufFloat, inputBuffers->userInputBuffer, inputBuffers->singleInputBytes*batchSize, hipMemcpyHostToDevice));
+    CUDA_ERR("getOutput",hipMemcpy(buffers->inputGlobalBufFloat, inputBuffers->userInputGlobalBuffer, inputBuffers->singleInputGlobalBytes*batchSize, hipMemcpyHostToDevice));
+    if(numMetaFeatures > 0) {
+      CUDA_ERR("getOutput",hipMemcpy(buffers->inputMetaBufFloat, inputBuffers->userInputMetaBuffer, inputBuffers->singleInputMetaBytes*batchSize, hipMemcpyHostToDevice));
+    }
+
+    customCudaCopyToHalf((const float*)buffers->inputBufFloat,(half*)buffers->inputBuf,inputBuffers->singleInputElts*batchSize);
+    CUDA_ERR("getOutput",hipPeekAtLastError());
+    customCudaCopyToHalf((const float*)buffers->inputGlobalBufFloat,(half*)buffers->inputGlobalBuf,inputBuffers->singleInputGlobalElts*batchSize);
+    CUDA_ERR("getOutput",hipPeekAtLastError());
+    if(numMetaFeatures > 0) {
+      customCudaCopyToHalf((const float*)buffers->inputMetaBufFloat,(half*)buffers->inputMetaBuf,inputBuffers->singleInputMetaElts*batchSize);
+      CUDA_ERR("getOutput",hipPeekAtLastError());
+    }
+  }
+
+  gpuHandle->model->apply(
+    gpuHandle->cudaHandles.get(),
+    scratch,
+    batchSize,
+    gpuHandle->requireExactNNLen,
+
+    buffers->inputBuf,
+    buffers->inputGlobalBuf,
+    buffers->inputMetaBuf,
+
+    buffers->policyPassBuf,
+    buffers->policyBuf,
+
+    buffers->valueBuf,
+    buffers->scoreValueBuf,
+    buffers->ownershipBuf,
+
+    buffers->workspaceBuf,
+    buffers->workspaceBytes
+  );
+
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->policyPassResults, buffers->policyPassBuf, inputBuffers->singlePolicyPassResultBytes*batchSize, hipMemcpyDeviceToHost));
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->policyResults, buffers->policyBuf, inputBuffers->singlePolicyResultBytes*batchSize, hipMemcpyDeviceToHost));
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->valueResults, buffers->valueBuf, inputBuffers->singleValueResultBytes*batchSize, hipMemcpyDeviceToHost));
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->scoreValueResults, buffers->scoreValueBuf, inputBuffers->singleScoreValueResultBytes*batchSize, hipMemcpyDeviceToHost));
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->ownershipResults, buffers->ownershipBuf, inputBuffers->singleOwnershipResultBytes*batchSize, hipMemcpyDeviceToHost));
+
+  assert(outputs.size() == batchSize);
+
+  float policyProbsTmp[NNPos::MAX_NN_POLICY_SIZE];
+
+  for(int row = 0; row < batchSize; row++) {
+    NNOutput* output = outputs[row];
+    assert(output->nnXLen == nnXLen);
+    assert(output->nnYLen == nnYLen);
+    float policyOptimism = (float)inputBufs[row]->policyOptimism;
+
+    const float* policyPassSrcBuf = inputBuffers->policyPassResults + row * numPolicyChannels;
+    const float* policySrcBuf = inputBuffers->policyResults + row * numPolicyChannels * nnXLen * nnYLen;
+    float* policyProbs = output->policyProbs;
+
+    // These are in logits, the client does the postprocessing to turn them into
+    // policy probabilities and white game outcome probabilities
+    // Also we don't fill in the nnHash here either
+    // Handle version >= 12 policy optimism
+    if(numPolicyChannels == 2 || (numPolicyChannels == 4 && modelVersion >= 16)) {
+       if(gpuHandle->usingNHWC) {
+        for(int i = 0; i<nnXLen*nnYLen; i++) {
+          float p = policySrcBuf[i*numPolicyChannels];
+          float pOpt = policySrcBuf[i*numPolicyChannels+1];
+          policyProbsTmp[i] = p + (pOpt-p) * policyOptimism;
+        }
+        SymmetryHelpers::copyOutputsWithSymmetry(policyProbsTmp, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+        policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0] + (policyPassSrcBuf[1] - policyPassSrcBuf[0]) * policyOptimism;
+      }
+      else {
+        for(int i = 0; i<nnXLen*nnYLen; i++) {
+          float p = policySrcBuf[i];
+          float pOpt = policySrcBuf[i+nnXLen*nnYLen];
+          policyProbsTmp[i] = p + (pOpt-p) * policyOptimism;
+        }
+        SymmetryHelpers::copyOutputsWithSymmetry(policyProbsTmp, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+        policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0] + (policyPassSrcBuf[1] - policyPassSrcBuf[0]) * policyOptimism;
+      }
+    }
+    else {
+      assert(numPolicyChannels == 1);
+      SymmetryHelpers::copyOutputsWithSymmetry(policySrcBuf, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+      policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0];
+    }
+
+    int numValueChannels = gpuHandle->model->numValueChannels;
+    assert(numValueChannels == 3);
+    output->whiteWinProb = inputBuffers->valueResults[row * numValueChannels];
+    output->whiteLossProb = inputBuffers->valueResults[row * numValueChannels + 1];
+    output->whiteNoResultProb = inputBuffers->valueResults[row * numValueChannels + 2];
+
+    //As above, these are NOT actually from white's perspective, but rather the player to move.
+    //As usual the client does the postprocessing.
+    if(output->whiteOwnerMap != NULL) {
+      const float* ownershipSrcBuf = inputBuffers->ownershipResults + row * nnXLen * nnYLen;
+      assert(gpuHandle->model->numOwnershipChannels == 1);
+      SymmetryHelpers::copyOutputsWithSymmetry(ownershipSrcBuf, output->whiteOwnerMap, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+    }
+
+    if(modelVersion >= 9) {
+      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
+      assert(numScoreValueChannels == 6);
+      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
+      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
+      output->whiteLead = inputBuffers->scoreValueResults[row * numScoreValueChannels + 2];
+      output->varTimeLeft = inputBuffers->scoreValueResults[row * numScoreValueChannels + 3];
+      output->shorttermWinlossError = inputBuffers->scoreValueResults[row * numScoreValueChannels + 4];
+      output->shorttermScoreError = inputBuffers->scoreValueResults[row * numScoreValueChannels + 5];
+    }
+    else if(modelVersion >= 8) {
+      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
+      assert(numScoreValueChannels == 4);
+      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
+      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
+      output->whiteLead = inputBuffers->scoreValueResults[row * numScoreValueChannels + 2];
+      output->varTimeLeft = inputBuffers->scoreValueResults[row * numScoreValueChannels + 3];
+      output->shorttermWinlossError = 0;
+      output->shorttermScoreError = 0;
+    }
+    else if(modelVersion >= 4) {
+      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
+      assert(numScoreValueChannels == 2);
+      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
+      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
+      output->whiteLead = output->whiteScoreMean;
+      output->varTimeLeft = 0;
+      output->shorttermWinlossError = 0;
+      output->shorttermScoreError = 0;
+    }
+    else if(modelVersion >= 3) {
+      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
+      assert(numScoreValueChannels == 1);
+      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
+      //Version 3 neural nets don't have any second moment output, implicitly already folding it in, so we just use the mean squared
+      output->whiteScoreMeanSq = output->whiteScoreMean * output->whiteScoreMean;
+      output->whiteLead = output->whiteScoreMean;
+      output->varTimeLeft = 0;
+      output->shorttermWinlossError = 0;
+      output->shorttermScoreError = 0;
+    }
+    else {
+      ASSERT_UNREACHABLE;
+    }
+  }
+
+}
+
+//TESTING ----------------------------------------------------------------------------------
+
+
+bool NeuralNet::testEvaluateConv(
+  const ConvLayerDesc* desc,
+  int desiredBatchSize,
+  int nnXLen,
+  int nnYLen,
+  bool useFP16,
+  bool useNHWC,
+  const vector<float>& inputBuffer,
+  vector<float>& outputBuffer
+) {
+  hipDeviceSynchronize();
+  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
+
+  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->inChannels;
+  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->outChannels;
+  if(numInputFloats != inputBuffer.size())
+    throw StringError("testEvaluateConv: unexpected input buffer size");
+
+  void* deviceInput;
+  void* deviceOutput;
+  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
+  CudaUtils::mallocOnDevice("deviceOutput", numOutputFloats, deviceOutput, useFP16);
+
+  int maxBatchSize = desiredBatchSize;
+
+  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
+  ConvLayer* convLayer = new ConvLayer(cudaHandles,manager,desc,useFP16,useNHWC);
+
+  size_t workspaceBytes =
+    convLayer->requiredWorkspaceBytes(cudaHandles,desiredBatchSize);
+  void* deviceWorkspace;
+  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
+
+
+  bool accumulate = false;
+  convLayer->apply(
+    cudaHandles,
+    desiredBatchSize,
+    accumulate,
+    deviceInput,
+    deviceOutput,
+    deviceWorkspace,
+    workspaceBytes
+  );
+
+  outputBuffer.resize(numOutputFloats);
+  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceOutput, useFP16);
+
+  hipFree(deviceWorkspace);
+
+  delete convLayer;
+  delete manager;
+  hipFree(deviceInput);
+  hipFree(deviceOutput);
+  delete cudaHandles;
+
+  return true;
+}
+
+
+bool NeuralNet::testEvaluateBatchNorm(
+  const BatchNormLayerDesc* desc,
+  int desiredBatchSize,
+  int nnXLen,
+  int nnYLen,
+  bool useFP16,
+  bool useNHWC,
+  const vector<float>& inputBuffer,
+  const vector<float>& maskBuffer,
+  vector<float>& outputBuffer
+) {
+  hipDeviceSynchronize();
+  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
+
+  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->numChannels;
+  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
+  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->numChannels;
+  if(numInputFloats != inputBuffer.size())
+    throw StringError("testEvaluateBatchNorm: unexpected input buffer size");
+  if(numMaskFloats != maskBuffer.size())
+    throw StringError("testEvaluateBatchNorm: unexpected mask buffer size");
+
+  ActivationLayerDesc actDesc;
+  actDesc.activation = ACTIVATION_IDENTITY;
+
+  void* deviceInput;
+  void* deviceMask;
+  void* deviceOutput;
+  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
+  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
+  CudaUtils::mallocOnDevice("deviceOutput", numOutputFloats, deviceOutput, useFP16);
+
+  BatchNormLayer* batchNormLayer = new BatchNormLayer(cudaHandles,desc,&actDesc,nnXLen,nnYLen,useFP16,useNHWC);
+
+  batchNormLayer->apply(
+    cudaHandles,
+    desiredBatchSize,
+    deviceInput,
+    deviceMask,
+    deviceOutput
+  );
+
+  outputBuffer.resize(numOutputFloats);
+  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceOutput, useFP16);
+
+  delete batchNormLayer;
+
+  hipFree(deviceInput);
+  hipFree(deviceMask);
+  hipFree(deviceOutput);
+  delete cudaHandles;
+
+  return true;
+}
+
+
+bool NeuralNet::testEvaluateResidualBlock(
+  const ResidualBlockDesc* desc,
+  int desiredBatchSize,
+  int nnXLen,
+  int nnYLen,
+  bool useFP16,
+  bool useNHWC,
+  const vector<float>& inputBuffer,
+  const vector<float>& maskBuffer,
+  vector<float>& outputBuffer
+) {
+  hipDeviceSynchronize();
+  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
+
+  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->preBN.numChannels;
+  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
+  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->finalConv.outChannels;
+  if(numInputFloats != inputBuffer.size())
+    throw StringError("testEvaluateResidualBlock: unexpected input buffer size");
+  if(numMaskFloats != maskBuffer.size())
+    throw StringError("testEvaluateResidualBlock: unexpected mask buffer size");
+
+  ScratchBuffers* scratch = new ScratchBuffers(desiredBatchSize, nnXLen, nnYLen, useFP16);
+
+  void* deviceInput;
+  void* deviceMask;
+  void* deviceScratch;
+  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
+  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
+  CudaUtils::mallocOnDevice("deviceScratch", numInputFloats, deviceScratch, useFP16);
+
+  int maxBatchSize = desiredBatchSize;
+
+  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
+  ResidualBlock* residualBlock = new ResidualBlock(cudaHandles,manager,desc,nnXLen,nnYLen,useFP16,useNHWC);
+
+  size_t workspaceBytes =
+    residualBlock->requiredWorkspaceBytes(cudaHandles,desiredBatchSize);
+  void* deviceWorkspace;
+  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
+
+  residualBlock->apply(
+    cudaHandles,
+    scratch,
+    desiredBatchSize,
+    deviceInput,
+    deviceScratch,
+    deviceMask,
+    deviceWorkspace,
+    workspaceBytes
+  );
+
+  outputBuffer.resize(numOutputFloats);
+  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceInput, useFP16);
+
+  hipFree(deviceWorkspace);
+
+  delete residualBlock;
+  delete manager;
+  hipFree(deviceInput);
+  hipFree(deviceMask);
+  hipFree(deviceScratch);
+  delete scratch;
+  delete cudaHandles;
+
+  return true;
+}
+
+bool NeuralNet::testEvaluateGlobalPoolingResidualBlock(
+  const GlobalPoolingResidualBlockDesc* desc,
+  int desiredBatchSize,
+  int nnXLen,
+  int nnYLen,
+  bool useFP16,
+  bool useNHWC,
+  const vector<float>& inputBuffer,
+  const vector<float>& maskBuffer,
+  vector<float>& outputBuffer
+) {
+  hipDeviceSynchronize();
+  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
+
+  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->preBN.numChannels;
+  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
+  size_t numMaskSumFloats = (size_t)desiredBatchSize;
+  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->finalConv.outChannels;
+
+  if(numInputFloats != inputBuffer.size())
+    throw StringError("testEvaluateGlobalPoolingResidualBlock: unexpected input buffer size");
+  if(numMaskFloats != maskBuffer.size())
+    throw StringError("testEvaluateGlobalPoolingResidualBlock: unexpected mask buffer size");
+
+  ScratchBuffers* scratch = new ScratchBuffers(desiredBatchSize, nnXLen, nnYLen, useFP16);
+
+  void* deviceInput;
+  void* deviceMask;
+  float* deviceMaskFloatOrig;
+  float* deviceMaskFloat;
+  float* deviceMaskSum;
+  void* deviceScratch;
+
+  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
+  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
+  CUDA_ERR("deviceMaskFloat",hipMalloc(reinterpret_cast<void**>(&deviceMaskFloat), numMaskFloats * sizeof(float)));
+  CUDA_ERR("deviceMaskSum",hipMalloc(reinterpret_cast<void**>(&deviceMaskSum), numMaskSumFloats * sizeof(float)));
+  deviceMaskFloatOrig = deviceMaskFloat;
+  CudaUtils::mallocOnDevice("deviceScratch", numInputFloats, deviceScratch, useFP16);
+
+  fillMaskFloatBufAndMaskSumBuf(deviceMask, deviceMaskFloat, deviceMaskSum, useFP16, desiredBatchSize, nnXLen, nnYLen);
+
+  int maxBatchSize = desiredBatchSize;
+
+  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
+  GlobalPoolingResidualBlock* residualBlock = new GlobalPoolingResidualBlock(
+    cudaHandles,manager,desc,nnXLen,nnYLen,useFP16,useNHWC
+  );
+
+  size_t workspaceBytes =
+    residualBlock->requiredWorkspaceBytes(
+      cudaHandles,desiredBatchSize
+    );
+
+  void* deviceWorkspace;
+  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
+
+  residualBlock->apply(
+    cudaHandles,
+    scratch,
+    desiredBatchSize,
+    deviceInput,
+    deviceScratch,
+    deviceMask,
+    deviceMaskSum,
+    deviceWorkspace,
+    workspaceBytes
+  );
+
+  outputBuffer.resize(numOutputFloats);
+  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceInput, useFP16);
+
+  hipFree(deviceWorkspace);
+
+  delete residualBlock;
+  delete manager;
+
+  hipFree(deviceInput);
+  hipFree(deviceMask);
+  hipFree(deviceMaskFloatOrig);
+  hipFree(deviceMaskSum);
+  hipFree(deviceScratch);
+  delete scratch;
+  delete cudaHandles;
+
+  return true;
+}
+
+
+#endif  // USE_ROCM_BACKEND
diff --git a/cpp/neuralnet/rocmerrorcheck.h b/cpp/neuralnet/rocmerrorcheck.h
new file mode 100644
index 000000000..049f1ae95
--- /dev/null
+++ b/cpp/neuralnet/rocmerrorcheck.h
@@ -0,0 +1,59 @@
+#ifndef NEURALNET_ROCMERRORCHECK_H_
+#define NEURALNET_ROCMERRORCHECK_H_
+
+#include "../neuralnet/rocmincludes.h"
+#include "../core/global.h"
+
+// ---------- HIP runtime ----------
+static inline void checkCudaError(hipError_t status,
+                                 const char* opName,
+                                 const char* file,
+                                 const char* func,
+                                 int line) {
+  if(status != hipSuccess)
+    throw StringError(std::string("HIP Error @") + opName + " " +
+                      file + ":" + func + ":" + Global::intToString(line) +
+                      " : " + cudaGetErrorString(status));
+}
+#define CUDA_ERR(opName,x)   checkCudaError((x),opName,__FILE__,#x,__LINE__)
+
+// ---------- hipBLAS ----------
+static inline const char* cublasGetErrorString(hipblasStatus_t s) {
+  switch(s) {
+    case HIPBLAS_STATUS_SUCCESS:          return "HIPBLAS_STATUS_SUCCESS";
+    case HIPBLAS_STATUS_ALLOC_FAILED:     return "HIPBLAS_STATUS_ALLOC_FAILED";
+    case HIPBLAS_STATUS_MAPPING_ERROR:    return "HIPBLAS_STATUS_MAPPING_ERROR";
+    case HIPBLAS_STATUS_EXECUTION_FAILED: return "HIPBLAS_STATUS_EXECUTION_FAILED";
+    case HIPBLAS_STATUS_INTERNAL_ERROR:   return "HIPBLAS_STATUS_INTERNAL_ERROR";
+    case HIPBLAS_STATUS_INVALID_VALUE:    return "HIPBLAS_STATUS_INVALID_VALUE";
+    case HIPBLAS_STATUS_NOT_INITIALIZED:  return "HIPBLAS_STATUS_NOT_INITIALIZED";
+    case HIPBLAS_STATUS_NOT_SUPPORTED:    return "HIPBLAS_STATUS_NOT_SUPPORTED";
+    default:                              return "HIPBLAS_STATUS_UNKNOWN";
+  }
+}
+static inline void checkCublasError(hipblasStatus_t status,
+                                     const char* opName,
+                                     const char* file,
+                                     const char* func,
+                                     int line) {
+  if(status != HIPBLAS_STATUS_SUCCESS)
+    throw StringError(std::string("hipBLAS Error @") + opName + " " +
+                      file + ":" + func + ":" + Global::intToString(line) +
+                      " : " + cublasGetErrorString(status));
+}
+#define CUBLAS_ERR(opName,x) checkCublasError((x),opName,__FILE__,#x,__LINE__)
+
+// ---------- MIOpen ----------
+static inline void checkCudnnError(miopenStatus_t status,
+                                    const char* opName,
+                                    const char* file,
+                                    const char* func,
+                                    int line) {
+  if(status != miopenStatusSuccess)
+    throw StringError(std::string("MIOpen Error @") + opName + " " +
+                      file + ":" + func + ":" + Global::intToString(line) +
+                      " : " + cudnnGetErrorString(status));
+}
+#define CUDNN_ERR(opName,x) checkCudnnError((x),opName,__FILE__,#x,__LINE__)
+
+#endif // NEURALNET_ROCMERRORCHECK_H_
diff --git a/cpp/neuralnet/rocmhelpers.h b/cpp/neuralnet/rocmhelpers.h
new file mode 100644
index 000000000..215b1e9fd
--- /dev/null
+++ b/cpp/neuralnet/rocmhelpers.h
@@ -0,0 +1,60 @@
+#include "hip/hip_runtime.h"
+#ifndef NEURALNET_ROCMHELPERS_H_
+#define NEURALNET_ROCMHELPERS_H_
+
+#include "../neuralnet/rocmincludes.h"
+#include "../neuralnet/activations.h"
+
+//Given two tensors with shapes inA: [n,cA,h,w] and inB: [n,cB,h,w], that are on the GPU
+//Copy them into a single tensor out: [n,cA+cB,h,w] that is also allocated on the gpu
+void customCudaChannelConcat(const float* inA, const float* inB, float* out, int chwA, int chwB, int n);
+void customCudaChannelConcat(const half* inA, const half* inB, half* out, int chwA, int chwB, int n);
+
+//Given a tensor [n,c,hw], extract out channel 0 to [n,hw]
+void customCudaChannel0ExtractNCHW(const float* in, float* out, int n, int c, int hw);
+void customCudaChannel0ExtractNCHW(const half* in, half* out, int n, int c, int hw);
+//Given a tensor [n,hw,c], extract out channel 0 to [n,hw]
+void customCudaChannel0ExtractNHWC(const float* in, float* out, int n, int hw, int c);
+void customCudaChannel0ExtractNHWC(const half* in, half* out, int n, int hw, int c);
+
+//Given an input tensor and an output buffer of shape [n,c], fill output buffer with sum or max over c.
+void customCudaPoolRowsSumNCHW(const float* in, float* out, int nSize, int cSize, int xySize, float scaleSum);
+void customCudaPoolRowsSumNHWC(const float* in, float* out, int nSize, int xySize, int cSize, float scaleSum);
+
+//Specialized operations for value head and general global pooling. Same as the other pooling, but fusedly fills
+//an output buffer of shape [n,c*3].
+void customCudaValueHeadPoolNCHW(const float* in, float* out, int nSize, int cSize, int xySize, const float* maskSum);
+void customCudaValueHeadPoolNHWC(const float* in, float* out, int nSize, int xySize, int cSize, const float* maskSum);
+void customCudaPoolRowsGPoolNCHW(const float* in, float* out, int nSize, int cSize, int xySize, const float* mask, const float* maskSum);
+void customCudaPoolRowsGPoolNHWC(const float* in, float* out, int nSize, int xySize, int cSize, const float* mask, const float* maskSum);
+void customCudaPoolRowsGPoolNCHW(const half* in, half* out, int nSize, int cSize, int xySize, const half* mask, const float* maskSum);
+void customCudaPoolRowsGPoolNHWC(const half* in, half* out, int nSize, int xySize, int cSize, const half* mask, const float* maskSum);
+
+void customCudaCopyToHalf(const float* in, half* out, int n);
+void customCudaCopyFromHalf(const half* in, float* out, int n);
+
+//Given a tensor, add another tensor to it.
+void customCudaAddTensorInplace(half* buf, const half* biases, int n);
+//Given an input with shape [n,c] and biases of shape [c], add the biases in-place.
+void customCudaAddCBiasInplaceNC(float* buf, const float* biases, int n, int c, int activation);
+void customCudaAddCBiasInplaceNC(half* buf, const half* biases, int n, int c, int activation);
+//Given an input with shape [n,c,xy] and biases of shape [n,c], add the biases in-place.
+void customCudaAddNCBiasInplaceNCHW(float *buf, const float* biases, int nSize, int cSize, int xySize);
+void customCudaAddNCBiasInplaceNCHW(half *buf, const half* biases, int nSize, int cSize, int xySize);
+//Given an input with shape [n,xy,c] and biases of shape [n,c], add the biases in-place.
+void customCudaAddNCBiasInplaceNHWC(float *buf, const float* biases, int nSize, int xySize, int cSize);
+void customCudaAddNCBiasInplaceNHWC(half *buf, const half* biases, int nSize, int xySize, int cSize);
+
+//Given an input with shape [n,c,xy] and scale and biases of shape [c], multiply by scale and add the biases
+//Optionally also apply an activation.
+//Optionally also multiply by mask (can be null), with shape [n,xy]
+void customCudaApplyCScaleBiasNCHW(const float* in, float* out, const float* scale, const float* biases, const float* mask, int n, int c, int xy, int activation);
+void customCudaApplyCScaleBiasNCHW(const half* in, half* out, const half* scale, const half* biases, const half* mask, int n, int c, int xy, int activation);
+//Given an input with shape [n,xy,c] and scale and biases of shape [c], multiply by scale and add the biases
+//Optionally also apply relu.
+//Optionally also multiply by mask (can be null), with shape [n,xy]
+void customCudaApplyCScaleBiasNHWC(const float* in, float* out, const float* scale, const float* biases, const float* mask, int n, int xy, int c, int activation);
+void customCudaApplyCScaleBiasNHWC(const half* in, half* out, const half* scale, const half* biases, const half* mask, int n, int xy, int c, int activation);
+
+
+#endif  // NEURALNET_ROCMHELPERS_H_
diff --git a/cpp/neuralnet/rocmhelpers.hip b/cpp/neuralnet/rocmhelpers.hip
new file mode 100644
index 000000000..2f9b94951
--- /dev/null
+++ b/cpp/neuralnet/rocmhelpers.hip
@@ -0,0 +1,1905 @@
+#include "hip/hip_runtime.h"
+
+#include "../neuralnet/rocmhelpers.h"
+
+#include <stdexcept>
+
+#if defined(__HIP_ARCH_HAS_FP16__) || (defined(__HIP_DEVICE_COMPILE__) && (__HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || ...))
+#define HIP_SUPPORTS_FP16
+#endif
+
+//TODO maybe tune this number, it varies by GPU
+static const int targetNumThreads = 512;
+
+void splitThreadsAcrossDim01(int dim0Size, int dim1Size, int& threads0, int& blocks0, int& threads1, int& blocks1) {
+  if(dim0Size > targetNumThreads) {
+    threads0 = targetNumThreads/2;
+    blocks0 = (dim0Size + threads0 - 1) / threads0;
+    threads1 = 1;
+    blocks1 = dim1Size;
+  }
+  else if(dim0Size > targetNumThreads/2) {
+    threads0 = dim0Size;
+    blocks0 = 1;
+    threads1 = 1;
+    blocks1 = dim1Size;
+  }
+  else {
+    threads0 = dim0Size;
+    blocks0 = 1;
+    threads1 = targetNumThreads / dim0Size;
+    blocks1 = (dim1Size + threads1 - 1) / threads1;
+  }
+}
+
+__forceinline__ __device__ float mishf(float a) {
+  return a * tanhf(a < 20.0f ? log1pf(expf(a)) : a);
+}
+__forceinline__ __device__ float mishf_scale8(float a) {
+  return a < 2.5f ? a * tanhf(log1pf(expf(a*8.0f))) : a;
+}
+
+#ifdef HIP_SUPPORTS_FP16
+__forceinline__ __device__ half mishh(half h) {
+  float a = __half2float(h);
+  return __float2half(a * tanhf(a < 20.0f ? log1pf(expf(a)) : a));
+}
+__forceinline__ __device__ half mishh_scale8(half h) {
+  float a = __half2float(h);
+  return __float2half(a < 2.5f ? a * tanhf(log1pf(expf(a*8.0f))) : a);
+}
+#endif
+
+//--------------------------------------------------------------------------------------------------------------
+
+template <typename T>
+__global__
+void channelConcatKernel(
+  const T* inA,
+  const T* inB,
+  T* out,
+  int chwA,
+  int chwB,
+  int numBlocksA,
+  int numBlocksB,
+  int n
+) {
+  if(blockIdx.x < numBlocksA) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if(index < chwA) {
+      int nchwA = n*chwA;
+      int chwOut = (chwA+chwB);
+
+      int aIdx = index;
+      int outIdx = index;
+      while(aIdx < nchwA) {
+        out[outIdx] = inA[aIdx];
+        aIdx += chwA;
+        outIdx += chwOut;
+      }
+    }
+  }
+  else {
+    int index = (blockIdx.x - numBlocksA) * blockDim.x + threadIdx.x;
+    if(index < chwB) {
+      int nchwB = n*chwB;
+      int chwOut = (chwA+chwB);
+
+      int bIdx = index;
+      int outIdx = chwA+index;
+      while(bIdx < nchwB) {
+        out[outIdx] = inB[bIdx];
+        bIdx += chwB;
+        outIdx += chwOut;
+      }
+    }
+  }
+}
+
+template <typename T>
+void customCudaChannelConcatTemplate(const T* inA, const T* inB, T* out, int chwA, int chwB, int n) {
+  int blockSize = targetNumThreads;
+  int numBlocksA = (chwA + blockSize-1) / blockSize;
+  int numBlocksB = (chwB + blockSize-1) / blockSize;
+  int numBlocks = numBlocksA + numBlocksB;
+  channelConcatKernel<<<numBlocks, blockSize>>>(inA,inB,out,chwA,chwB,numBlocksA,numBlocksB,n);
+}
+template void customCudaChannelConcatTemplate<float>(const float* inA, const float* inB, float* out, int chwA, int chwB, int n);
+template void customCudaChannelConcatTemplate<half>(const half* inA, const half* inB, half* out, int chwA, int chwB, int n);
+
+void customCudaChannelConcat(const float* inA, const float* inB, float* out, int chwA, int chwB, int n) {
+  customCudaChannelConcatTemplate<float>(inA,inB,out,chwA,chwB,n);
+}
+void customCudaChannelConcat(const half* inA, const half* inB, half* out, int chwA, int chwB, int n) {
+  customCudaChannelConcatTemplate<half>(inA,inB,out,chwA,chwB,n);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+template <typename T>
+__global__
+void extractChannel0KernelNHWC(const T *in, T* out, int nhwSize, int cSize)
+{
+  int nhwIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if(nhwIdx < nhwSize) {
+    out[nhwIdx] = in[nhwIdx*cSize];
+  }
+}
+template <typename T>
+void customCudaChannel0ExtractNHWCTemplate(const T *in, T* out, int n, int hw, int c) {
+  int nhw = n*hw;
+  int blockSize = targetNumThreads;
+  int numBlocks = (nhw+blockSize-1)/blockSize;
+  extractChannel0KernelNHWC<<<numBlocks,blockSize>>>(in,out,nhw,c);
+}
+
+template <typename T>
+__global__
+void extractChannel0KernelNCHW(const T *in, T* out, int nSize, int cSize, int hwSize)
+{
+  int hwIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(hwIdx < hwSize && nIdx < nSize) {
+    out[nIdx * hwSize + hwIdx] = in[nIdx * cSize * hwSize + hwIdx];
+  }
+}
+template <typename T>
+void customCudaChannel0ExtractNCHWTemplate(const T *in, T* out, int nSize, int cSize, int hwSize) {
+  int hwThreads;
+  int hwBlocks;
+  int nThreads;
+  int nBlocks;
+  splitThreadsAcrossDim01(hwSize, nSize, hwThreads, hwBlocks, nThreads, nBlocks);
+
+  if(nBlocks > 65536)
+    throw std::runtime_error("customCudaChannel0ExtractNCHW: nSize too large given hwSize");
+
+  dim3 grid(hwBlocks,nBlocks,1);
+  dim3 threads(hwThreads,nThreads,1);
+  extractChannel0KernelNCHW<<<grid,threads>>>(in,out,nSize,cSize,hwSize);
+}
+
+void customCudaChannel0ExtractNCHW(const float* in, float* out, int n, int c, int hw) {
+  customCudaChannel0ExtractNCHWTemplate<float>(in,out,n,c,hw);
+}
+void customCudaChannel0ExtractNCHW(const half* in, half* out, int n, int c, int hw) {
+  customCudaChannel0ExtractNCHWTemplate<half>(in,out,n,c,hw);
+}
+void customCudaChannel0ExtractNHWC(const float* in, float* out, int n, int hw, int c) {
+  customCudaChannel0ExtractNHWCTemplate<float>(in,out,n,hw,c);
+}
+void customCudaChannel0ExtractNHWC(const half* in, half* out, int n, int hw, int c) {
+  customCudaChannel0ExtractNHWCTemplate<half>(in,out,n,hw,c);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void sumChannelsNCHWKernel(const float* in, float* out, int cSize, int xySize, float scaleSum)
+{
+  extern __shared__ float sumPoolNCHWShared[];
+  int xyId = threadIdx.x;
+  int xyBlockDim = blockDim.x;
+  int cId = threadIdx.y;
+  int cBlockDim = blockDim.y;
+  int cIdx = blockIdx.y * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+
+  int xycSize = xySize*cSize;
+  int sharedIdx = xyId + cId * xyBlockDim;
+
+  float acc = 0.0f;
+  if(cIdx < cSize) {
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      acc += in[xyIdx + cIdx * xySize + nIdx * xycSize];
+      xyIdx += xyBlockDim;
+    }
+    sumPoolNCHWShared[sharedIdx] = acc;
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumPoolNCHWShared[sharedIdx] += sumPoolNCHWShared[sharedIdx + s];
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize)
+    out[cIdx + nIdx * cSize] = sumPoolNCHWShared[sharedIdx] * scaleSum;
+}
+__global__
+void valueHeadPoolChannelsNCHWKernel(const float* in, float* out, int nSize, int cSize, int xySize, const float* maskSum)
+{
+  extern __shared__ float sumPoolNCHWShared[];
+  int xyId = threadIdx.x;
+  int xyBlockDim = blockDim.x;
+  int cId = threadIdx.y;
+  int cBlockDim = blockDim.y;
+  int cIdx = blockIdx.y * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+
+  int xycSize = xySize*cSize;
+  int sharedIdx = xyId + cId * xyBlockDim;
+
+  float acc = 0.0f;
+  if(cIdx < cSize) {
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      acc += in[xyIdx + cIdx * xySize + nIdx * xycSize];
+      xyIdx += xyBlockDim;
+    }
+    sumPoolNCHWShared[sharedIdx] = acc;
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumPoolNCHWShared[sharedIdx] += sumPoolNCHWShared[sharedIdx + s];
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumPoolNCHWShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+    out[cIdx + nIdx * cSize*3] = mean;
+    out[cIdx + nIdx * cSize*3 + cSize] = mean * (sqrtdiv - 14.0f) * 0.1f;
+    out[cIdx + nIdx * cSize*3 + cSize*2] = mean * ((sqrtdiv - 14.0f) * (sqrtdiv - 14.0f) * 0.01f - 0.1f);
+  }
+}
+__global__
+void gPoolChannelsNCHWKernel(const float* in, float* out, int cSize, int xySize, const float* maskSum, int sharedMemElts)
+{
+  extern __shared__ float poolNCHWShared[];
+  float* sumShared = (float*)poolNCHWShared;
+  float* maxShared = (float*)poolNCHWShared + sharedMemElts;
+
+  int xyId = threadIdx.x;
+  int xyBlockDim = blockDim.x;
+  int cId = threadIdx.y;
+  int cBlockDim = blockDim.y;
+  int cIdx = blockIdx.y * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+
+  int xycSize = xySize*cSize;
+  int sharedIdx = xyId + cId * xyBlockDim;
+
+  if(cIdx < cSize) {
+    float accSum = 0.0f;
+    float accMax = -1.0f;
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      float a = in[xyIdx + cIdx * xySize + nIdx * xycSize];
+      accSum += a;
+      accMax = fmaxf(accMax, a);
+      xyIdx += xyBlockDim;
+    }
+    sumShared[sharedIdx] = accSum;
+    maxShared[sharedIdx] = accMax;
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumShared[sharedIdx] += sumShared[sharedIdx + s];
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx], maxShared[sharedIdx + s]);
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+
+    out[cIdx + nIdx * (cSize*3)] = mean;
+    out[cIdx + nIdx * (cSize*3) + cSize] = mean * (sqrtdiv - 14.0f) * 0.1f;
+    out[cIdx + nIdx * (cSize*3) + cSize*2] = maxShared[sharedIdx];
+  }
+}
+__global__
+void gPoolChannelsNCHWMaskKernel(const float* in, float* out, int cSize, int xySize, const float* mask, const float* maskSum, int sharedMemElts)
+{
+  extern __shared__ float poolNCHWShared[];
+  float* sumShared = (float*)poolNCHWShared;
+  float* maxShared = (float*)poolNCHWShared + sharedMemElts;
+
+  int xyId = threadIdx.x;
+  int xyBlockDim = blockDim.x;
+  int cId = threadIdx.y;
+  int cBlockDim = blockDim.y;
+  int cIdx = blockIdx.y * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+
+  int xycSize = xySize*cSize;
+  int sharedIdx = xyId + cId * xyBlockDim;
+
+  if(cIdx < cSize) {
+    float accSum = 0.0f;
+    float accMax = -1.0f;
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      float a = in[xyIdx + cIdx * xySize + nIdx * xycSize];
+      accSum += a;
+      // Init to -1.0 above and + mask - 1.0 is because it will effectively make all padded space into -1.0
+      // which is lower than the lowest value that any current activation function will produce.
+      // so the max over all valid spaces will the same as the mask over all spaces including padding
+      // We're relying on all padded space being equal to 0 because this gpool only ever follows a BN+Activate with a mask.
+      accMax = fmaxf(accMax, a + (mask[xyIdx + nIdx * xySize] - 1.0f));
+      xyIdx += xyBlockDim;
+    }
+    sumShared[sharedIdx] = accSum;
+    maxShared[sharedIdx] = accMax;
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumShared[sharedIdx] += sumShared[sharedIdx + s];
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx], maxShared[sharedIdx + s]);
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+
+    out[cIdx + nIdx * (cSize*3)] = mean;
+    out[cIdx + nIdx * (cSize*3) + cSize] = mean * (sqrtdiv - 14.0f) * 0.1f;
+    out[cIdx + nIdx * (cSize*3) + cSize*2] = maxShared[sharedIdx];
+  }
+}
+
+void customCudaPoolRowsSumNCHW(const float* in, float* out, int nSize, int cSize, int xySize, float scaleSum) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsSumNCHW: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsSumNCHW: cSize too large");
+
+  //Use up as many threads as possible along the xy dimension.
+  int xyThreads = 1;
+  while(xyThreads < targetNumThreads && xyThreads < xySize/2)
+    xyThreads *= 2;
+
+  //Distribute the extra threads along the c dimension.
+  int cThreads = (targetNumThreads < xyThreads) ? 1 : (targetNumThreads / xyThreads);
+  int cBlocks = (cSize + cThreads - 1) / cThreads;
+
+  //We need one shared memory spot per thread
+  int sharedMemSize = sizeof(float) * cThreads * xyThreads;
+
+  dim3 grid(1,cBlocks,nSize);
+  dim3 threads(xyThreads,cThreads,1);
+  sumChannelsNCHWKernel<<<grid,threads,sharedMemSize>>>(in,out,cSize,xySize,scaleSum);
+}
+void customCudaValueHeadPoolNCHW(const float* in, float* out, int nSize, int cSize, int xySize, const float* maskSum) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaValueHeadPoolNCHW: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaValueHeadPoolNCHW: cSize too large");
+
+  //Use up as many threads as possible along the xy dimension.
+  int xyThreads = 1;
+  while(xyThreads < targetNumThreads && xyThreads < xySize/2)
+    xyThreads *= 2;
+
+  //Distribute the extra threads along the c dimension.
+  int cThreads = (targetNumThreads < xyThreads) ? 1 : (targetNumThreads / xyThreads);
+  int cBlocks = (cSize + cThreads - 1) / cThreads;
+
+  //We need one shared memory spot per thread
+  int sharedMemSize = sizeof(float) * cThreads * xyThreads;
+
+  dim3 grid(1,cBlocks,nSize);
+  dim3 threads(xyThreads,cThreads,1);
+  valueHeadPoolChannelsNCHWKernel<<<grid,threads,sharedMemSize>>>(in,out,nSize,cSize,xySize,maskSum);
+}
+void customCudaPoolRowsGPoolNCHW(const float* in, float* out, int nSize, int cSize, int xySize, const float* mask, const float* maskSum) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsGPoolNCHW: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsGPoolNCHW: cSize too large");
+
+  //Use up as many threads as possible along the xy dimension.
+  int xyThreads = 1;
+  while(xyThreads < targetNumThreads && xyThreads < xySize/2)
+    xyThreads *= 2;
+
+  //Distribute the extra threads along the c dimension.
+  int cThreads = (targetNumThreads < xyThreads) ? 1 : (targetNumThreads / xyThreads);
+  int cBlocks = (cSize + cThreads - 1) / cThreads;
+
+  //We need one shared memory spot per thread, and then we double it because we need both sum and max.
+  //We also make sure it's a power of two to address any alignment concerns.
+  int sharedMemElts = 128;
+  while(sharedMemElts < cThreads * xyThreads)
+    sharedMemElts *= 2;
+  int sharedMemSize = sizeof(float) * sharedMemElts * 2;
+
+  dim3 grid(1,cBlocks,nSize);
+  dim3 threads(xyThreads,cThreads,1);
+  if(mask != NULL)
+    gPoolChannelsNCHWMaskKernel<<<grid,threads,sharedMemSize>>>(in,out,cSize,xySize,mask,maskSum,sharedMemElts);
+  else
+    gPoolChannelsNCHWKernel<<<grid,threads,sharedMemSize>>>(in,out,cSize,xySize,maskSum,sharedMemElts);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void gPoolChannelsNCHWHalfKernel(const half* in, half* out, int cSize, int xySize, const float* maskSum, int sharedMemElts)
+{
+#ifdef HIP_SUPPORTS_FP16
+  extern __shared__ float poolNCHWShared[];
+  float* sumShared = (float*)poolNCHWShared;
+  float* maxShared = (float*)poolNCHWShared + sharedMemElts;
+
+  int xyId = threadIdx.x;
+  int xyBlockDim = blockDim.x;
+  int cId = threadIdx.y;
+  int cBlockDim = blockDim.y;
+  int cIdx = blockIdx.y * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+
+  int xycSize = xySize*cSize;
+  int sharedIdx = xyId + cId * xyBlockDim;
+
+  if(cIdx < cSize) {
+    float accSum = 0.0f;
+    float accMax = -1.0f;
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      float a = __half2float(in[xyIdx + cIdx * xySize + nIdx * xycSize]);
+      accSum += a;
+      accMax = fmaxf(accMax, a);
+      xyIdx += xyBlockDim;
+    }
+    sumShared[sharedIdx] = accSum;
+    maxShared[sharedIdx] = accMax;
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumShared[sharedIdx] += sumShared[sharedIdx + s];
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx], maxShared[sharedIdx + s]);
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+
+    out[cIdx + nIdx * (cSize*3)] = __float2half(mean);
+    out[cIdx + nIdx * (cSize*3) + cSize] = __float2half(mean * (sqrtdiv - 14.0f) * 0.1f);
+    out[cIdx + nIdx * (cSize*3) + cSize*2] = __float2half(maxShared[sharedIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void gPoolChannelsNCHWHalfMaskKernel(const half* in, half* out, int cSize, int xySize, const half* mask, const float* maskSum, int sharedMemElts)
+{
+#ifdef HIP_SUPPORTS_FP16
+  extern __shared__ float poolNCHWShared[];
+  float* sumShared = (float*)poolNCHWShared;
+  float* maxShared = (float*)poolNCHWShared + sharedMemElts;
+
+  int xyId = threadIdx.x;
+  int xyBlockDim = blockDim.x;
+  int cId = threadIdx.y;
+  int cBlockDim = blockDim.y;
+  int cIdx = blockIdx.y * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+
+  int xycSize = xySize*cSize;
+  int sharedIdx = xyId + cId * xyBlockDim;
+
+  if(cIdx < cSize) {
+    float accSum = 0.0f;
+    float accMax = -1.0f;
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      float a = __half2float(in[xyIdx + cIdx * xySize + nIdx * xycSize]);
+      accSum += a;
+      // Init to -1.0 above and + mask - 1.0 is because it will effectively make all padded space into -1.0
+      // which is lower than the lowest value that any current activation function will produce.
+      // so the max over all valid spaces will the same as the mask over all spaces including padding
+      accMax = fmaxf(accMax, a + (__half2float(mask[xyIdx + nIdx * xySize]) - 1.0f));
+      xyIdx += xyBlockDim;
+    }
+    sumShared[sharedIdx] = accSum;
+    maxShared[sharedIdx] = accMax;
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumShared[sharedIdx] += sumShared[sharedIdx + s];
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx], maxShared[sharedIdx + s]);
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+
+    out[cIdx + nIdx * (cSize*3)] = __float2half(mean);
+    out[cIdx + nIdx * (cSize*3) + cSize] = __float2half(mean * (sqrtdiv - 14.0f) * 0.1f);
+    out[cIdx + nIdx * (cSize*3) + cSize*2] = __float2half(maxShared[sharedIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+void customCudaPoolRowsGPoolNCHW(const half* in, half* out, int nSize, int cSize, int xySize, const half* mask, const float* maskSum) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsGPoolNCHW: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsGPoolNCHW: cSize too large");
+
+  //Use up as many threads as possible along the xy dimension.
+  int xyThreads = 1;
+  while(xyThreads < targetNumThreads && xyThreads < xySize/2)
+    xyThreads *= 2;
+
+  //Distribute the extra threads along the c dimension.
+  int cThreads = (targetNumThreads < xyThreads) ? 1 : (targetNumThreads / xyThreads);
+  int cBlocks = (cSize + cThreads - 1) / cThreads;
+
+  //We need one shared memory spot per thread, and then we double it because we need both sum and max.
+  //We also make sure it's a power of two to address any alignment concerns.
+  int sharedMemElts = 128;
+  while(sharedMemElts < cThreads * xyThreads)
+    sharedMemElts *= 2;
+  int sharedMemSize = sizeof(float) * sharedMemElts * 2;
+
+  dim3 grid(1,cBlocks,nSize);
+  dim3 threads(xyThreads,cThreads,1);
+  if(mask != NULL)
+    gPoolChannelsNCHWHalfMaskKernel<<<grid,threads,sharedMemSize>>>(in,out,cSize,xySize,mask,maskSum,sharedMemElts);
+  else
+    gPoolChannelsNCHWHalfKernel<<<grid,threads,sharedMemSize>>>(in,out,cSize,xySize,maskSum,sharedMemElts);
+}
+
+
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void sumChannelsNHWCKernel(const float* in, float* out, int xySize, int cSize, float scaleSum)
+{
+  extern __shared__ float sumPoolNHWCShared[];
+  int cId = threadIdx.x;
+  int cBlockDim = blockDim.x;
+  int xyId = threadIdx.y;
+  int xyBlockDim = blockDim.y;
+
+  int cIdx = blockIdx.x * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+  int sharedIdx = cId + cBlockDim * xyId;
+  int xycSize = xySize*cSize;
+
+  sumPoolNHWCShared[sharedIdx] = 0;
+
+  if(cIdx < cSize) {
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      sumPoolNHWCShared[sharedIdx] += in[cIdx + xyIdx * cSize + nIdx * xycSize];
+      xyIdx += xyBlockDim;
+    }
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumPoolNHWCShared[sharedIdx] += sumPoolNHWCShared[sharedIdx + cBlockDim * s];
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize)
+    out[cIdx + nIdx * cSize] = sumPoolNHWCShared[sharedIdx] * scaleSum;
+}
+__global__
+void valueHeadPoolChannelsNHWCKernel(const float* in, float* out, int nSize, int xySize, int cSize, const float* maskSum)
+{
+  extern __shared__ float sumPoolNHWCShared[];
+  int cId = threadIdx.x;
+  int cBlockDim = blockDim.x;
+  int xyId = threadIdx.y;
+  int xyBlockDim = blockDim.y;
+
+  int cIdx = blockIdx.x * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+  int sharedIdx = cId + cBlockDim * xyId;
+  int xycSize = xySize*cSize;
+
+  sumPoolNHWCShared[sharedIdx] = 0;
+
+  if(cIdx < cSize) {
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      sumPoolNHWCShared[sharedIdx] += in[cIdx + xyIdx * cSize + nIdx * xycSize];
+      xyIdx += xyBlockDim;
+    }
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumPoolNHWCShared[sharedIdx] += sumPoolNHWCShared[sharedIdx + cBlockDim * s];
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumPoolNHWCShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+    out[cIdx + nIdx * cSize*3] = mean;
+    out[cIdx + nIdx * cSize*3 + cSize] = mean * (sqrtdiv - 14.0f) * 0.1f;
+    out[cIdx + nIdx * cSize*3 + cSize*2] = mean * ((sqrtdiv - 14.0f) * (sqrtdiv - 14.0f) * 0.01f - 0.1f);
+  }
+}
+__global__
+void gPoolChannelsNHWCKernel(const float* in, float* out, int xySize, int cSize, const float* maskSum, int sharedMemElts)
+{
+  extern __shared__ float poolNHWCShared[];
+  float* sumShared = (float*)poolNHWCShared;
+  float* maxShared = (float*)poolNHWCShared + sharedMemElts;
+
+  int cId = threadIdx.x;
+  int cBlockDim = blockDim.x;
+  int xyId = threadIdx.y;
+  int xyBlockDim = blockDim.y;
+
+  int cIdx = blockIdx.x * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+  int sharedIdx = cId + cBlockDim * xyId;
+  int xycSize = xySize*cSize;
+
+  sumShared[sharedIdx] = 0;
+  maxShared[sharedIdx] = -1.0f;
+
+  if(cIdx < cSize) {
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      float a = in[cIdx + xyIdx * cSize + nIdx * xycSize];
+      sumShared[sharedIdx] += a;
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx], a);
+      xyIdx += xyBlockDim;
+    }
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumShared[sharedIdx] += sumShared[sharedIdx + cBlockDim * s];
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx],maxShared[sharedIdx + cBlockDim * s]);
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+
+    out[cIdx + nIdx * (cSize*3)] = mean;
+    out[cIdx + nIdx * (cSize*3) + cSize] = mean * (sqrtdiv - 14.0f) * 0.1f;
+    out[cIdx + nIdx * (cSize*3) + cSize*2] = maxShared[sharedIdx];
+  }
+}
+__global__
+void gPoolChannelsNHWCMaskKernel(const float* in, float* out, int xySize, int cSize, const float* mask, const float* maskSum, int sharedMemElts)
+{
+  extern __shared__ float poolNHWCShared[];
+  float* sumShared = (float*)poolNHWCShared;
+  float* maxShared = (float*)poolNHWCShared + sharedMemElts;
+
+  int cId = threadIdx.x;
+  int cBlockDim = blockDim.x;
+  int xyId = threadIdx.y;
+  int xyBlockDim = blockDim.y;
+
+  int cIdx = blockIdx.x * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+  int sharedIdx = cId + cBlockDim * xyId;
+  int xycSize = xySize*cSize;
+
+  sumShared[sharedIdx] = 0;
+  maxShared[sharedIdx] = -1.0f;
+
+  if(cIdx < cSize) {
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      float a = in[cIdx + xyIdx * cSize + nIdx * xycSize];
+      sumShared[sharedIdx] += a;
+      // Init to -1.0 above and + mask - 1.0 is because it will effectively make all padded space into -1.0
+      // which is lower than the lowest value that any current activation function will produce.
+      // so the max over all valid spaces will the same as the mask over all spaces including padding
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx], a + (mask[xyIdx + nIdx * xySize] - 1.0f));
+      xyIdx += xyBlockDim;
+    }
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumShared[sharedIdx] += sumShared[sharedIdx + cBlockDim * s];
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx],maxShared[sharedIdx + cBlockDim * s]);
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+
+    out[cIdx + nIdx * (cSize*3)] = mean;
+    out[cIdx + nIdx * (cSize*3) + cSize] = mean * (sqrtdiv - 14.0f) * 0.1f;
+    out[cIdx + nIdx * (cSize*3) + cSize*2] = maxShared[sharedIdx];
+  }
+}
+
+
+void customCudaPoolRowsSumNHWC(const float* in, float* out, int nSize, int xySize, int cSize, float scaleSum) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsSumNHWC: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsSumNHWC: cSize too large");
+
+  //Use up to two warps worth of threads along the channel dimension, which is the
+  //most compact
+  int cThreads = 1;
+  while(cThreads < 64 && cThreads < cSize/2)
+    cThreads *= 2;
+  int cBlocks = (cSize + cThreads - 1) / cThreads;
+
+  //Distribute the extra threads to perform parallel reduction along the xy dimension.
+  int xyThreads = (targetNumThreads < cThreads) ? 1 : (targetNumThreads / cThreads);
+
+  //We need one shared memory spot per thread
+  int sharedMemSize = sizeof(float) * cThreads * xyThreads;
+
+  dim3 grid(cBlocks,1,nSize);
+  dim3 threads(cThreads,xyThreads,1);
+  sumChannelsNHWCKernel<<<grid,threads,sharedMemSize>>>(in,out,xySize,cSize,scaleSum);
+}
+
+void customCudaValueHeadPoolNHWC(const float* in, float* out, int nSize, int xySize, int cSize, const float* maskSum) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaValueHeadPoolNHWC: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaValueHeadPoolNHWC: cSize too large");
+
+  //Use up to two warps worth of threads along the channel dimension, which is the
+  //most compact
+  int cThreads = 1;
+  while(cThreads < 64 && cThreads < cSize/2)
+    cThreads *= 2;
+  int cBlocks = (cSize + cThreads - 1) / cThreads;
+
+  //Distribute the extra threads to perform parallel reduction along the xy dimension.
+  int xyThreads = (targetNumThreads < cThreads) ? 1 : (targetNumThreads / cThreads);
+
+  //We need one shared memory spot per thread
+  int sharedMemSize = sizeof(float) * cThreads * xyThreads;
+
+  dim3 grid(cBlocks,1,nSize);
+  dim3 threads(cThreads,xyThreads,1);
+  valueHeadPoolChannelsNHWCKernel<<<grid,threads,sharedMemSize>>>(in,out,nSize,xySize,cSize,maskSum);
+}
+
+void customCudaPoolRowsGPoolNHWC(const float* in, float* out, int nSize, int xySize, int cSize, const float* mask, const float* maskSum) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsGPoolNHWC: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsGPoolNHWC: cSize too large");
+
+  //Use up to two warps worth of threads along the channel dimension, which is the
+  //most compact
+  int cThreads = 1;
+  while(cThreads < 64 && cThreads < cSize/2)
+    cThreads *= 2;
+  int cBlocks = (cSize + cThreads - 1) / cThreads;
+
+  //Distribute the extra threads to perform parallel reduction along the xy dimension.
+  int xyThreads = (targetNumThreads < cThreads) ? 1 : (targetNumThreads / cThreads);
+
+  //We need one shared memory spot per thread, and then we double it because we need both sum and max.
+  //We also make sure it's a power of two to address any alignment concerns.
+  int sharedMemElts = 128;
+  while(sharedMemElts < cThreads * xyThreads)
+    sharedMemElts *= 2;
+  int sharedMemSize = sizeof(float) * sharedMemElts * 2;
+
+  dim3 grid(cBlocks,1,nSize);
+  dim3 threads(cThreads,xyThreads,1);
+  if(mask != NULL)
+    gPoolChannelsNHWCMaskKernel<<<grid,threads,sharedMemSize>>>(in,out,xySize,cSize,mask,maskSum,sharedMemElts);
+  else
+    gPoolChannelsNHWCKernel<<<grid,threads,sharedMemSize>>>(in,out,xySize,cSize,maskSum,sharedMemElts);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void gPoolChannelsNHWCHalfKernel(const half* in, half* out, int xySize, int cSize, const float* maskSum, int sharedMemElts)
+{
+#ifdef HIP_SUPPORTS_FP16
+  extern __shared__ float poolNHWCShared[];
+  float* sumShared = (float*)poolNHWCShared;
+  float* maxShared = (float*)poolNHWCShared + sharedMemElts;
+
+  int cId = threadIdx.x;
+  int cBlockDim = blockDim.x;
+  int xyId = threadIdx.y;
+  int xyBlockDim = blockDim.y;
+
+  int cIdx = blockIdx.x * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+  int sharedIdx = cId + cBlockDim * xyId;
+  int xycSize = xySize*cSize;
+
+  sumShared[sharedIdx] = 0;
+  maxShared[sharedIdx] = -1.0f;
+
+  if(cIdx < cSize) {
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      float a = __half2float(in[cIdx + xyIdx * cSize + nIdx * xycSize]);
+      sumShared[sharedIdx] += a;
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx], a);
+      xyIdx += xyBlockDim;
+    }
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumShared[sharedIdx] += sumShared[sharedIdx + cBlockDim * s];
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx],maxShared[sharedIdx + cBlockDim * s]);
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+
+    out[cIdx + nIdx * (cSize*3)] = __float2half(mean);
+    out[cIdx + nIdx * (cSize*3) + cSize] = __float2half(mean * (sqrtdiv - 14.0f) * 0.1f);
+    out[cIdx + nIdx * (cSize*3) + cSize*2] = __float2half(maxShared[sharedIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void gPoolChannelsNHWCHalfMaskKernel(const half* in, half* out, int xySize, int cSize, const half* mask, const float* maskSum, int sharedMemElts)
+{
+#ifdef HIP_SUPPORTS_FP16
+  extern __shared__ float poolNHWCShared[];
+  float* sumShared = (float*)poolNHWCShared;
+  float* maxShared = (float*)poolNHWCShared + sharedMemElts;
+
+  int cId = threadIdx.x;
+  int cBlockDim = blockDim.x;
+  int xyId = threadIdx.y;
+  int xyBlockDim = blockDim.y;
+
+  int cIdx = blockIdx.x * cBlockDim + cId;
+  int nIdx = blockIdx.z;
+  int sharedIdx = cId + cBlockDim * xyId;
+  int xycSize = xySize*cSize;
+
+  sumShared[sharedIdx] = 0;
+  maxShared[sharedIdx] = -1.0f;
+
+  if(cIdx < cSize) {
+    int xyIdx = xyId;
+    while(xyIdx < xySize) {
+      float a = __half2float(in[cIdx + xyIdx * cSize + nIdx * xycSize]);
+      sumShared[sharedIdx] += a;
+      // Init to -1.0 above and + mask - 1.0 is because it will effectively make all padded space into -1.0
+      // which is lower than the lowest value that any current activation function will produce.
+      // so the max over all valid spaces will the same as the mask over all spaces including padding
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx], a + (__half2float(mask[xyIdx + nIdx * xySize]) - 1.0f));
+      xyIdx += xyBlockDim;
+    }
+  }
+  __syncthreads();
+
+  for(int s = xyBlockDim>>1; s > 0; s >>= 1) {
+    if(xyId < s) {
+      sumShared[sharedIdx] += sumShared[sharedIdx + cBlockDim * s];
+      maxShared[sharedIdx] = fmaxf(maxShared[sharedIdx],maxShared[sharedIdx + cBlockDim * s]);
+    }
+    __syncthreads();
+  }
+  if(xyId == 0 && cIdx < cSize) {
+    float sum = sumShared[sharedIdx];
+    float div = maskSum[nIdx];
+    float sqrtdiv = sqrt(div);
+    float mean = sum/div;
+
+    out[cIdx + nIdx * (cSize*3)] = __float2half(mean);
+    out[cIdx + nIdx * (cSize*3) + cSize] = __float2half(mean * (sqrtdiv - 14.0f) * 0.1f);
+    out[cIdx + nIdx * (cSize*3) + cSize*2] = __float2half(maxShared[sharedIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+void customCudaPoolRowsGPoolNHWC(const half* in, half* out, int nSize, int xySize, int cSize, const half* mask, const float* maskSum) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsGPoolNHWC: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaPoolRowsGPoolNHWC: cSize too large");
+
+  //Use up to two warps worth of threads along the channel dimension, which is the
+  //most compact
+  int cThreads = 1;
+  while(cThreads < 64 && cThreads < cSize/2)
+    cThreads *= 2;
+  int cBlocks = (cSize + cThreads - 1) / cThreads;
+
+  //Distribute the extra threads to perform parallel reduction along the xy dimension.
+  int xyThreads = (targetNumThreads < cThreads) ? 1 : (targetNumThreads / cThreads);
+
+  //We need one shared memory spot per thread, and then we double it because we need both sum and max.
+  //We also make sure it's a power of two to address any alignment concerns.
+  int sharedMemElts = 128;
+  while(sharedMemElts < cThreads * xyThreads)
+    sharedMemElts *= 2;
+  int sharedMemSize = sizeof(float) * sharedMemElts * 2;
+
+  dim3 grid(cBlocks,1,nSize);
+  dim3 threads(cThreads,xyThreads,1);
+  if(mask != NULL)
+    gPoolChannelsNHWCHalfMaskKernel<<<grid,threads,sharedMemSize>>>(in,out,xySize,cSize,mask,maskSum,sharedMemElts);
+  else
+    gPoolChannelsNHWCHalfKernel<<<grid,threads,sharedMemSize>>>(in,out,xySize,cSize,maskSum,sharedMemElts);
+}
+
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void copyToHalfKernel(const float *in, half* out, int n)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if(idx < n) {
+    out[idx] = __float2half(in[idx]);
+  }
+}
+__global__
+void copyFromHalfKernel(const half *in, float* out, int n)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if(idx < n) {
+    out[idx] = __half2float(in[idx]);
+  }
+}
+
+void customCudaCopyToHalf(const float* in, half* out, int n) {
+  int blockSize = targetNumThreads;
+  int numBlocks = (n+blockSize-1)/blockSize;
+  copyToHalfKernel<<<numBlocks, blockSize>>>(in,out,n);
+}
+void customCudaCopyFromHalf(const half* in, float* out, int n) {
+  int blockSize = targetNumThreads;
+  int numBlocks = (n+blockSize-1)/blockSize;
+  copyFromHalfKernel<<<numBlocks, blockSize>>>(in,out,n);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+
+__global__
+void addTensorInplaceHalfKernel(half *buf, const half* biases, int nSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if(idx < nSize) {
+    buf[idx] = __hadd(buf[idx],biases[idx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+void customCudaAddTensorInplace(half* buf, const half* biases, int nSize) {
+  int blockSize = targetNumThreads;
+  int numBlocks = (nSize+blockSize-1)/blockSize;
+  addTensorInplaceHalfKernel<<<numBlocks, blockSize>>>(buf,biases,nSize);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+
+__global__
+void addCBiasInplaceNCKernel(float *buf, const float* biases, int nSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(cIdx < cSize && nIdx < nSize) {
+    int idx = nIdx * cSize + cIdx;
+    buf[idx] = buf[idx] + biases[cIdx];
+  }
+}
+__global__
+void addCBiasInplaceNCHalfKernel(half *buf, const half* biases, int nSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(cIdx < cSize && nIdx < nSize) {
+    int idx = nIdx * cSize + cIdx;
+    buf[idx] = __hadd(buf[idx],biases[cIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+__global__
+void addCBiasInplaceNCKernelRelu(float *buf, const float* biases, int nSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(cIdx < cSize && nIdx < nSize) {
+    int idx = nIdx * cSize + cIdx;
+    buf[idx] = fmaxf(buf[idx] + biases[cIdx],0.0f);
+  }
+}
+__global__
+void addCBiasInplaceNCHalfKernelRelu(half *buf, const half* biases, int nSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(cIdx < cSize && nIdx < nSize) {
+    int idx = nIdx * cSize + cIdx;
+    const half halfzero = __float2half(0.0f);
+    half a = __hadd(buf[idx],biases[cIdx]);
+    buf[idx] = __hgt(a,halfzero) ? a : halfzero;
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+__global__
+void addCBiasInplaceNCKernelMish(float *buf, const float* biases, int nSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(cIdx < cSize && nIdx < nSize) {
+    int idx = nIdx * cSize + cIdx;
+    buf[idx] = mishf(buf[idx] + biases[cIdx]);
+  }
+}
+__global__
+void addCBiasInplaceNCHalfKernelMish(half *buf, const half* biases, int nSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(cIdx < cSize && nIdx < nSize) {
+    int idx = nIdx * cSize + cIdx;
+    half a = __hadd(buf[idx],biases[cIdx]);
+    buf[idx] = mishh(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void addCBiasInplaceNCKernelMishScale8(float *buf, const float* biases, int nSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(cIdx < cSize && nIdx < nSize) {
+    int idx = nIdx * cSize + cIdx;
+    buf[idx] = mishf_scale8(buf[idx] + biases[cIdx]);
+  }
+}
+__global__
+void addCBiasInplaceNCHalfKernelMishScale8(half *buf, const half* biases, int nSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int nIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  if(cIdx < cSize && nIdx < nSize) {
+    int idx = nIdx * cSize + cIdx;
+    half a = __hadd(buf[idx],biases[cIdx]);
+    buf[idx] = mishh_scale8(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+void sharedAddCBiasInplaceNC(void* buf, const void* biases, int nSize, int cSize, bool isHalf, int activation) {
+  int cThreads;
+  int cBlocks;
+  int nThreads;
+  int nBlocks;
+  splitThreadsAcrossDim01(cSize, nSize, cThreads, cBlocks, nThreads, nBlocks);
+
+  if(nBlocks > 65536)
+    throw std::runtime_error("customCudaAddCBiasInplaceNC: nSize too large given cSize");
+
+  dim3 grid(cBlocks,nBlocks,1);
+  dim3 threads(cThreads,nThreads,1);
+
+  if(activation == ACTIVATION_IDENTITY) {
+    if(isHalf)
+      addCBiasInplaceNCHalfKernel<<<grid,threads>>>((half*)buf,(const half*)biases,nSize,cSize);
+    else
+      addCBiasInplaceNCKernel<<<grid,threads>>>((float*)buf,(const float*)biases,nSize,cSize);
+  }
+  else if(activation == ACTIVATION_RELU) {
+    if(isHalf)
+      addCBiasInplaceNCHalfKernelRelu<<<grid,threads>>>((half*)buf,(const half*)biases,nSize,cSize);
+    else
+      addCBiasInplaceNCKernelRelu<<<grid,threads>>>((float*)buf,(const float*)biases,nSize,cSize);
+  }
+  else if(activation == ACTIVATION_MISH) {
+    if(isHalf)
+      addCBiasInplaceNCHalfKernelMish<<<grid,threads>>>((half*)buf,(const half*)biases,nSize,cSize);
+    else
+      addCBiasInplaceNCKernelMish<<<grid,threads>>>((float*)buf,(const float*)biases,nSize,cSize);
+  }
+  else if(activation == ACTIVATION_MISH_SCALE8) {
+    if(isHalf)
+      addCBiasInplaceNCHalfKernelMishScale8<<<grid,threads>>>((half*)buf,(const half*)biases,nSize,cSize);
+    else
+      addCBiasInplaceNCKernelMishScale8<<<grid,threads>>>((float*)buf,(const float*)biases,nSize,cSize);
+  }
+  else {
+    throw std::runtime_error("customCudaAddCBiasInplaceNC: unsupported activation");
+  }
+}
+
+void customCudaAddCBiasInplaceNC(float* buf, const float* biases, int nSize, int cSize, int activation) {
+  sharedAddCBiasInplaceNC(buf,biases,nSize,cSize,false,activation);
+}
+void customCudaAddCBiasInplaceNC(half* buf, const half* biases, int nSize, int cSize, int activation) {
+  sharedAddCBiasInplaceNC(buf,biases,nSize,cSize,true,activation);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void addNCBiasInplaceNCHWKernel(float *buf, const float* biases, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int ncIdx = nIdx * cSize + cIdx;
+    int idx = ncIdx * sSize + sIdx;
+    buf[idx] = buf[idx] + biases[ncIdx];
+  }
+}
+__global__
+void addNCBiasInplaceNCHWHalfKernel(half *buf, const half* biases, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int ncIdx = nIdx * cSize + cIdx;
+    int idx = ncIdx * sSize + sIdx;
+    buf[idx] = __hadd(buf[idx],biases[ncIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+void sharedAddNCBiasInplaceNCHW(void *buf, const void* biases, int nSize, int cSize, int xySize, bool isHalf) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaAddNCBiasInplaceNCHW: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaAddNCBiasInplaceNCHW: cSize too large");
+
+  int sSize = xySize;
+  int sThreads;
+  int sBlocks;
+  int cThreads;
+  int cBlocks;
+  splitThreadsAcrossDim01(sSize, cSize, sThreads, sBlocks, cThreads, cBlocks);
+
+  dim3 grid(sBlocks,cBlocks,nSize);
+  dim3 threads(sThreads,cThreads,1);
+  if(isHalf)
+    addNCBiasInplaceNCHWHalfKernel<<<grid,threads>>>((half*)buf,(const half*)biases,cSize,sSize);
+  else
+    addNCBiasInplaceNCHWKernel<<<grid,threads>>>((float*)buf,(const float*)biases,cSize,sSize);
+}
+
+void customCudaAddNCBiasInplaceNCHW(float *buf, const float* biases, int nSize, int cSize, int xySize) {
+  sharedAddNCBiasInplaceNCHW(buf,biases,nSize,cSize,xySize,false);
+}
+void customCudaAddNCBiasInplaceNCHW(half *buf, const half* biases, int nSize, int cSize, int xySize) {
+  sharedAddNCBiasInplaceNCHW(buf,biases,nSize,cSize,xySize,true);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void addNCBiasInplaceNHWCKernel(float *buf, const float* biases, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int ncIdx = nIdx * cSize + cIdx;
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    buf[idx] = buf[idx] + biases[ncIdx];
+  }
+}
+__global__
+void addNCBiasInplaceNHWCHalfKernel(half *buf, const half* biases, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int ncIdx = nIdx * cSize + cIdx;
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    buf[idx] = __hadd(buf[idx],biases[ncIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+void sharedAddNCBiasInplaceNHWC(void *buf, const void* biases, int nSize, int xySize, int cSize, bool isHalf) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaAddNCBiasInplaceNHWC: nSize too large");
+  if(xySize > 65536)
+    throw std::runtime_error("customCudaAddNCBiasInplaceNHWC: xySize too large");
+
+  int sSize = xySize;
+  int cThreads;
+  int cBlocks;
+  int sThreads;
+  int sBlocks;
+  splitThreadsAcrossDim01(cSize, sSize, cThreads, cBlocks, sThreads, sBlocks);
+
+  dim3 grid(cBlocks,sBlocks,nSize);
+  dim3 threads(cThreads,sThreads,1);
+  if(isHalf)
+    addNCBiasInplaceNHWCHalfKernel<<<grid,threads>>>((half*)buf,(const half*)biases,sSize,cSize);
+  else
+    addNCBiasInplaceNHWCKernel<<<grid,threads>>>((float*)buf,(const float*)biases,sSize,cSize);
+}
+
+void customCudaAddNCBiasInplaceNHWC(float *buf, const float* biases, int nSize, int xySize, int cSize) {
+  sharedAddNCBiasInplaceNHWC(buf,biases,nSize,xySize,cSize,false);
+}
+void customCudaAddNCBiasInplaceNHWC(half *buf, const half* biases, int nSize, int xySize, int cSize) {
+  sharedAddNCBiasInplaceNHWC(buf,biases,nSize,xySize,cSize,true);
+}
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void applyCScaleBiasNCHWKernel(const float *in, float* out, const float* scale, const float* biases, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = in[idx] * scale[cIdx] + biases[cIdx];
+  }
+}
+__global__
+void applyCScaleBiasNCHWReluKernel(const float *in, float* out, const float* scale, const float* biases, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = fmaxf(in[idx] * scale[cIdx] + biases[cIdx],0.0f);
+  }
+}
+__global__
+void applyCScaleBiasNCHWMishKernel(const float *in, float* out, const float* scale, const float* biases, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = mishf(in[idx] * scale[cIdx] + biases[cIdx]);
+  }
+}
+__global__
+void applyCScaleBiasNCHWMishScale8Kernel(const float *in, float* out, const float* scale, const float* biases, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = mishf_scale8(in[idx] * scale[cIdx] + biases[cIdx]);
+  }
+}
+__global__
+void applyCScaleBiasNCHWMaskKernel(const float *in, float* out, const float* scale, const float* biases, const float* mask, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = (in[idx] * scale[cIdx] + biases[cIdx]) * mask[nIdx*sSize+sIdx];
+  }
+}
+__global__
+void applyCScaleBiasNCHWReluMaskKernel(const float *in, float* out, const float* scale, const float* biases, const float* mask, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = fmaxf(in[idx] * scale[cIdx] + biases[cIdx],0.0f) * mask[nIdx*sSize+sIdx];
+  }
+}
+__global__
+void applyCScaleBiasNCHWMishMaskKernel(const float *in, float* out, const float* scale, const float* biases, const float* mask, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = mishf(in[idx] * scale[cIdx] + biases[cIdx]) * mask[nIdx*sSize+sIdx];
+  }
+}
+__global__
+void applyCScaleBiasNCHWMishScale8MaskKernel(const float *in, float* out, const float* scale, const float* biases, const float* mask, int cSize, int sSize)
+{
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = mishf_scale8(in[idx] * scale[cIdx] + biases[cIdx]) * mask[nIdx*sSize+sIdx];
+  }
+}
+__global__
+void applyCScaleBiasNCHWHalfKernel(const half *in, half* out, const half* scale, const half* biases, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = __hfma(in[idx],scale[cIdx],biases[cIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNCHWReluHalfKernel(const half *in, half* out, const half* scale, const half* biases, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    half a = __hfma(in[idx],scale[cIdx],biases[cIdx]);
+    const half halfzero = __float2half(0.0f);
+    out[idx] = __hgt(a,halfzero) ? a : halfzero;
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNCHWMishHalfKernel(const half *in, half* out, const half* scale, const half* biases, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    half a = __hfma(in[idx],scale[cIdx],biases[cIdx]);
+    out[idx] = mishh(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNCHWMishScale8HalfKernel(const half *in, half* out, const half* scale, const half* biases, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    half a = __hfma(in[idx],scale[cIdx],biases[cIdx]);
+    out[idx] = mishh_scale8(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNCHWMaskHalfKernel(const half *in, half* out, const half* scale, const half* biases, const half* mask, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    out[idx] = __hmul(__hfma(in[idx],scale[cIdx],biases[cIdx]),mask[nIdx*sSize+sIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNCHWReluMaskHalfKernel(const half *in, half* out, const half* scale, const half* biases, const half* mask, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    half a = __hmul(__hfma(in[idx],scale[cIdx],biases[cIdx]),mask[nIdx*sSize+sIdx]);
+    const half halfzero = __float2half(0.0f);
+    out[idx] = __hgt(a,halfzero) ? a : halfzero;
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNCHWMishMaskHalfKernel(const half *in, half* out, const half* scale, const half* biases, const half* mask, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    half a = __hmul(__hfma(in[idx],scale[cIdx],biases[cIdx]),mask[nIdx*sSize+sIdx]);
+    out[idx] = mishh(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNCHWMishScale8MaskHalfKernel(const half *in, half* out, const half* scale, const half* biases, const half* mask, int cSize, int sSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int sIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int cIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * cSize + cIdx) * sSize + sIdx;
+    half a = __hmul(__hfma(in[idx],scale[cIdx],biases[cIdx]),mask[nIdx*sSize+sIdx]);
+    out[idx] = mishh_scale8(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+void sharedApplyCScaleBiasNCHW(const void* in, void* out, const void* scale, const void* biases, const void* mask, int nSize, int cSize, int xySize, bool isHalf, int activation) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaApplyCScaleBiasNCHW: nSize too large");
+  if(cSize > 65536)
+    throw std::runtime_error("customCudaApplyCScaleBiasNCHW: cSize too large");
+
+  int sSize = xySize;
+  int sThreads;
+  int sBlocks;
+  int cThreads;
+  int cBlocks;
+  splitThreadsAcrossDim01(sSize, cSize, sThreads, sBlocks, cThreads, cBlocks);
+
+  dim3 grid(sBlocks,cBlocks,nSize);
+  dim3 threads(sThreads,cThreads,1);
+  if(mask == NULL) {
+    if(activation == ACTIVATION_IDENTITY) {
+      if(isHalf)
+        applyCScaleBiasNCHWHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,cSize,sSize);
+      else
+        applyCScaleBiasNCHWKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,cSize,sSize);
+    }
+    else if(activation == ACTIVATION_RELU) {
+      if(isHalf)
+        applyCScaleBiasNCHWReluHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,cSize,sSize);
+      else
+        applyCScaleBiasNCHWReluKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,cSize,sSize);
+    }
+    else if(activation == ACTIVATION_MISH) {
+      if(isHalf)
+        applyCScaleBiasNCHWMishHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,cSize,sSize);
+      else
+        applyCScaleBiasNCHWMishKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,cSize,sSize);
+    }
+    else if(activation == ACTIVATION_MISH_SCALE8) {
+      if(isHalf)
+        applyCScaleBiasNCHWMishScale8HalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,cSize,sSize);
+      else
+        applyCScaleBiasNCHWMishScale8Kernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,cSize,sSize);
+    }
+    else {
+      throw std::runtime_error("customCudaApplyCScaleBiasNCHW: unsupported activation");
+    }
+  }
+  else {
+    if(activation == ACTIVATION_IDENTITY) {
+      if(isHalf)
+        applyCScaleBiasNCHWMaskHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,(const half*)mask,cSize,sSize);
+      else
+        applyCScaleBiasNCHWMaskKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,(const float*)mask,cSize,sSize);
+    }
+    else if(activation == ACTIVATION_RELU) {
+      if(isHalf)
+        applyCScaleBiasNCHWReluMaskHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,(const half*)mask,cSize,sSize);
+      else
+        applyCScaleBiasNCHWReluMaskKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,(const float*)mask,cSize,sSize);
+    }
+    else if(activation == ACTIVATION_MISH) {
+      if(isHalf)
+        applyCScaleBiasNCHWMishMaskHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,(const half*)mask,cSize,sSize);
+      else
+        applyCScaleBiasNCHWMishMaskKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,(const float*)mask,cSize,sSize);
+    }
+    else if(activation == ACTIVATION_MISH_SCALE8) {
+      if(isHalf)
+        applyCScaleBiasNCHWMishScale8MaskHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,(const half*)mask,cSize,sSize);
+      else
+        applyCScaleBiasNCHWMishScale8MaskKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,(const float*)mask,cSize,sSize);
+    }
+    else {
+      throw std::runtime_error("customCudaApplyCScaleBiasNCHW: unsupported activation");
+    }
+  }
+}
+
+void customCudaApplyCScaleBiasNCHW(const float* in, float* out, const float* scale, const float* biases, const float* mask, int nSize, int cSize, int xySize, int activation) {
+  sharedApplyCScaleBiasNCHW(in,out,scale,biases,mask,nSize,cSize,xySize,false,activation);
+}
+void customCudaApplyCScaleBiasNCHW(const half* in, half* out, const half* scale, const half* biases, const half* mask, int nSize, int cSize, int xySize, int activation) {
+  sharedApplyCScaleBiasNCHW(in,out,scale,biases,mask,nSize,cSize,xySize,true,activation);
+}
+
+
+//--------------------------------------------------------------------------------------------------------------
+
+__global__
+void applyCScaleBiasNHWCKernel(const float* in, float* out, const float* scale, const float* biases, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = in[idx] * scale[cIdx] + biases[cIdx];
+  }
+}
+__global__
+void applyCScaleBiasNHWCReluKernel(const float* in, float* out, const float* scale, const float* biases, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = fmaxf(in[idx] * scale[cIdx] + biases[cIdx],0.0f);
+  }
+}
+__global__
+void applyCScaleBiasNHWCMishKernel(const float* in, float* out, const float* scale, const float* biases, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = mishf(in[idx] * scale[cIdx] + biases[cIdx]);
+  }
+}
+__global__
+void applyCScaleBiasNHWCMishScale8Kernel(const float* in, float* out, const float* scale, const float* biases, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = mishf_scale8(in[idx] * scale[cIdx] + biases[cIdx]);
+  }
+}
+__global__
+void applyCScaleBiasNHWCMaskKernel(const float* in, float* out, const float* scale, const float* biases, const float* mask, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = (in[idx] * scale[cIdx] + biases[cIdx]) * mask[nIdx*sSize+sIdx];
+  }
+}
+__global__
+void applyCScaleBiasNHWCReluMaskKernel(const float* in, float* out, const float* scale, const float* biases, const float* mask, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = fmaxf(in[idx] * scale[cIdx] + biases[cIdx],0.0f) * mask[nIdx*sSize+sIdx];
+  }
+}
+__global__
+void applyCScaleBiasNHWCMishMaskKernel(const float* in, float* out, const float* scale, const float* biases, const float* mask, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = mishf(in[idx] * scale[cIdx] + biases[cIdx]) * mask[nIdx*sSize+sIdx];
+  }
+}
+__global__
+void applyCScaleBiasNHWCMishScale8MaskKernel(const float* in, float* out, const float* scale, const float* biases, const float* mask, int sSize, int cSize)
+{
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = mishf_scale8(in[idx] * scale[cIdx] + biases[cIdx]) * mask[nIdx*sSize+sIdx];
+  }
+}
+__global__
+void applyCScaleBiasNHWCHalfKernel(const half* in, half* out, const half* scale, const half* biases, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = __hfma(in[idx],scale[cIdx],biases[cIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNHWCReluHalfKernel(const half* in, half* out, const half* scale, const half* biases, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    half a = __hfma(in[idx],scale[cIdx],biases[cIdx]);
+    const half halfzero = __float2half(0.0f);
+    out[idx] = __hgt(a,halfzero) ? a : halfzero;
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNHWCMishHalfKernel(const half* in, half* out, const half* scale, const half* biases, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    half a = __hfma(in[idx],scale[cIdx],biases[cIdx]);
+    out[idx] = mishh(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNHWCMishScale8HalfKernel(const half* in, half* out, const half* scale, const half* biases, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    half a = __hfma(in[idx],scale[cIdx],biases[cIdx]);
+    out[idx] = mishh_scale8(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNHWCMaskHalfKernel(const half* in, half* out, const half* scale, const half* biases, const half* mask, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    out[idx] = __hmul(__hfma(in[idx],scale[cIdx],biases[cIdx]),mask[nIdx*sSize+sIdx]);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNHWCReluMaskHalfKernel(const half* in, half* out, const half* scale, const half* biases, const half* mask, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    half a = __hmul(__hfma(in[idx],scale[cIdx],biases[cIdx]),mask[nIdx*sSize+sIdx]);
+    const half halfzero = __float2half(0.0f);
+    out[idx] = __hgt(a,halfzero) ? a : halfzero;
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNHWCMishMaskHalfKernel(const half* in, half* out, const half* scale, const half* biases, const half* mask, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    half a = __hmul(__hfma(in[idx],scale[cIdx],biases[cIdx]),mask[nIdx*sSize+sIdx]);
+    out[idx] = mishh(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+__global__
+void applyCScaleBiasNHWCMishScale8MaskHalfKernel(const half* in, half* out, const half* scale, const half* biases, const half* mask, int sSize, int cSize)
+{
+#ifdef HIP_SUPPORTS_FP16
+  int cIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int sIdx = blockIdx.y * blockDim.y + threadIdx.y;
+  int nIdx = blockIdx.z;
+  if(cIdx < cSize && sIdx < sSize) {
+    int idx = (nIdx * sSize + sIdx) * cSize + cIdx;
+    half a = __hmul(__hfma(in[idx],scale[cIdx],biases[cIdx]),mask[nIdx*sSize+sIdx]);
+    out[idx] = mishh_scale8(a);
+  }
+#else
+  //Do nothing, FP16 not supported
+#endif
+}
+
+void sharedApplyCScaleBiasNHWC(const void* in, void* out, const void* scale, const void* biases, const void* mask, int nSize, int xySize, int cSize, bool isHalf, int activation) {
+  if(nSize > 65536)
+    throw std::runtime_error("customCudaApplyCScaleBiasNHWC: nSize too large");
+  if(xySize > 65536)
+    throw std::runtime_error("customCudaApplyCScaleBiasNHWC: xySize too large");
+
+  int sSize = xySize;
+  int cThreads;
+  int cBlocks;
+  int sThreads;
+  int sBlocks;
+  splitThreadsAcrossDim01(cSize, sSize, cThreads, cBlocks, sThreads, sBlocks);
+
+  dim3 grid(cBlocks,sBlocks,nSize);
+  dim3 threads(cThreads,sThreads,1);
+  if(mask == NULL) {
+    if(activation == ACTIVATION_IDENTITY) {
+      if(isHalf)
+        applyCScaleBiasNHWCHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,sSize,cSize);
+      else
+        applyCScaleBiasNHWCKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,sSize,cSize);
+    }
+    else if(activation == ACTIVATION_RELU) {
+      if(isHalf)
+        applyCScaleBiasNHWCReluHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,sSize,cSize);
+      else
+        applyCScaleBiasNHWCReluKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,sSize,cSize);
+    }
+    else if(activation == ACTIVATION_MISH) {
+      if(isHalf)
+        applyCScaleBiasNHWCMishHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,sSize,cSize);
+      else
+        applyCScaleBiasNHWCMishKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,sSize,cSize);
+    }
+    else if(activation == ACTIVATION_MISH_SCALE8) {
+      if(isHalf)
+        applyCScaleBiasNHWCMishScale8HalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,sSize,cSize);
+      else
+        applyCScaleBiasNHWCMishScale8Kernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,sSize,cSize);
+    }
+    else {
+      throw std::runtime_error("customCudaApplyCScaleBiasNHWC: unsupported activation");
+    }
+  }
+  else {
+    if(activation == ACTIVATION_IDENTITY) {
+      if(isHalf)
+        applyCScaleBiasNHWCMaskHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,(const half*)mask,sSize,cSize);
+      else
+        applyCScaleBiasNHWCMaskKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,(const float*)mask,sSize,cSize);
+    }
+    else if(activation == ACTIVATION_RELU) {
+      if(isHalf)
+        applyCScaleBiasNHWCReluMaskHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,(const half*)mask,sSize,cSize);
+      else
+        applyCScaleBiasNHWCReluMaskKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,(const float*)mask,sSize,cSize);
+    }
+    else if(activation == ACTIVATION_MISH) {
+      if(isHalf)
+        applyCScaleBiasNHWCMishMaskHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,(const half*)mask,sSize,cSize);
+      else
+        applyCScaleBiasNHWCMishMaskKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,(const float*)mask,sSize,cSize);
+    }
+    else if(activation == ACTIVATION_MISH_SCALE8) {
+      if(isHalf)
+        applyCScaleBiasNHWCMishScale8MaskHalfKernel<<<grid,threads>>>((const half*)in,(half*)out,(const half*)scale,(const half*)biases,(const half*)mask,sSize,cSize);
+      else
+        applyCScaleBiasNHWCMishScale8MaskKernel<<<grid,threads>>>((const float*)in,(float*)out,(const float*)scale,(const float*)biases,(const float*)mask,sSize,cSize);
+    }
+    else {
+      throw std::runtime_error("customCudaApplyCScaleBiasNHWC: unsupported activation");
+    }
+  }
+}
+
+void customCudaApplyCScaleBiasNHWC(const float* in, float* out, const float* scale, const float* biases, const float* mask, int nSize, int xySize, int cSize, int activation) {
+  sharedApplyCScaleBiasNHWC(in,out,scale,biases,mask,nSize,xySize,cSize,false,activation);
+}
+void customCudaApplyCScaleBiasNHWC(const half* in, half* out, const half* scale, const half* biases, const half* mask, int nSize, int xySize, int cSize, int activation) {
+  sharedApplyCScaleBiasNHWC(in,out,scale,biases,mask,nSize,xySize,cSize,true,activation);
+}
diff --git a/cpp/neuralnet/rocmincludes.h b/cpp/neuralnet/rocmincludes.h
new file mode 100644
index 000000000..8b494a37e
--- /dev/null
+++ b/cpp/neuralnet/rocmincludes.h
@@ -0,0 +1,15 @@
+#ifndef NEURALNET_ROCMINCLUDES_H
+#define NEURALNET_ROCMINCLUDES_H
+
+//Ensure that CUDA_API_PER_THREAD_DEFAULT_STREAM is always defined
+//before any cuda headers are included so that we get the desired threading behavior for CUDA.
+
+#define CUDA_API_PER_THREAD_DEFAULT_STREAM
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+
+#include <hipblas/hipblas.h>
+#include <miopen/miopen.h>
+
+
+#endif //NEURALNET_ROCMINCLUDES_H
diff --git a/cpp/neuralnet/rocmutils.cpp b/cpp/neuralnet/rocmutils.cpp
new file mode 100644
index 000000000..752298b7f
--- /dev/null
+++ b/cpp/neuralnet/rocmutils.cpp
@@ -0,0 +1,170 @@
+#include "../neuralnet/rocmutils.h"
+
+#include <iomanip>
+#include "../neuralnet/rocmerrorcheck.h"
+#include "../neuralnet/rocmincludes.h"
+#include "../neuralnet/rocmhelpers.h"
+
+#include "../external/half-2.2.0/include/half.hpp"
+
+//------------------------
+#include "../core/using.h"
+//------------------------
+
+using half_t = half_float::half;
+
+void CudaUtils::mallocOnDevice(const string& name, int numWeights, void*& deviceBuf, bool useFP16) {
+  if(useFP16) {
+    size_t halfBytes = numWeights * sizeof(half_t);
+    CUDA_ERR(name.c_str(),hipMalloc(&deviceBuf, halfBytes));
+  }
+  else {
+    size_t floatBytes = numWeights * sizeof(float);
+    CUDA_ERR(name.c_str(),hipMalloc(&deviceBuf, floatBytes));
+  }
+}
+
+void CudaUtils::mallocAndCopyToDevice(const string& name, const vector<float>& weights, void*& deviceBuf, bool useFP16) {
+  size_t numWeights = weights.size();
+  if(useFP16) {
+    size_t halfBytes = numWeights * sizeof(half_t);
+    vector<half_t> weightsHalf(weights.size());
+    for(size_t i = 0; i<weights.size(); i++)
+      weightsHalf[i] = half_float::half_cast<half_t>(weights[i]);
+    CUDA_ERR(name.c_str(),hipMalloc(&deviceBuf, halfBytes));
+    CUDA_ERR(name.c_str(),hipMemcpy(deviceBuf, weightsHalf.data(), halfBytes, hipMemcpyHostToDevice));
+  }
+  else {
+    size_t floatBytes = numWeights * sizeof(float);
+    CUDA_ERR(name.c_str(),hipMalloc(&deviceBuf, floatBytes));
+    CUDA_ERR(name.c_str(),hipMemcpy(deviceBuf, weights.data(), floatBytes, hipMemcpyHostToDevice));
+  }
+}
+
+void CudaUtils::mallocAndCopyToDevice(const string& name, const float* weights, int numWeights, void*& deviceBuf, bool useFP16) {
+  if(useFP16) {
+    size_t halfBytes = numWeights * sizeof(half_t);
+    vector<half_t> weightsHalf(numWeights);
+    for(int i = 0; i<numWeights; i++)
+      weightsHalf[i] = half_float::half_cast<half_t>(weights[i]);
+    CUDA_ERR(name.c_str(),hipMalloc(&deviceBuf, halfBytes));
+    CUDA_ERR(name.c_str(),hipMemcpy(deviceBuf, weightsHalf.data(), halfBytes, hipMemcpyHostToDevice));
+  }
+  else {
+    size_t floatBytes = numWeights * sizeof(float);
+    CUDA_ERR(name.c_str(),hipMalloc(&deviceBuf, floatBytes));
+    CUDA_ERR(name.c_str(),hipMemcpy(deviceBuf, weights, floatBytes, hipMemcpyHostToDevice));
+  }
+}
+
+//Only use in testing, allocates an intermediate buffer in the case of FP16 which will be very slow.
+void CudaUtils::expensiveCopyFromDevice(const string& name, float* weights, int numWeights, const void* deviceBuf, bool useFP16) {
+  if(useFP16) {
+    vector<half_t> weightsHalf(numWeights);
+    size_t halfBytes = numWeights * sizeof(half_t);
+    CUDA_ERR(name.c_str(),hipMemcpy(weightsHalf.data(), deviceBuf, halfBytes, hipMemcpyDeviceToHost));
+    for(int i = 0; i<numWeights; i++)
+      weights[i] = weightsHalf[i];
+  }
+  else {
+    size_t floatBytes = numWeights * sizeof(float);
+    CUDA_ERR(name.c_str(),hipMemcpy(weights, deviceBuf, floatBytes, hipMemcpyDeviceToHost));
+  }
+}
+
+void CudaUtils::debugPrint2D(const string& name, const void* deviceBuf, int batchSize, int cSize, bool useFP16) {
+  vector<float> values(batchSize * cSize);
+  expensiveCopyFromDevice(name, values.data(), values.size(), deviceBuf, useFP16);
+  cout << "=========================================================" << endl;
+  cout << "TENSOR" << endl;
+  cout << name << endl;
+  cout << std::setprecision(8);
+  int i = 0;
+  for(int n = 0; n<batchSize; n++) {
+    cout << "-(n=" << n << ")--------------------" << endl;
+    for(int c = 0; c<cSize; c++)
+      cout << values[i++] << " ";
+    cout << endl;
+  }
+  cout << endl;
+  cout << "=========================================================" << endl;
+}
+
+void CudaUtils::debugPrint4D(const string& name, const void* deviceBuf, int batchSize, int cSize, int xSize, int ySize, bool useNHWC, bool useFP16) {
+  vector<float> values(batchSize * cSize * xSize * ySize);
+  expensiveCopyFromDevice(name, values.data(), values.size(), deviceBuf, useFP16);
+  cout << "=========================================================" << endl;
+  cout << "TENSOR" << endl;
+  cout << name << endl;
+  cout << std::setprecision(8);
+  int i = 0;
+  double total1 = 0;
+  double total2 = 0;
+  double total3 = 0;
+  for(int n = 0; n<batchSize; n++) {
+    cout << "-(n=" << n << ")--------------------" << endl;
+    if(useNHWC) {
+      for(int y = 0; y<ySize; y++) {
+        cout << "(y=" << y << ")" << endl;
+        for(int x = 0; x<xSize; x++) {
+          for(int c = 0; c<cSize; c++) {
+            float value = values[i++];
+            total1 += (((c + y / 2 + x / 3 + n / 4) % 2)*2-1) * value;
+            total2 += (((c + y / 3 + x / 1 + n / 3) % 2)*2-1) * value;
+            total3 += (((c + y / 5 + x / 2 + n / 2) % 2)*2-1) * value;
+            cout << value << " ";
+          }
+          cout << endl;
+        }
+        cout << endl;
+      }
+    }
+    else {
+      for(int c = 0; c<cSize; c++) {
+        cout << "(c=" << c << ")" << endl;
+        for(int y = 0; y<ySize; y++) {
+          for(int x = 0; x<xSize; x++) {
+            float value = values[i++];
+            total1 += (((c + y / 2 + x / 3 + n / 4) % 2)*2-1) * value;
+            total2 += (((c + y / 3 + x / 1 + n / 3) % 2)*2-1) * value;
+            total3 += (((c + y / 5 + x / 2 + n / 2) % 2)*2-1) * value;
+            cout << value << " ";
+          }
+          cout << endl;
+        }
+        cout << endl;
+      }
+    }
+  }
+  cout << "TOTAL " << total1 << " " << total2 << " " << total3 << endl;
+  cout << "=========================================================" << endl;
+}
+
+void CudaUtils::checkBufferSize(int batchSize, int xSize, int ySize, int channels) {
+  if((int64_t)batchSize * xSize * ySize * channels >= (int64_t)1 << 31)
+    throw StringError("Batch size too large, resulting GPU buffers might exceed 2^31 entries which is not currently supported");
+}
+
+void CudaUtils::hostMallocZeroOneBufs(void*& zeroBuf, void*& oneBuf, bool useFP16) {
+  if(!useFP16) {
+    zeroBuf = malloc(sizeof(float));
+    oneBuf = malloc(sizeof(float));
+    *((float*)zeroBuf) = 0.0f;
+    *((float*)oneBuf) = 1.0f;
+  }
+  else {
+    //Convert to FP16 on the device, then copy back so we have it in host memory
+    float zero = 0.0f;
+    float one = 1.0f;
+    void* zeroTmp;
+    void* oneTmp;
+    mallocAndCopyToDevice("Buffers",&zero,1,zeroTmp,useFP16);
+    mallocAndCopyToDevice("Buffers",&one,1,oneTmp,useFP16);
+    zeroBuf = malloc(sizeof(half_t));
+    oneBuf = malloc(sizeof(half_t));
+    CUDA_ERR("Buffers",hipMemcpy(zeroBuf,zeroTmp,sizeof(half_t),hipMemcpyDeviceToHost));
+    CUDA_ERR("Buffers",hipMemcpy(oneBuf,oneTmp,sizeof(half_t),hipMemcpyDeviceToHost));
+    hipFree(zeroTmp);
+    hipFree(oneTmp);
+  }
+}
diff --git a/cpp/neuralnet/rocmutils.h b/cpp/neuralnet/rocmutils.h
new file mode 100644
index 000000000..d43868402
--- /dev/null
+++ b/cpp/neuralnet/rocmutils.h
@@ -0,0 +1,21 @@
+#ifndef NEURALNET_ROCMUTILS_H
+#define NEURALNET_ROCMUTILS_H
+
+#include "../core/global.h"
+
+namespace CudaUtils {
+  void mallocOnDevice(const std::string& name, int numWeights, void*& deviceBuf, bool useFP16);
+  void mallocAndCopyToDevice(const std::string& name, const std::vector<float>& weights, void*& deviceBuf, bool useFP16);
+  void mallocAndCopyToDevice(const std::string& name, const float* weights, int numWeights, void*& deviceBuf, bool useFP16);
+
+  //Only use in testing, allocates an intermediate buffer in the case of FP16 which will be very slow.
+  void expensiveCopyFromDevice(const std::string& name, float* weights, int numWeights, const void* deviceBuf, bool useFP16);
+
+  void debugPrint2D(const std::string& name, const void* deviceBuf, int batchSize, int cSize, bool useFP16);
+  void debugPrint4D(const std::string& name, const void* deviceBuf, int batchSize, int cSize, int xSize, int ySize, bool useNHWC, bool useFP16);
+
+  void checkBufferSize(int batchSize, int xSize, int ySize, int channels);
+  void hostMallocZeroOneBufs(void*& zeroBuf, void*& oneBuf, bool useFP16);
+}
+
+#endif // NEURALNET_ROCMUTILS_H

From b4555304ee827059fda2ee1fbbad323e0e18e717 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Mon, 28 Jul 2025 20:28:51 +0200
Subject: [PATCH 02/24] Fix bugs

---
 cpp/CMakeLists.txt            | 33 +++++++++++++++------------------
 cpp/neuralnet/rocmbackend.cpp |  2 +-
 2 files changed, 16 insertions(+), 19 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e12b7e41b..13c1f3955 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -32,7 +32,6 @@ endif()
 set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training")
 set(USE_BACKEND CACHE STRING "Neural net backend")
 string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
-# set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN)
 set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN ROCM)
 
 set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
@@ -140,40 +139,39 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
   set(NEURALNET_BACKEND_SOURCES
     neuralnet/eigenbackend.cpp
     )
-# --------------------------- ROCM 后端（AMD GPU / HIP  MIOpen） ---------------------------
+# --------------------------- ROCM backend（AMD GPU / HIP  MIOpen） ---------------------------
 elseif(USE_BACKEND STREQUAL "ROCM")
   message(STATUS "-DUSE_BACKEND=ROCM, using AMD ROCm backend.")
 
-  # 1) 启用 HIP 语言（.hip / .cpp 均可）并指定 C++17
   enable_language(HIP)
   set(CMAKE_HIP_STANDARD 17)
 
   if(CMAKE_PREFIX_PATH STREQUAL "" OR NOT DEFINED CMAKE_PREFIX_PATH)
     if(DEFINED ENV{HIP_PATH})
-      # Windows HIP‑SDK 或自定义安装
+      # Windows HIP‑SDK
       list(APPEND CMAKE_PREFIX_PATH $ENV{HIP_PATH})
       message(STATUS "Auto‑detected HIP_PATH=$ENV{HIP_PATH} → CMAKE_PREFIX_PATH")
     elseif(EXISTS "/opt/rocm")
-      # Linux 默认路径
+      # Linux
       list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
       message(STATUS "CMAKE_PREFIX_PATH not given; defaulting to /opt/rocm")
     endif()
   endif()
 
-  # 可让用户用 -DCMAKE_HIP_ARCHITECTURES=gfx90a;gfx942 手动指定 GFX 架构
+  # Users can -DCMAKE_HIP_ARCHITECTURES=gfx90a;gfx942 manually specify GFX architectures
   if(NOT DEFINED CMAKE_HIP_ARCHITECTURES)
-    # 默认同时编译常见 MI200 / RDNA3 卡，可按需精简
+    # Default compile MI200 / RDNA3 cards, can be simplified as needed
     set(CMAKE_HIP_ARCHITECTURES 90a 942 908 1100 1101 1200 1201 CACHE STRING "AMD GPU targets")
   endif()
 
-  # 2) 指定后端源码。rocmhelpers.hip 里是 GPU‑kernel，别漏了
+  # 2) Specify backend source code. rocmhelpers.hip contains GPU kernels, don't forget it
   set(NEURALNET_BACKEND_SOURCES
     neuralnet/rocmbackend.cpp
     neuralnet/rocmutils.cpp
     neuralnet/rocmhelpers.hip
   )
 
-  # 可选：启用 model-size‑based autotuning等额外宏
+  # Optional: Enable model-size‑based autotuning and other macros
   # add_compile_definitions(HIP_SUPPORTS_FP16)
 
 elseif(USE_BACKEND STREQUAL "")
@@ -455,9 +453,9 @@ elseif(USE_BACKEND STREQUAL "OPENCL")
     link_directories(${OpenCL_LIBRARY})
     target_link_libraries(katago ${OpenCL_LIBRARY})
   endif()
-# --------------------------- ROCM 链接阶段 ---------------------------
+# --------------------------- ROCM linking stage ---------------------------
 elseif(USE_BACKEND STREQUAL "ROCM")
-  # 宏：源代码里用 #ifdef USE_ROCM_BACKEND 判断
+  # Macro: used in source code with #ifdef USE_ROCM_BACKEND
   target_compile_definitions(katago PRIVATE USE_ROCM_BACKEND)
   target_compile_definitions(katago PRIVATE HIP_TARGET_VERSION=${CMAKE_HIP_COMPILER_VERSION})
 
@@ -467,12 +465,11 @@ elseif(USE_BACKEND STREQUAL "ROCM")
     message(STATUS "Detected FP16‑capable GFX arch (${CMAKE_HIP_ARCHITECTURES}); defining HIP_SUPPORTS_FP16")
   endif()
 
-  # 3) 找到 ROCm 运行时 & 库。自 ROCm 6.x 起都带 CMake config‑mode 包
-  #    如若找不到，加 -DCMAKE_PREFIX_PATH=/opt/rocm
-  find_package(hip        QUIET CONFIG)   # 导出 hip::device / hip::host
-  find_package(hipblas    QUIET CONFIG)   # 导出 roc::hipblas
-  find_package(miopen     QUIET CONFIG)   # 导出 roc::miopen
-  # ---------- fallback：HIP 运行时 ----------
+  # 3) Find ROCm runtime & libraries. Since ROCm 6.x, CMake config-mode packages are included. If not found, add -DCMAKE_PREFIX_PATH=/opt/rocm
+  find_package(hip        QUIET CONFIG)   # Export hip::device / hip::host
+  find_package(hipblas    QUIET CONFIG)   # Export roc::hipblas
+  find_package(miopen     QUIET CONFIG)   # Export roc::miopen
+  # ---------- fallback：HIP Runtime ----------
   if(NOT hip_FOUND)
     find_path(HIP_INCLUDE_DIR hip/hip_runtime.h
               HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
@@ -507,7 +504,7 @@ elseif(USE_BACKEND STREQUAL "ROCM")
     endif()
   endforeach()
 
-  # 4) 头文件路径已由 config‑mode target 解决，无需硬编码
+  # 4) Header file paths are resolved by config-mode targets, no need to hard-code
   target_link_libraries(katago
     hip::device          # HIP runtime & kernel offload
     roc::hipblas         # BLAS
diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index 11489e85a..f8f80a9a1 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -1,5 +1,5 @@
 #include "hip/hip_runtime.h"
-// #ifdef USE_ROCM_BACKEND
+#ifdef USE_ROCM_BACKEND
 #include <map>
 #include <string>
 #include <vector>

From 8b30cb965f3586151a0cd517ec6cc63bd8ac0946 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Thu, 31 Jul 2025 23:53:18 +0200
Subject: [PATCH 03/24] Update

---
 cpp/CMakeLists.txt             |   8 +-
 cpp/command/benchmark.cpp      |   5 +
 cpp/neuralnet/rocmbackend.cpp  | 178 ++++++++++++++++++++-------------
 cpp/neuralnet/rocmerrorcheck.h |   4 +-
 cpp/neuralnet/rocmhelpers.hip  |   2 +-
 cpp/program/gtpconfig.cpp      |   3 +
 cpp/program/setup.cpp          |   3 +
 7 files changed, 131 insertions(+), 72 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 13c1f3955..471a67a5f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,6 +1,10 @@
 cmake_minimum_required(VERSION 3.18.2)
 if(USE_BACKEND STREQUAL "METAL")
   project(katago LANGUAGES CXX Swift)
+elseif(USE_BACKEND STREQUAL "ROCM")
+  set(CMAKE_C_COMPILER  /opt/rocm/bin/hipcc CACHE FILEPATH "" FORCE)
+  set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc CACHE FILEPATH "" FORCE)
+  project(katago LANGUAGES C CXX HIP)
 else()
   project(katago)
 endif()
@@ -509,7 +513,7 @@ elseif(USE_BACKEND STREQUAL "ROCM")
     hip::device          # HIP runtime & kernel offload
     roc::hipblas         # BLAS
     MIOpen
-    roc::miopen          # DNN primitives
+    # roc::miopen          # DNN primitives
   )
 elseif(USE_BACKEND STREQUAL "EIGEN")
   target_compile_definitions(katago PRIVATE USE_EIGEN_BACKEND)
@@ -640,7 +644,7 @@ if(MSVC)
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /STACK:8388608")
 elseif(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
   message(STATUS "Setting up build for GNU, Clang or MinGW.")
-  if(NOT (${CMAKE_SYSTEM_PROCESSOR} MATCHES "(arm|aarch32|aarch64)"))
+  if(NOT (${CMAKE_SYSTEM_PROCESSOR} MATCHES "(arm|aarch32|aarch64)") AND NOT USE_BACKEND STREQUAL "ROCM")
     set(CMAKE_CXX_FLAGS  "${CMAKE_CXX_FLAGS} -mfpmath=sse")
   else()
     # For ARM architecture, as a hack, ensure that char is signed
diff --git a/cpp/command/benchmark.cpp b/cpp/command/benchmark.cpp
index 949a436fc..cf87303eb 100644
--- a/cpp/command/benchmark.cpp
+++ b/cpp/command/benchmark.cpp
@@ -265,6 +265,11 @@ int MainCmds::benchmark(const vector<string>& args) {
   cout << "If you have a strong GPU capable of FP16 tensor cores (e.g. RTX2080), "
        << "using the Cuda version of KataGo instead may give a mild performance boost." << endl;
 #endif
+#ifdef USE_ROCM_BACKEND
+  cout << "You are currently using the ROCm version of KataGo." << endl;
+  cout << "If you have a strong GPU capable of FP16 tensor cores (e.g. RX6900XT), "
+       << "using the ROCm version of KataGo instead may give a mild performance boost." << endl;
+#endif
 #ifdef USE_EIGEN_BACKEND
   cout << "You are currently using the Eigen (CPU) version of KataGo. Due to having no GPU, it may be slow." << endl;
 #endif
diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index f8f80a9a1..59022e2b7 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -1,5 +1,5 @@
-#include "hip/hip_runtime.h"
 #ifdef USE_ROCM_BACKEND
+#include "hip/hip_runtime.h"
 #include <map>
 #include <string>
 #include <vector>
@@ -41,7 +41,7 @@ void NeuralNet::globalCleanup() {
 
 struct CudaHandles {
   hipblasHandle_t cublas;
-  miopenStatus_t cudnn;
+  miopenHandle_t cudnn;
   const int majorComputeCapability;
   const int minorComputeCapability;
 
@@ -142,38 +142,38 @@ struct ByBatchSizeView {
 //channels, useFP16, useNHWC
 typedef std::tuple<int, bool, bool> CudnnTensorDesc4DKey;
 
-struct CudnnTensorDesc4DKey {
-  int channels;
-  bool useFP16;
-  bool useNHWC;
-  bool operator<(const CudnnTensorDesc4DKey& other) const {
-    return std::tie(channels, useFP16, useNHWC) <
-           std::tie(other.channels, other.useFP16, other.useNHWC);
-  }
-};
-
-template <typename T>
-struct ByBatchSize {
-  explicit ByBatchSize(int max)
-      : data(max + 1), destroyFunc(nullptr) {}
-  ~ByBatchSize() {
-    if (destroyFunc) {
-      for (auto& d : data) {
-        if (d) destroyFunc(d);
-      }
-    }
-  }
-  T& operator[](int idx) { return data[idx]; }
-  std::vector<T> data;
-  miopenStatus_t (*destroyFunc)(T) = nullptr;
-};
-
-template <typename T>
-struct ByBatchSizeView {
-  explicit ByBatchSizeView(ByBatchSize<T>& ref) : ref(ref) {}
-  T& operator[](int idx) { return ref[idx]; }
-  ByBatchSize<T>& ref;
-};
+// struct CudnnTensorDesc4DKey {
+//   int channels;
+//   bool useFP16;
+//   bool useNHWC;
+//   bool operator<(const CudnnTensorDesc4DKey& other) const {
+//     return std::tie(channels, useFP16, useNHWC) <
+//            std::tie(other.channels, other.useFP16, other.useNHWC);
+//   }
+// };
+
+// template <typename T>
+// struct ByBatchSize {
+//   explicit ByBatchSize(int max)
+//       : data(max + 1), destroyFunc(nullptr) {}
+//   ~ByBatchSize() {
+//     if (destroyFunc) {
+//       for (auto& d : data) {
+//         if (d) destroyFunc(d);
+//       }
+//     }
+//   }
+//   T& operator[](int idx) { return data[idx]; }
+//   std::vector<T> data;
+//   miopenStatus_t (*destroyFunc)(T) = nullptr;
+// };
+
+// template <typename T>
+// struct ByBatchSizeView {
+//   explicit ByBatchSizeView(ByBatchSize<T>& ref) : ref(ref) {}
+//   T& operator[](int idx) { return ref[idx]; }
+//   ByBatchSize<T>& ref;
+// };
 
 // -----------------------------------------------------------------------------
 //                                CudnnManager
@@ -356,14 +356,28 @@ struct ConvLayer {
     bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
 
     CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&filterDescriptor));
-    CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
-      filterDescriptor,
-      (useFP16 ? miopenHalf : miopenFloat),
-      outChannels,
-      inChannels,
-      convYSize,
-      convXSize
-    ));
+    int lens[4];
+    if (filterNHWC) {          // cuDNN 的 OHWI
+        lens[0] = outChannels; // O
+        lens[1] = convYSize;   // H
+        lens[2] = convXSize;   // W
+        lens[3] = inChannels;  // I
+        CUDNN_ERR(name.c_str(),miopenSetNdTensorDescriptorWithLayout(
+            filterDescriptor,
+            useFP16 ? miopenHalf : miopenFloat,
+            miopenTensorNHWC,               // 指定布局
+            lens,
+            4));
+    } else {
+        CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
+          filterDescriptor,
+          (useFP16 ? miopenHalf : miopenFloat),
+          outChannels,
+          inChannels,
+          convYSize,
+          convXSize
+        )); // cuDNN 的 OIHW
+    }// cuDNN 的 OIHW
 
     int yStride = 1;
     int xStride = 1;
@@ -383,21 +397,38 @@ struct ConvLayer {
     ));
     if(useFP16) {
       int alt = 1; // non‑zero enables alt‑impl on MI2xx+ GPUs
-      miopenSetConvolutionAttribute(convolutionDescriptor,
+      CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,
                                     MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,
-                                    alt);
+                                    alt));
     }
 
     convolutionAlgorithms = new ByBatchSize<miopenConvFwdAlgorithm_t >(maxBatchSize);
 
         for(int batchSize = 1; batchSize <= maxBatchSize; ++batchSize) {
       if(useFP16 && dilationX <= 1 && dilationY <= 1) {
-        (*convolutionAlgorithms)[batchSize] = miopenConvolutionFwdAlgoImplicitGEMM;
+        // 手动填充最简单的 Perf 结构体
+        miopenConvAlgoPerf_t perf = {};
+        perf.fwd_algo      = miopenConvolutionFwdAlgoImplicitGEMM;   // 固定算法
+        perf.memory        = 0;     // 需 0 workspace
+        perf.time          = 0.0f;  // 不做基准
+        (*convolutionAlgorithms)[batchSize] = perf;
+        continue;
       }
       else {
-        (*convolutionAlgorithms)[batchSize] = miopenConvolutionFwdAlgoDirect;
-        // If desired, call miopenFindConvolutionForwardAlgorithm() here once you
-        // have real device buffers to auto‑tune. See porting notes.
+        miopenConvAlgoPerf_t perfResults[4];
+        int returnedAlgoCount = 0;
+        CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
+            handle,
+            xDesc,  inputBuf,
+            wDesc,  filterBuf,
+            convDesc,
+            yDesc,  outputBuf,
+            /*requestAlgoCount=*/1,              // 只要最快
+            &returnedAlgoCount,
+            perfResults,
+            workspaceBuf,
+            wsSize,
+            /*exhaustiveSearch=*/true));
       }
     }
 
@@ -643,21 +674,34 @@ struct MatMulLayer {
       ));
     }
     else {
-      const half* alpha = (const half*)scratch->oneBuf;
-      const half* beta = (const half*)scratch->zeroBuf;
-      CUBLAS_ERR(name.c_str(),hipblasHgemm(
+      // const half* alpha = (const half*)scratch->oneBuf;
+      // const half* beta = (const half*)scratch->zeroBuf;
+      // CUBLAS_ERR(name.c_str(),hipblasHgemm(
+      //   cudaHandles->cublas,
+      //   HIPBLAS_OP_N,
+      //   HIPBLAS_OP_N,
+      //   outChannels,
+      //   batchSize,
+      //   inChannels,
+      //   alpha,
+      //   (const half*)matBuf,outChannels,
+      //   (const half*)inputBuf,inChannels,
+      //   beta,
+      //   (half*)outputBuf,outChannels
+      // ));
+      static const half alpha_h = half(1.0f);
+      static const half beta_h  = half(0.0f);
+      CUBLAS_ERR(name.c_str(), hipblasGemmEx(
         cudaHandles->cublas,
-        HIPBLAS_OP_N,
-        HIPBLAS_OP_N,
-        outChannels,
-        batchSize,
-        inChannels,
-        alpha,
-        (const half*)matBuf,outChannels,
-        (const half*)inputBuf,inChannels,
-        beta,
-        (half*)outputBuf,outChannels
-      ));
+        HIPBLAS_OP_N, HIPBLAS_OP_N,
+        outChannels, batchSize, inChannels,
+        &alpha_h,
+        (const half*)matBuf,   HIPBLAS_R_16F, outChannels,
+        (const half*)inputBuf, HIPBLAS_R_16F, inChannels,
+        &beta_h,
+        (half*)outputBuf,      HIPBLAS_R_16F, outChannels,
+        HIPBLAS_R_16F,               /* compute_type */
+        HIPBLAS_GEMM_DEFAULT));      /* algo */
     }
 
   }
@@ -2365,7 +2409,7 @@ ComputeHandle* NeuralNet::createComputeHandle(
   //Old GPUs - use FP32 and explicitly fail if FP16 enabled
   if(prop.major < 5 || (prop.major == 5 && prop.minor < 3)) {
     if(context->useFP16Mode == enabled_t::True)
-      throw StringError("Cuda device versions below 5.3 do not support useFP16=true");
+      throw StringError("ROCm device versions below 6.0 do not support useFP16=true");
     if(context->useNHWCMode == enabled_t::True)
       useNHWC = true;
   }
@@ -2395,18 +2439,18 @@ ComputeHandle* NeuralNet::createComputeHandle(
 
   if(logger != NULL) {
     logger->write(
-      "Cuda backend thread " + Global::intToString(serverThreadIdx) + ": Found GPU " + string(prop.name)
+      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Found GPU " + string(prop.name)
       + " memory " + Global::uint64ToString(prop.totalGlobalMem)
       + " compute capability major " + Global::intToString(prop.major)
       + " minor " + Global::intToString(prop.minor)
     );
     logger->write(
-      "Cuda backend thread " + Global::intToString(serverThreadIdx) + ": Model version " + Global::intToString(loadedModel->modelDesc.modelVersion) +
+      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Model version " + Global::intToString(loadedModel->modelDesc.modelVersion) +
       " useFP16 = " + Global::boolToString(useFP16) +
       " useNHWC = " + Global::boolToString(useNHWC)
     );
     logger->write(
-      "Cuda backend thread " + Global::intToString(serverThreadIdx) + ": Model name: " + loadedModel->modelDesc.name
+      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Model name: " + loadedModel->modelDesc.name
     );
   }
 
@@ -2432,7 +2476,7 @@ void NeuralNet::printDevices() {
   for(int i = 0; i<numDevices; i++) {
     hipDeviceProp_t prop;
     hipGetDeviceProperties(&prop, i);
-    cout << "Found CUDA device " << i << ": " << prop.name << endl;
+    cout << "Found ROCm device " << i << ": " << prop.name << endl;
   }
 }
 
diff --git a/cpp/neuralnet/rocmerrorcheck.h b/cpp/neuralnet/rocmerrorcheck.h
index 049f1ae95..8cb214ad7 100644
--- a/cpp/neuralnet/rocmerrorcheck.h
+++ b/cpp/neuralnet/rocmerrorcheck.h
@@ -13,7 +13,7 @@ static inline void checkCudaError(hipError_t status,
   if(status != hipSuccess)
     throw StringError(std::string("HIP Error @") + opName + " " +
                       file + ":" + func + ":" + Global::intToString(line) +
-                      " : " + cudaGetErrorString(status));
+                      " : " + hipGetErrorString(status));
 }
 #define CUDA_ERR(opName,x)   checkCudaError((x),opName,__FILE__,#x,__LINE__)
 
@@ -52,7 +52,7 @@ static inline void checkCudnnError(miopenStatus_t status,
   if(status != miopenStatusSuccess)
     throw StringError(std::string("MIOpen Error @") + opName + " " +
                       file + ":" + func + ":" + Global::intToString(line) +
-                      " : " + cudnnGetErrorString(status));
+                      " : " + miopenGetErrorString(status));
 }
 #define CUDNN_ERR(opName,x) checkCudnnError((x),opName,__FILE__,#x,__LINE__)
 
diff --git a/cpp/neuralnet/rocmhelpers.hip b/cpp/neuralnet/rocmhelpers.hip
index 2f9b94951..730b37361 100644
--- a/cpp/neuralnet/rocmhelpers.hip
+++ b/cpp/neuralnet/rocmhelpers.hip
@@ -4,7 +4,7 @@
 
 #include <stdexcept>
 
-#if defined(__HIP_ARCH_HAS_FP16__) || (defined(__HIP_DEVICE_COMPILE__) && (__HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || ...))
+#if defined(__HIP_ARCH_HAS_FP16__) || (defined(__HIP_DEVICE_COMPILE__) && (__HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__))
 #define HIP_SUPPORTS_FP16
 #endif
 
diff --git a/cpp/program/gtpconfig.cpp b/cpp/program/gtpconfig.cpp
index 7a45c02de..d8f1decf3 100644
--- a/cpp/program/gtpconfig.cpp
+++ b/cpp/program/gtpconfig.cpp
@@ -535,6 +535,9 @@ string GTPConfig::makeConfig(
 #endif
 #ifdef USE_OPENCL_BACKEND
       replacement += "openclDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
+#endif
+#ifdef USE_ROCM_BACKEND
+      replacement += "rocmDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
 #endif
     }
     replace("$$MULTIPLE_GPUS", replacement);
diff --git a/cpp/program/setup.cpp b/cpp/program/setup.cpp
index e0f6e6ced..9c423771b 100644
--- a/cpp/program/setup.cpp
+++ b/cpp/program/setup.cpp
@@ -19,6 +19,7 @@ std::vector<std::string> Setup::getBackendPrefixes() {
   prefixes.push_back("trt");
   prefixes.push_back("metal");
   prefixes.push_back("opencl");
+  prefixes.push_back("rocm");
   prefixes.push_back("eigen");
   prefixes.push_back("dummybackend");
   return prefixes;
@@ -86,6 +87,8 @@ vector<NNEvaluator*> Setup::initializeNNEvaluators(
   string backendPrefix = "metal";
   #elif defined(USE_OPENCL_BACKEND)
   string backendPrefix = "opencl";
+  #elif defined(USE_ROCM_BACKEND)
+  string backendPrefix = "rocm";
   #elif defined(USE_EIGEN_BACKEND)
   string backendPrefix = "eigen";
   #else

From 570ced01af2dc02cc09bbef6ec0bc51c8dcf6c10 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Fri, 1 Aug 2025 18:48:11 +0200
Subject: [PATCH 04/24] Fix bugs

---
 cpp/neuralnet/rocmbackend.cpp | 348 +++++++++++++---------------------
 cpp/program/setup.cpp         |   2 +-
 2 files changed, 134 insertions(+), 216 deletions(-)

diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index 59022e2b7..a6bc8862c 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -7,9 +7,9 @@
 
 #include "../neuralnet/rocmerrorcheck.h"
 #include "../neuralnet/rocmincludes.h"
+
 #include "../neuralnet/rocmhelpers.h"
 #include "../neuralnet/rocmutils.h"
-
 #include "../neuralnet/modelversion.h"
 #include "../neuralnet/nninterface.h"
 #include "../neuralnet/nninputs.h"
@@ -142,105 +142,54 @@ struct ByBatchSizeView {
 //channels, useFP16, useNHWC
 typedef std::tuple<int, bool, bool> CudnnTensorDesc4DKey;
 
-// struct CudnnTensorDesc4DKey {
-//   int channels;
-//   bool useFP16;
-//   bool useNHWC;
-//   bool operator<(const CudnnTensorDesc4DKey& other) const {
-//     return std::tie(channels, useFP16, useNHWC) <
-//            std::tie(other.channels, other.useFP16, other.useNHWC);
-//   }
-// };
-
-// template <typename T>
-// struct ByBatchSize {
-//   explicit ByBatchSize(int max)
-//       : data(max + 1), destroyFunc(nullptr) {}
-//   ~ByBatchSize() {
-//     if (destroyFunc) {
-//       for (auto& d : data) {
-//         if (d) destroyFunc(d);
-//       }
-//     }
-//   }
-//   T& operator[](int idx) { return data[idx]; }
-//   std::vector<T> data;
-//   miopenStatus_t (*destroyFunc)(T) = nullptr;
-// };
-
-// template <typename T>
-// struct ByBatchSizeView {
-//   explicit ByBatchSizeView(ByBatchSize<T>& ref) : ref(ref) {}
-//   T& operator[](int idx) { return ref[idx]; }
-//   ByBatchSize<T>& ref;
-// };
-
-// -----------------------------------------------------------------------------
-//                                CudnnManager
-// -----------------------------------------------------------------------------
 struct CudnnManager {
-  const std::string name;
+  const string name;
   const int maxBatchSize;
   const int nnXLen;
   const int nnYLen;
-  std::map<CudnnTensorDesc4DKey, ByBatchSize<miopenTensorDescriptor_t>*>
-      tensorDesc4DByBatchSizeByKey;
-
-  CudnnManager(std::string name_, int maxBatchSize_, int nnXLen_, int nnYLen_)
-      : name(std::move(name_)),
-        maxBatchSize(maxBatchSize_),
-        nnXLen(nnXLen_),
-        nnYLen(nnYLen_),
-        tensorDesc4DByBatchSizeByKey() {}
+  std::map<CudnnTensorDesc4DKey, ByBatchSize<miopenTensorDescriptor_t>*> tensorDesc4DByBatchSizeByKey;
+
+  CudnnManager(string name_, int maxBatchSize_, int nnXLen_, int nnYLen_)
+    :name(name_),
+     maxBatchSize(maxBatchSize_),
+     nnXLen(nnXLen_),
+     nnYLen(nnYLen_),
+     tensorDesc4DByBatchSizeByKey()
+  {
+  }
 
   ~CudnnManager() {
-    for (auto& iter : tensorDesc4DByBatchSizeByKey) {
+    for(auto& iter: tensorDesc4DByBatchSizeByKey) {
       delete iter.second;
     }
   }
 
   ByBatchSizeView<miopenTensorDescriptor_t> getTensorDesc4DByBatchSize(
-      int channels, bool useFP16, bool useNHWC) {
+    int channels, bool useFP16, bool useNHWC
+  ) {
     auto iter = tensorDesc4DByBatchSizeByKey.find({channels, useFP16, useNHWC});
-    if (iter != tensorDesc4DByBatchSizeByKey.end()) {
+    if(iter != tensorDesc4DByBatchSizeByKey.end()) {
       return ByBatchSizeView<miopenTensorDescriptor_t>(*(iter->second));
     }
-
-    auto* descs = new ByBatchSize<miopenTensorDescriptor_t>(maxBatchSize);
-
-    for (int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
+    ByBatchSize<miopenTensorDescriptor_t>* descs = new ByBatchSize<miopenTensorDescriptor_t>(maxBatchSize);
+    for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
       miopenTensorDescriptor_t& desc = (*descs)[batchSize];
-      // Create descriptor
-      CUDNN_ERR(name.c_str(), miopenCreateTensorDescriptor(&desc));
-
-      const miopenDataType_t dtype = useFP16 ? miopenHalf : miopenFloat;
-
-      if (!useNHWC) {
-        // Fully‑supported NCHW fast‑path
-        CUDNN_ERR(name.c_str(),
-                  miopenSet4dTensorDescriptor(desc, dtype, batchSize, channels,
-                                              nnYLen, nnXLen));
-      } else {
-        // NHWC path via generic Nd descriptor + explicit strides
-        int dims[4] = {batchSize, nnYLen, nnXLen, channels};  // N H W C
-        int strides[4];
-        strides[3] = 1;                             // C stride
-        strides[2] = strides[3] * channels;         // W stride
-        strides[1] = strides[2] * nnXLen;           // H stride
-        strides[0] = strides[1] * nnYLen;           // N stride
-
-        CUDNN_ERR(name.c_str(),
-                  miopenSetTensorDescriptor(desc, dtype, 4, dims, strides));
-      }
+      CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&desc));
+      CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
+                  desc,
+                  (useFP16 ? miopenHalf : miopenFloat),
+                  batchSize,
+                  channels,
+                  nnYLen,
+                  nnXLen
+                ));
     }
-
     descs->destroyFunc = miopenDestroyTensorDescriptor;
     tensorDesc4DByBatchSizeByKey[{channels, useFP16, useNHWC}] = descs;
     return ByBatchSizeView<miopenTensorDescriptor_t>(*descs);
   }
 };
 
-
 //---------------------------------------------------------------------------------
 
 struct ScratchBuffers {
@@ -311,7 +260,7 @@ struct ConvLayer {
   ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
   miopenTensorDescriptor_t filterDescriptor;
   miopenConvolutionDescriptor_t convolutionDescriptor;
-  ByBatchSize<miopenConvFwdAlgorithm_t >* convolutionAlgorithms; //array of one for each batch size
+  ByBatchSize<miopenConvAlgoPerf_t>* convolutionAlgorithms; //array of one for each batch size
   void* filterBuf;
 
   ConvLayer() = delete;
@@ -356,33 +305,18 @@ struct ConvLayer {
     bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
 
     CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&filterDescriptor));
-    int lens[4];
-    if (filterNHWC) {          // cuDNN 的 OHWI
-        lens[0] = outChannels; // O
-        lens[1] = convYSize;   // H
-        lens[2] = convXSize;   // W
-        lens[3] = inChannels;  // I
-        CUDNN_ERR(name.c_str(),miopenSetNdTensorDescriptorWithLayout(
-            filterDescriptor,
-            useFP16 ? miopenHalf : miopenFloat,
-            miopenTensorNHWC,               // 指定布局
-            lens,
-            4));
-    } else {
-        CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
-          filterDescriptor,
-          (useFP16 ? miopenHalf : miopenFloat),
-          outChannels,
-          inChannels,
-          convYSize,
-          convXSize
-        )); // cuDNN 的 OIHW
-    }// cuDNN 的 OIHW
+    CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
+      filterDescriptor,
+      (useFP16 ? miopenHalf : miopenFloat),
+      outChannels,
+      inChannels,
+      convYSize,
+      convXSize
+    ));
 
     int yStride = 1;
     int xStride = 1;
 
-    bool tensorCoresSupported = true;
 
     CUDNN_ERR(name.c_str(),miopenCreateConvolutionDescriptor(&convolutionDescriptor));
     CUDNN_ERR(name.c_str(),miopenInitConvolutionDescriptor(
@@ -397,38 +331,64 @@ struct ConvLayer {
     ));
     if(useFP16) {
       int alt = 1; // non‑zero enables alt‑impl on MI2xx+ GPUs
-      CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,
-                                    MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,
-                                    alt));
+      CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
     }
 
-    convolutionAlgorithms = new ByBatchSize<miopenConvFwdAlgorithm_t >(maxBatchSize);
+    convolutionAlgorithms = new ByBatchSize<miopenConvAlgoPerf_t >(maxBatchSize);
 
-        for(int batchSize = 1; batchSize <= maxBatchSize; ++batchSize) {
+    for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
       if(useFP16 && dilationX <= 1 && dilationY <= 1) {
-        // 手动填充最简单的 Perf 结构体
-        miopenConvAlgoPerf_t perf = {};
-        perf.fwd_algo      = miopenConvolutionFwdAlgoImplicitGEMM;   // 固定算法
-        perf.memory        = 0;     // 需 0 workspace
-        perf.time          = 0.0f;  // 不做基准
-        (*convolutionAlgorithms)[batchSize] = perf;
+        (*convolutionAlgorithms)[batchSize].fwd_algo = miopenConvolutionFwdAlgoImplicitGEMM;
         continue;
       }
       else {
-        miopenConvAlgoPerf_t perfResults[4];
-        int returnedAlgoCount = 0;
-        CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
-            handle,
-            xDesc,  inputBuf,
-            wDesc,  filterBuf,
-            convDesc,
-            yDesc,  outputBuf,
-            /*requestAlgoCount=*/1,              // 只要最快
+        const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
+        const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
+        int requestedAlgoCount = 8;
+        int returnedAlgoCount = -1;
+        miopenConvFwdAlgorithm_t results[2 * requestedAlgoCount];
+        miopenConvSolution_t solutions[2 * requestedAlgoCount];
+        CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
+          cudaHandles->cudnn,
+          filterDescriptor,
+          inputDescriptor,
+          convolutionDescriptor,
+          outputDescriptor,
+          &requestedAlgoCount
+        ));
+        CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolution(
+            cudaHandles->cudnn,
+            filterDescriptor,
+            inputDescriptor,
+            convolutionDescriptor,
+            outputDescriptor,
+            requestedAlgoCount,
             &returnedAlgoCount,
-            perfResults,
-            workspaceBuf,
-            wsSize,
-            /*exhaustiveSearch=*/true));
+            solutions
+          ));
+        if(returnedAlgoCount <= 0)
+          throw StringError("miopenConvolutionForwardGetSolution returned no algorithms?");
+        for (size_t i = 0; i < returnedAlgoCount; i++) {
+          if(solutions[i].algorithm == miopenConvolutionAlgoGEMM) {
+            results[i] = miopenConvolutionFwdAlgoGEMM;
+          }
+          else if(solutions[i].algorithm == miopenConvolutionAlgoDirect) {
+            results[i] = miopenConvolutionFwdAlgoDirect;
+          }
+          else if(solutions[i].algorithm == miopenConvolutionAlgoFFT) {
+            results[i] = miopenConvolutionFwdAlgoFFT;
+          }
+          else if(solutions[i].algorithm == miopenConvolutionAlgoWinograd) {
+            results[i] = miopenConvolutionFwdAlgoWinograd;
+          }
+          else if(solutions[i].algorithm == miopenConvolutionAlgoImplicitGEMM) {
+            results[i] = miopenConvolutionFwdAlgoImplicitGEMM;
+          }
+          else{
+            throw StringError("Unknown miopenConvolutionFwdAlgo: " + std::to_string(solutions[i].algorithm));
+          }
+        }
+        (*convolutionAlgorithms)[batchSize].fwd_algo = results[0];
       }
     }
 
@@ -465,43 +425,42 @@ struct ConvLayer {
     int batchSize
   ) const {
     size_t workspaceBytes = 0;
-    CUDNN_ERR(name.c_str(), miopenConvolutionForwardGetWorkSpaceSize(
-                              cudaHandles->cudnn,
-                              filterDescriptor,
-                              inputDescriptors[batchSize],
-                              convolutionDescriptor,
-                              outputDescriptors[batchSize],
-                              &workspaceBytes));
+    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetWorkSpaceSize(
+      cudaHandles->cudnn,
+      filterDescriptor,
+      inputDescriptors[batchSize],
+      convolutionDescriptor,
+      outputDescriptors[batchSize],
+      &workspaceBytes
+    ));
     return workspaceBytes;
   }
 
   void apply(
     CudaHandles* cudaHandles,
-    int          batchSize,
-    bool         accumulate,        // if true, beta = 1 (unsupported by MIOpen fwd)
-    void*        inputBuf,
-    void*        outputBuf,
-    void*        workspaceBuf,
-    size_t       workspaceBytes) const
-{
-  const float alpha = 1.0f;
-  const float beta  = accumulate ? 1.0f : 0.0f;
-
-  // New MIOpen API order: ... algo, beta, yDesc, y, workSpace, workSpaceSize
-  CUDNN_ERR(name.c_str(), miopenConvolutionForward(
-                cudaHandles->cudnn,
-                &alpha,
-                inputDescriptors[batchSize],
-                inputBuf,
-                filterDescriptor,
-                filterBuf,
-                convolutionDescriptor,
-                (*convolutionAlgorithms)[batchSize],
-                &beta,
-                outputDescriptors[batchSize],
-                outputBuf,
-                workspaceBuf,
-                workspaceBytes));
+    int batchSize,
+    bool accumulate,
+    void* inputBuf,
+    void* outputBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    const float alpha = 1.0f;
+    const float beta = accumulate ? 1.0f : 0.0f;
+    CUDNN_ERR(name.c_str(), miopenConvolutionForward(
+      cudaHandles->cudnn,
+      &alpha,
+      inputDescriptors[batchSize],
+      inputBuf,
+      filterDescriptor,
+      filterBuf,
+      convolutionDescriptor,
+      (*convolutionAlgorithms)[batchSize].fwd_algo,
+      &beta,
+      outputDescriptors[batchSize],
+      outputBuf,
+      workspaceBuf,
+      workspaceBytes));
   }
 
 };
@@ -674,34 +633,21 @@ struct MatMulLayer {
       ));
     }
     else {
-      // const half* alpha = (const half*)scratch->oneBuf;
-      // const half* beta = (const half*)scratch->zeroBuf;
-      // CUBLAS_ERR(name.c_str(),hipblasHgemm(
-      //   cudaHandles->cublas,
-      //   HIPBLAS_OP_N,
-      //   HIPBLAS_OP_N,
-      //   outChannels,
-      //   batchSize,
-      //   inChannels,
-      //   alpha,
-      //   (const half*)matBuf,outChannels,
-      //   (const half*)inputBuf,inChannels,
-      //   beta,
-      //   (half*)outputBuf,outChannels
-      // ));
-      static const half alpha_h = half(1.0f);
-      static const half beta_h  = half(0.0f);
-      CUBLAS_ERR(name.c_str(), hipblasGemmEx(
+      const hipblasHalf* alpha = (const hipblasHalf*)scratch->oneBuf;
+      const hipblasHalf* beta = (const hipblasHalf*)scratch->zeroBuf;
+      CUBLAS_ERR(name.c_str(),hipblasHgemm(
         cudaHandles->cublas,
-        HIPBLAS_OP_N, HIPBLAS_OP_N,
-        outChannels, batchSize, inChannels,
-        &alpha_h,
-        (const half*)matBuf,   HIPBLAS_R_16F, outChannels,
-        (const half*)inputBuf, HIPBLAS_R_16F, inChannels,
-        &beta_h,
-        (half*)outputBuf,      HIPBLAS_R_16F, outChannels,
-        HIPBLAS_R_16F,               /* compute_type */
-        HIPBLAS_GEMM_DEFAULT));      /* algo */
+        CUBLAS_OP_N,
+        CUBLAS_OP_N,
+        outChannels,
+        batchSize,
+        inChannels,
+        alpha,
+        (const hipblasHalf*)matBuf,outChannels,
+        (const hipblasHalf*)inputBuf,inChannels,
+        beta,
+        (hipblasHalf*)outputBuf,outChannels
+      ));
     }
 
   }
@@ -2406,36 +2352,8 @@ ComputeHandle* NeuralNet::createComputeHandle(
 
   bool useFP16 = false;
   bool useNHWC = false;
-  //Old GPUs - use FP32 and explicitly fail if FP16 enabled
-  if(prop.major < 5 || (prop.major == 5 && prop.minor < 3)) {
-    if(context->useFP16Mode == enabled_t::True)
-      throw StringError("ROCm device versions below 6.0 do not support useFP16=true");
-    if(context->useNHWCMode == enabled_t::True)
-      useNHWC = true;
-  }
-  //In theory these GPUs support FP16, so allow if the user wants.
-  else if(prop.major < 6) {
-    if(context->useFP16Mode == enabled_t::True)
-      useFP16 = true;
-    if(context->useNHWCMode == enabled_t::True)
-      useNHWC = true;
-  }
-  //On Pascal architecture, default to using FP16 operations
-  //Actually, just use FP32 - there's a risk that on certain cards this might just be a lot worse.
-  //A user manually fine-tuning for performance can just enable it themselves if they know how.
-  else if(prop.major < 7) {
-    if(context->useFP16Mode == enabled_t::True)
-      useFP16 = true;
-    if(context->useNHWCMode == enabled_t::True)
-      useNHWC = true;
-  }
-  //On Volta and higher, use FP16 and NHWC together because we have tensor cores.
-  else {
-    if(context->useFP16Mode == enabled_t::True || context->useFP16Mode == enabled_t::Auto)
-      useFP16 = true;
-    if(context->useNHWCMode == enabled_t::True || (context->useNHWCMode == enabled_t::Auto && useFP16))
-      useNHWC = true;
-  }
+  if(context->useFP16Mode == enabled_t::True || context->useFP16Mode == enabled_t::Auto)
+    useFP16 = true;
 
   if(logger != NULL) {
     logger->write(
diff --git a/cpp/program/setup.cpp b/cpp/program/setup.cpp
index 9c423771b..01a742f60 100644
--- a/cpp/program/setup.cpp
+++ b/cpp/program/setup.cpp
@@ -144,7 +144,7 @@ vector<NNEvaluator*> Setup::initializeNNEvaluators(
         requireExactNNLen = cfg.getBool("requireMaxBoardSize");
     }
 
-    bool inputsUseNHWC = backendPrefix == "opencl" || backendPrefix == "trt" || backendPrefix == "metal" ? false : true;
+    bool inputsUseNHWC = backendPrefix == "opencl" || backendPrefix == "trt" || backendPrefix == "metal" || backendPrefix == "rocm" ? false : true;
     if(cfg.contains(backendPrefix+"InputsUseNHWC"+idxStr))
       inputsUseNHWC = cfg.getBool(backendPrefix+"InputsUseNHWC"+idxStr);
     else if(cfg.contains("inputsUseNHWC"+idxStr))

From abb61240eec638b6b955eb31cf514cedb1039a8b Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Fri, 1 Aug 2025 18:03:45 +0200
Subject: [PATCH 05/24] Fix bugs

---
 cpp/neuralnet/rocmbackend.cpp | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index a6bc8862c..08857d76d 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -1,9 +1,4 @@
 #ifdef USE_ROCM_BACKEND
-#include "hip/hip_runtime.h"
-#include <map>
-#include <string>
-#include <vector>
-#include <cassert>
 
 #include "../neuralnet/rocmerrorcheck.h"
 #include "../neuralnet/rocmincludes.h"
@@ -344,8 +339,8 @@ struct ConvLayer {
       else {
         const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
         const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
-        int requestedAlgoCount = 8;
-        int returnedAlgoCount = -1;
+        size_t requestedAlgoCount = 8;
+        size_t returnedAlgoCount = -1;
         miopenConvFwdAlgorithm_t results[2 * requestedAlgoCount];
         miopenConvSolution_t solutions[2 * requestedAlgoCount];
         CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
@@ -460,7 +455,8 @@ struct ConvLayer {
       outputDescriptors[batchSize],
       outputBuf,
       workspaceBuf,
-      workspaceBytes));
+      workspaceBytes
+    ));
   }
 
 };
@@ -637,8 +633,8 @@ struct MatMulLayer {
       const hipblasHalf* beta = (const hipblasHalf*)scratch->zeroBuf;
       CUBLAS_ERR(name.c_str(),hipblasHgemm(
         cudaHandles->cublas,
-        CUBLAS_OP_N,
-        CUBLAS_OP_N,
+        HIPBLAS_OP_N,
+        HIPBLAS_OP_N,
         outChannels,
         batchSize,
         inChannels,

From bfb292e7f85397e32b9b0eed64ff4a2e182a595d Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Fri, 1 Aug 2025 19:20:31 +0200
Subject: [PATCH 06/24] All bug fixed

---
 cpp/neuralnet/rocmbackend.cpp | 62 ++++++++++++++---------------------
 1 file changed, 24 insertions(+), 38 deletions(-)

diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index 08857d76d..260c2c72e 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -255,7 +255,7 @@ struct ConvLayer {
   ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
   miopenTensorDescriptor_t filterDescriptor;
   miopenConvolutionDescriptor_t convolutionDescriptor;
-  ByBatchSize<miopenConvAlgoPerf_t>* convolutionAlgorithms; //array of one for each batch size
+  ByBatchSize<miopenConvSolution_t>* convolutionAlgorithms; //array of one for each batch size
   void* filterBuf;
 
   ConvLayer() = delete;
@@ -329,19 +329,18 @@ struct ConvLayer {
       CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
     }
 
-    convolutionAlgorithms = new ByBatchSize<miopenConvAlgoPerf_t >(maxBatchSize);
+    convolutionAlgorithms = new ByBatchSize<miopenConvSolution_t>(maxBatchSize);
 
     for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
-      if(useFP16 && dilationX <= 1 && dilationY <= 1) {
-        (*convolutionAlgorithms)[batchSize].fwd_algo = miopenConvolutionFwdAlgoImplicitGEMM;
-        continue;
-      }
-      else {
+      // if(useFP16 && dilationX <= 1 && dilationY <= 1) {
+      //   (*convolutionAlgorithms)[batchSize].solution_id = 0;
+      //   continue;
+      // }
+      // else {
         const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
         const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
         size_t requestedAlgoCount = 8;
         size_t returnedAlgoCount = -1;
-        miopenConvFwdAlgorithm_t results[2 * requestedAlgoCount];
         miopenConvSolution_t solutions[2 * requestedAlgoCount];
         CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
           cudaHandles->cudnn,
@@ -363,28 +362,16 @@ struct ConvLayer {
           ));
         if(returnedAlgoCount <= 0)
           throw StringError("miopenConvolutionForwardGetSolution returned no algorithms?");
-        for (size_t i = 0; i < returnedAlgoCount; i++) {
-          if(solutions[i].algorithm == miopenConvolutionAlgoGEMM) {
-            results[i] = miopenConvolutionFwdAlgoGEMM;
-          }
-          else if(solutions[i].algorithm == miopenConvolutionAlgoDirect) {
-            results[i] = miopenConvolutionFwdAlgoDirect;
-          }
-          else if(solutions[i].algorithm == miopenConvolutionAlgoFFT) {
-            results[i] = miopenConvolutionFwdAlgoFFT;
-          }
-          else if(solutions[i].algorithm == miopenConvolutionAlgoWinograd) {
-            results[i] = miopenConvolutionFwdAlgoWinograd;
-          }
-          else if(solutions[i].algorithm == miopenConvolutionAlgoImplicitGEMM) {
-            results[i] = miopenConvolutionFwdAlgoImplicitGEMM;
-          }
-          else{
-            throw StringError("Unknown miopenConvolutionFwdAlgo: " + std::to_string(solutions[i].algorithm));
-          }
-        }
-        (*convolutionAlgorithms)[batchSize].fwd_algo = results[0];
-      }
+        (*convolutionAlgorithms)[batchSize] = solutions[0];
+        CUDNN_ERR(name.c_str(),miopenConvolutionForwardCompileSolution(
+          cudaHandles->cudnn,
+          filterDescriptor,
+          inputDescriptor,
+          convolutionDescriptor,
+          outputDescriptor,
+          (*convolutionAlgorithms)[batchSize].solution_id
+        ));
+      // }
     }
 
     assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
@@ -420,12 +407,13 @@ struct ConvLayer {
     int batchSize
   ) const {
     size_t workspaceBytes = 0;
-    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetWorkSpaceSize(
+    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionWorkspaceSize(
       cudaHandles->cudnn,
       filterDescriptor,
       inputDescriptors[batchSize],
       convolutionDescriptor,
       outputDescriptors[batchSize],
+      (*convolutionAlgorithms)[batchSize].solution_id,
       &workspaceBytes
     ));
     return workspaceBytes;
@@ -442,20 +430,18 @@ struct ConvLayer {
   ) const {
     const float alpha = 1.0f;
     const float beta = accumulate ? 1.0f : 0.0f;
-    CUDNN_ERR(name.c_str(), miopenConvolutionForward(
+    CUDNN_ERR(name.c_str(), miopenConvolutionForwardImmediate(
       cudaHandles->cudnn,
-      &alpha,
-      inputDescriptors[batchSize],
-      inputBuf,
       filterDescriptor,
       filterBuf,
+      inputDescriptors[batchSize],
+      inputBuf,
       convolutionDescriptor,
-      (*convolutionAlgorithms)[batchSize].fwd_algo,
-      &beta,
       outputDescriptors[batchSize],
       outputBuf,
       workspaceBuf,
-      workspaceBytes
+      workspaceBytes,
+      (*convolutionAlgorithms)[batchSize].solution_id
     ));
   }
 

From 4606424fa97a1dd73e8ebed13d4804bf773a5187 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Fri, 1 Aug 2025 19:23:26 +0200
Subject: [PATCH 07/24] Update

---
 cpp/neuralnet/rocmhelpers.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/neuralnet/rocmhelpers.h b/cpp/neuralnet/rocmhelpers.h
index 215b1e9fd..489142cfd 100644
--- a/cpp/neuralnet/rocmhelpers.h
+++ b/cpp/neuralnet/rocmhelpers.h
@@ -1,4 +1,3 @@
-#include "hip/hip_runtime.h"
 #ifndef NEURALNET_ROCMHELPERS_H_
 #define NEURALNET_ROCMHELPERS_H_
 

From 1e8ea78876cc57eec4ddfeffee8db8268865629f Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 2 Aug 2025 01:03:25 +0200
Subject: [PATCH 08/24] test new method

---
 cpp/neuralnet/rocmbackend_new.cpp | 3016 +++++++++++++++++++++++++++++
 1 file changed, 3016 insertions(+)
 create mode 100644 cpp/neuralnet/rocmbackend_new.cpp

diff --git a/cpp/neuralnet/rocmbackend_new.cpp b/cpp/neuralnet/rocmbackend_new.cpp
new file mode 100644
index 000000000..af1164f19
--- /dev/null
+++ b/cpp/neuralnet/rocmbackend_new.cpp
@@ -0,0 +1,3016 @@
+#ifdef USE_ROCM_BACKEND
+
+#include "../neuralnet/rocmerrorcheck.h"
+#include "../neuralnet/rocmincludes.h"
+
+#include "../neuralnet/rocmhelpers.h"
+#include "../neuralnet/rocmutils.h"
+#include "../neuralnet/modelversion.h"
+#include "../neuralnet/nninterface.h"
+#include "../neuralnet/nninputs.h"
+#include "../neuralnet/sgfmetadata.h"
+#include "../neuralnet/nneval.h"
+#include "../neuralnet/desc.h"
+
+#include "../core/simpleallocator.h"
+#include "../core/test.h"
+
+#include "../external/half-2.2.0/include/half.hpp"
+
+//------------------------
+#include "../core/using.h"
+//------------------------
+
+using half_t = half_float::half;
+
+//Define this to print out some of the intermediate values of the neural net
+//#define DEBUG_INTERMEDIATE_VALUES
+
+void NeuralNet::globalInitialize() {
+  //Empty for cudnn backend
+}
+
+void NeuralNet::globalCleanup() {
+  hipDeviceReset();
+}
+
+struct CudaHandles {
+  hipblasHandle_t cublas;
+  miopenHandle_t cudnn;
+  const int majorComputeCapability;
+  const int minorComputeCapability;
+
+  CudaHandles(int major, int minor)
+    : majorComputeCapability(major),
+      minorComputeCapability(minor)
+  {
+    CUBLAS_ERR("CudaHandles",hipblasCreate(&cublas));
+    CUDNN_ERR("CudaHandles",miopenCreate(&cudnn));
+  }
+
+  ~CudaHandles() {
+    hipblasDestroy(cublas);
+    miopenDestroy(cudnn);
+  }
+
+  static CudaHandles* cudaHandlesTesting() {
+    const int gpuIdxForThisThread = 0;
+    hipDeviceProp_t prop;
+    hipGetDeviceProperties(&prop,gpuIdxForThisThread);
+    return new CudaHandles(prop.major, prop.minor);
+  }
+
+  CudaHandles(const CudaHandles&) = delete;
+  CudaHandles& operator=(const CudaHandles&) = delete;
+};
+
+//---------------------------------------------------------------------------------
+
+template<typename T>
+struct ByBatchSize {
+  const int maxBatchSize;
+  T* data;
+  miopenStatus_t (*destroyFunc)(T);
+
+  ByBatchSize()
+    : maxBatchSize(0), data(nullptr), destroyFunc(nullptr)
+  {}
+
+  ByBatchSize(
+    int maxBatchSize_
+  ) : maxBatchSize(maxBatchSize_), data(nullptr), destroyFunc(nullptr) {
+    data = new T[maxBatchSize];
+  }
+
+  ByBatchSize(const ByBatchSize&) = delete;
+  ByBatchSize& operator=(const ByBatchSize&) = delete;
+
+  ~ByBatchSize() {
+    if(destroyFunc != nullptr && data != nullptr) {
+      for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
+        (*destroyFunc)(data[batchSize-1]);
+      }
+    }
+    if(data != nullptr) {
+      delete[] data;
+      data = nullptr;
+    }
+  }
+  T& operator[](int batchSize) {
+    return data[batchSize-1];
+  }
+  const T& operator[](int batchSize) const {
+    return data[batchSize-1];
+  }
+};
+
+template<typename T>
+struct ByBatchSizeView {
+  int maxBatchSize;
+  T* data;
+
+  ByBatchSizeView()
+    : maxBatchSize(0), data(nullptr)
+  {}
+
+  ByBatchSizeView(const ByBatchSize<T>& toView)
+    : maxBatchSize(toView.maxBatchSize), data(toView.data)
+  {}
+  ByBatchSizeView& operator=(const ByBatchSize<T>& toView) {
+    maxBatchSize = toView.maxBatchSize;
+    data = toView.data;
+  }
+
+  ~ByBatchSizeView() {
+  }
+  T& operator[](int batchSize) {
+    return data[batchSize-1];
+  }
+  const T& operator[](int batchSize) const {
+    return data[batchSize-1];
+  }
+};
+
+//---------------------------------------------------------------------------------
+
+
+//channels, useFP16, useNHWC
+typedef std::tuple<int, bool, bool> CudnnTensorDesc4DKey;
+
+struct CudnnManager {
+  const string name;
+  const int maxBatchSize;
+  const int nnXLen;
+  const int nnYLen;
+  std::map<CudnnTensorDesc4DKey, ByBatchSize<miopenTensorDescriptor_t>*> tensorDesc4DByBatchSizeByKey;
+
+  CudnnManager(string name_, int maxBatchSize_, int nnXLen_, int nnYLen_)
+    :name(name_),
+     maxBatchSize(maxBatchSize_),
+     nnXLen(nnXLen_),
+     nnYLen(nnYLen_),
+     tensorDesc4DByBatchSizeByKey()
+  {
+  }
+
+  ~CudnnManager() {
+    for(auto& iter: tensorDesc4DByBatchSizeByKey) {
+      delete iter.second;
+    }
+  }
+
+  ByBatchSizeView<miopenTensorDescriptor_t> getTensorDesc4DByBatchSize(
+    int channels, bool useFP16, bool useNHWC
+  ) {
+    auto iter = tensorDesc4DByBatchSizeByKey.find({channels, useFP16, useNHWC});
+    if(iter != tensorDesc4DByBatchSizeByKey.end()) {
+      return ByBatchSizeView<miopenTensorDescriptor_t>(*(iter->second));
+    }
+    ByBatchSize<miopenTensorDescriptor_t>* descs = new ByBatchSize<miopenTensorDescriptor_t>(maxBatchSize);
+    for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
+      miopenTensorDescriptor_t& desc = (*descs)[batchSize];
+      CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&desc));
+      CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
+                  desc,
+                  (useFP16 ? miopenHalf : miopenFloat),
+                  batchSize,
+                  channels,
+                  nnYLen,
+                  nnXLen
+                ));
+    }
+    descs->destroyFunc = miopenDestroyTensorDescriptor;
+    tensorDesc4DByBatchSizeByKey[{channels, useFP16, useNHWC}] = descs;
+    return ByBatchSizeView<miopenTensorDescriptor_t>(*descs);
+  }
+};
+
+//---------------------------------------------------------------------------------
+
+struct ScratchBuffers {
+
+  const size_t batchXYFloatBytes;
+  const size_t batchFloatBytes;
+  const size_t batchXYBytes;
+  const size_t batchBytes;
+
+  SimpleAllocator<void*>* allocator;
+
+  // Not scratch, but convenient to have here
+  void* zeroBuf;
+  void* oneBuf;
+
+  ScratchBuffers() = delete;
+  ScratchBuffers(const ScratchBuffers&) = delete;
+  ScratchBuffers& operator=(const ScratchBuffers&) = delete;
+
+  ScratchBuffers(int maxBatchSize, int nnXLen, int nnYLen, bool useFP16)
+    : batchXYFloatBytes((size_t)maxBatchSize * nnXLen * nnYLen * sizeof(float)),
+      batchFloatBytes((size_t)maxBatchSize * sizeof(float)),
+      batchXYBytes((size_t)maxBatchSize * nnXLen * nnYLen * (useFP16 ? sizeof(half_t) : sizeof(float))),
+      batchBytes((size_t)maxBatchSize * (useFP16 ? sizeof(half_t) : sizeof(float)))
+  {
+    std::function<void*(size_t)> allocateFunc = [](size_t size) {
+      void* buf;
+      CUDA_ERR("ScratchBuffers",hipMalloc(&buf, size));
+      return buf;
+    };
+    std::function<void(void*)> releaseFunc = [](void* buf) {
+      hipFree(buf);
+    };
+
+    allocator = new SimpleAllocator<void*>(allocateFunc, releaseFunc);
+
+    CudaUtils::hostMallocZeroOneBufs(zeroBuf, oneBuf, useFP16);
+  }
+  ~ScratchBuffers() {
+    delete allocator;
+    free(zeroBuf);
+    free(oneBuf);
+  }
+
+  size_t getBufSizeXY(int channels) const {
+    return channels * batchXYBytes;
+  }
+  size_t getBufSizeXYFloat(int channels) const {
+    return channels * batchXYFloatBytes;
+  }
+  size_t getBufSizeFloat(int channels) const {
+    return channels * batchFloatBytes;
+  }
+  size_t getBufSize(int channels) const {
+    return channels * batchBytes;
+  }
+
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct ConvLayer {
+  const string name;
+  const int inChannels;
+  const int outChannels;
+  ByBatchSizeView<miopenTensorDescriptor_t> inputDescriptors;
+  ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
+  miopenTensorDescriptor_t filterDescriptor;
+  miopenConvolutionDescriptor_t convolutionDescriptor;
+  ByBatchSize<miopenConvAlgoPerf_t>* convolutionAlgorithms; //array of one for each batch size
+  void* filterBuf;
+  void* inputTmp;
+  void* outputTmp;
+  void* workspaceTmp;
+
+  ConvLayer() = delete;
+  ConvLayer(const ConvLayer&) = delete;
+  ConvLayer& operator=(const ConvLayer&) = delete;
+
+  ConvLayer(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const ConvLayerDesc* desc,
+    bool useFP16,
+    bool useNHWC
+  ) : ConvLayer(cudaHandles, manager, desc, useFP16, useNHWC, useNHWC)
+  {}
+
+  ConvLayer(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const ConvLayerDesc* desc,
+    bool useFP16,
+    bool useNHWCIn,
+    bool useNHWCOut
+  ) :
+    name(desc->name),
+    inChannels(desc->inChannels),
+    outChannels(desc->outChannels)
+  {
+    int convYSize = desc->convYSize;
+    int convXSize = desc->convXSize;
+    int dilationY = desc->dilationY;
+    int dilationX = desc->dilationX;
+    int paddingX = (convXSize / 2) * dilationX;
+    int paddingY = (convYSize / 2) * dilationY;
+
+    assert(convXSize % 2 == 1);
+    assert(convYSize % 2 == 1);
+
+    inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
+    outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
+    int maxBatchSize = manager->maxBatchSize;
+    int xLen = manager->nnXLen;
+    int yLen = manager->nnYLen;
+
+    bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
+
+    CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&filterDescriptor));
+    CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
+      filterDescriptor,
+      (useFP16 ? miopenHalf : miopenFloat),
+      outChannels,
+      inChannels,
+      convYSize,
+      convXSize
+    ));
+
+    int yStride = 1;
+    int xStride = 1;
+
+
+    CUDNN_ERR(name.c_str(),miopenCreateConvolutionDescriptor(&convolutionDescriptor));
+    CUDNN_ERR(name.c_str(),miopenInitConvolutionDescriptor(
+      convolutionDescriptor,
+      miopenConvolution,
+      paddingY,
+      paddingX,
+      yStride,
+      xStride,
+      dilationY,
+      dilationX
+    ));
+    if(useFP16) {
+      int alt = 1; // non‑zero enables alt‑impl on MI2xx+ GPUs
+      CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
+    }
+
+    convolutionAlgorithms = new ByBatchSize<miopenConvAlgoPerf_t>(maxBatchSize);
+
+    size_t inBytes  = maxBatchSize * inChannels  * xLen * yLen;
+    size_t outBytes = maxBatchSize * outChannels * xLen * yLen;
+    size_t workspaceBytes = requiredWorkspaceBytes(cudaHandles, maxBatchSize); 
+    
+    CudaUtils::mallocOnDevice(name, inBytes, inputTmp, useFP16);
+    CudaUtils::mallocOnDevice(name, outBytes, outputTmp, useFP16);
+    CudaUtils::mallocOnDevice(name, workspaceBytes, workspaceTmp, useFP16);
+    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+
+    for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
+      // if(useFP16 && dilationX <= 1 && dilationY <= 1) {
+      //   (*convolutionAlgorithms)[batchSize].fwd_algo = miopenConvolutionFwdAlgoGEMM;
+      // }
+      // else {
+        const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
+        const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
+        const int requestedAlgoCount = 8;
+        int returnedAlgoCount = -1;
+        miopenConvAlgoPerf_t results[2 * requestedAlgoCount];
+        CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
+            cudaHandles->cudnn,
+            inputDescriptor,
+            inputTmp,
+            filterDescriptor,
+            filterBuf,
+            convolutionDescriptor,
+            outputDescriptor,
+            outputTmp,
+            requestedAlgoCount,
+            &returnedAlgoCount,
+            results,
+            workspaceTmp,
+            workspaceBytes,
+            true
+          ));
+        if(returnedAlgoCount <= 0)
+          throw StringError("miopenFindConvolutionForwardAlgorithm returned no algorithms?");
+        (*convolutionAlgorithms)[batchSize] = results[0];
+        printf("%d / %d\n", batchSize, maxBatchSize);
+      // }
+    }
+
+    assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
+
+    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+  }
+
+  ~ConvLayer() {
+    hipFree(filterBuf);
+    hipFree(inputTmp);
+    hipFree(outputTmp);
+    hipFree(workspaceTmp);
+    miopenDestroyTensorDescriptor(filterDescriptor);
+    miopenDestroyConvolutionDescriptor(convolutionDescriptor);
+    delete convolutionAlgorithms;
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t workspaceBytes = 0;
+    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetWorkSpaceSize(
+      cudaHandles->cudnn,
+      filterDescriptor,
+      inputDescriptors[batchSize],
+      convolutionDescriptor,
+      outputDescriptors[batchSize],
+      &workspaceBytes
+    ));
+    return workspaceBytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    int batchSize,
+    bool accumulate,
+    void* inputBuf,
+    void* outputBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    const float alpha = 1.0f;
+    const float beta = accumulate ? 1.0f : 0.0f;
+    CUDNN_ERR(name.c_str(), miopenConvolutionForward(
+      cudaHandles->cudnn,
+      &alpha,
+      inputDescriptors[batchSize],
+      inputBuf,
+      filterDescriptor,
+      filterBuf,
+      convolutionDescriptor,
+      (*convolutionAlgorithms)[batchSize].fwd_algo,
+      &beta,
+      outputDescriptors[batchSize],
+      outputBuf,
+      workspaceBuf,
+      workspaceBytes
+    ));
+  }
+
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct BatchNormLayer {
+  const string name;
+  const int numChannels;
+  const float epsilon;
+  const int activation;
+  const int nnXLen;
+  const int nnYLen;
+
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  void* mergedScaleBuf;
+  void* mergedBiasBuf;
+
+  BatchNormLayer() = delete;
+  BatchNormLayer(const BatchNormLayer&) = delete;
+  BatchNormLayer& operator=(const BatchNormLayer&) = delete;
+
+  BatchNormLayer(
+    CudaHandles* cudaHandles,
+    const BatchNormLayerDesc* desc,
+    const ActivationLayerDesc* actDesc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    numChannels(desc->numChannels),
+    epsilon(desc->epsilon),
+    activation(actDesc->activation),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC)
+  {
+    (void)cudaHandles;
+
+    assert(desc->mean.size() == numChannels);
+    assert(desc->variance.size() == numChannels);
+    assert(desc->scale.size() == numChannels);
+    assert(desc->bias.size() == numChannels);
+    assert(desc->mergedScale.size() == numChannels);
+    assert(desc->mergedBias.size() == numChannels);
+    CudaUtils::mallocAndCopyToDevice(name,desc->mergedScale,mergedScaleBuf,useFP16);
+    CudaUtils::mallocAndCopyToDevice(name,desc->mergedBias,mergedBiasBuf,useFP16);
+  }
+  ~BatchNormLayer() {
+    hipFree(mergedScaleBuf);
+    hipFree(mergedBiasBuf);
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    int batchSize,
+    void* inputBuf,
+    const void* maskBuf, //ok to be null
+    void* outputBuf
+  ) const {
+    (void)cudaHandles;
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaApplyCScaleBiasNCHW((const float*)inputBuf,(float*)outputBuf,(const float*)mergedScaleBuf,(const float*)mergedBiasBuf,
+                                      (const float*)maskBuf,
+                                      batchSize,numChannels,nnXLen*nnYLen,activation);
+      else
+        customCudaApplyCScaleBiasNHWC((const float*)inputBuf,(float*)outputBuf,(const float*)mergedScaleBuf,(const float*)mergedBiasBuf,
+                                      (const float*)maskBuf,
+                                      batchSize,nnXLen*nnYLen,numChannels,activation);
+    }
+    else {
+      if(!usingNHWC)
+        customCudaApplyCScaleBiasNCHW((const half*)inputBuf,(half*)outputBuf,(const half*)mergedScaleBuf,(const half*)mergedBiasBuf,
+                                      (const half*)maskBuf,
+                                      batchSize,numChannels,nnXLen*nnYLen,activation);
+      else
+        customCudaApplyCScaleBiasNHWC((const half*)inputBuf,(half*)outputBuf,(const half*)mergedScaleBuf,(const half*)mergedBiasBuf,
+                                      (const half*)maskBuf,
+                                      batchSize,nnXLen*nnYLen,numChannels,activation);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+
+  }
+
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct MatMulLayer {
+  const string name;
+  const int inChannels;
+  const int outChannels;
+  const bool usingFP16;
+  void* matBuf;
+
+  MatMulLayer() = delete;
+  MatMulLayer(const MatMulLayer&) = delete;
+  MatMulLayer& operator=(const MatMulLayer&) = delete;
+
+  MatMulLayer(
+    CudaHandles* cudaHandles,
+    const MatMulLayerDesc* desc,
+    bool useFP16
+  ) :
+    name(desc->name),
+    inChannels(desc->inChannels),
+    outChannels(desc->outChannels),
+    usingFP16(useFP16)
+  {
+    (void)cudaHandles;
+
+    if(inChannels > 0 && outChannels > 0) {
+      assert(desc->weights.size() == inChannels * outChannels);
+      CudaUtils::mallocAndCopyToDevice(name,desc->weights,matBuf,useFP16);
+    }
+    else {
+      matBuf = NULL;
+    }
+  }
+
+  ~MatMulLayer() {
+    if(inChannels > 0 && outChannels > 0)
+      hipFree(matBuf);
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles
+  ) const {
+    (void)cudaHandles;
+    size_t workspaceBytes = 0;
+    return workspaceBytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* inputBuf,
+    void* outputBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    (void)workspaceBuf;
+    (void)workspaceBytes;
+    assert(inChannels > 0 && outChannels > 0);
+
+    if(!usingFP16) {
+      const float alpha = 1.0f;
+      const float beta = 0.0f;
+      CUBLAS_ERR(name.c_str(),hipblasSgemm(
+        cudaHandles->cublas,
+        HIPBLAS_OP_N,
+        HIPBLAS_OP_N,
+        outChannels,
+        batchSize,
+        inChannels,
+        &alpha,
+        (const float*)matBuf,outChannels,
+        (const float*)inputBuf,inChannels,
+        &beta,
+        (float*)outputBuf,outChannels
+      ));
+    }
+    else {
+      const hipblasHalf* alpha = (const hipblasHalf*)scratch->oneBuf;
+      const hipblasHalf* beta = (const hipblasHalf*)scratch->zeroBuf;
+      CUBLAS_ERR(name.c_str(),hipblasHgemm(
+        cudaHandles->cublas,
+        HIPBLAS_OP_N,
+        HIPBLAS_OP_N,
+        outChannels,
+        batchSize,
+        inChannels,
+        alpha,
+        (const hipblasHalf*)matBuf,outChannels,
+        (const hipblasHalf*)inputBuf,inChannels,
+        beta,
+        (hipblasHalf*)outputBuf,outChannels
+      ));
+    }
+
+  }
+
+};
+
+//---------------------------------------------------------------------------------
+
+struct MatBiasLayer {
+  const string name;
+  const int numChannels;
+  const bool usingFP16;
+  const int activation;
+
+  void* biasBuf;
+
+  MatBiasLayer() = delete;
+  MatBiasLayer(const MatBiasLayer&) = delete;
+  MatBiasLayer& operator=(const MatBiasLayer&) = delete;
+
+  MatBiasLayer(
+    CudaHandles* cudaHandles,
+    const MatBiasLayerDesc* desc,
+    bool useFP16,
+    int activation_
+  ) :
+    name(desc->name),
+    numChannels(desc->numChannels),
+    usingFP16(useFP16),
+    activation(activation_)
+  {
+    (void)cudaHandles;
+    if(numChannels > 0) {
+      assert(desc->weights.size() == numChannels);
+      CudaUtils::mallocAndCopyToDevice(name,desc->weights,biasBuf,useFP16);
+    }
+    else
+      biasBuf = NULL;
+  }
+
+  ~MatBiasLayer() {
+    if(numChannels > 0)
+      hipFree(biasBuf);
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    int batchSize,
+    void* matBuf
+  ) const {
+    (void)cudaHandles;
+    assert(numChannels > 0);
+    if(!usingFP16) {
+      customCudaAddCBiasInplaceNC((float*)matBuf,(const float*)biasBuf,batchSize,numChannels,activation);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+    else {
+      customCudaAddCBiasInplaceNC((half*)matBuf,(const half*)biasBuf,batchSize,numChannels,activation);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+  }
+
+};
+
+//---------------------------------------------------------------------------------
+
+struct NormActConv {
+  const BatchNormLayer norm;
+  const ConvLayer conv;
+
+  const int inChannels;
+  const int outChannels;
+  const int nnXLen;
+  const int nnYLen;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  NormActConv() = delete;
+  NormActConv(const NormActConv&) = delete;
+  NormActConv& operator=(const NormActConv&) = delete;
+
+  NormActConv(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const BatchNormLayerDesc* normDesc,
+    const ActivationLayerDesc* actDesc,
+    const ConvLayerDesc* convDesc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ): norm(cudaHandles,normDesc,actDesc,nnX,nnY,useFP16,useNHWC),
+     conv(cudaHandles,manager,convDesc,useFP16,useNHWC),
+     inChannels(norm.numChannels),
+     outChannels(conv.outChannels),
+     nnXLen(nnX),
+     nnYLen(nnY),
+     usingFP16(useFP16),
+     usingNHWC(useNHWC)
+  {
+    assert(norm.numChannels == conv.inChannels);
+  }
+
+  ~NormActConv()
+  {}
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+    b = conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    int batchSize,
+    bool accumulate,
+    void* inBuf,
+    void* inScratchBuf,
+    void* outBuf,
+    void* maskBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    norm.apply(cudaHandles,batchSize,inBuf,maskBuf,inScratchBuf);
+#ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("AFTER NORM "), inScratchBuf, batchSize, inChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
+#endif
+    conv.apply(cudaHandles,batchSize,accumulate,inScratchBuf,outBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+
+//---------------------------------------------------------------------------------
+
+struct ResidualBlock {
+  const string name;
+  const NormActConv normActConv1;
+  const NormActConv normActConv2;
+
+  ResidualBlock() = delete;
+  ResidualBlock(const ResidualBlock&) = delete;
+  ResidualBlock& operator=(const ResidualBlock&) = delete;
+
+  ResidualBlock(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const ResidualBlockDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ): name(desc->name),
+     normActConv1(cudaHandles,manager,&desc->preBN,&desc->preActivation,&desc->regularConv,nnX,nnY,useFP16,useNHWC),
+     normActConv2(cudaHandles,manager,&desc->midBN,&desc->midActivation,&desc->finalConv,nnX,nnY,useFP16,useNHWC)
+  {
+  }
+
+  ~ResidualBlock()
+  {}
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+    b = normActConv1.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* trunkBuf,
+    void* trunkScratchBuf,
+    void* maskBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> midIn(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
+    SizedBuf<void*> midScratch(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
+    normActConv1.apply(cudaHandles,batchSize,false,trunkBuf,trunkScratchBuf,midIn.buf,maskBuf,workspaceBuf,workspaceBytes);
+    normActConv2.apply(cudaHandles,batchSize,true,midIn.buf,midScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+
+//----------------------------------------------------------------------------
+
+
+struct GlobalPoolingResidualBlock {
+  const string name;
+  const BatchNormLayer preBN;
+  const ConvLayer regularConv;
+  const ConvLayer gpoolConv;
+  const BatchNormLayer gpoolBN;
+  const MatMulLayer gpoolToBiasMul;
+  const NormActConv normActConv2;
+
+  const int nnXLen;
+  const int nnYLen;
+  const int regularChannels;
+  const int gpoolChannels;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  GlobalPoolingResidualBlock() = delete;
+  GlobalPoolingResidualBlock(const GlobalPoolingResidualBlock&) = delete;
+  GlobalPoolingResidualBlock& operator=(const GlobalPoolingResidualBlock&) = delete;
+
+  GlobalPoolingResidualBlock(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const GlobalPoolingResidualBlockDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ): name(desc->name),
+     preBN(cudaHandles,&desc->preBN,&desc->preActivation,nnX,nnY,useFP16,useNHWC),
+     regularConv(cudaHandles,manager,&desc->regularConv,useFP16,useNHWC),
+     gpoolConv(cudaHandles,manager,&desc->gpoolConv,useFP16,useNHWC),
+     gpoolBN(cudaHandles,&desc->gpoolBN,&desc->gpoolActivation,nnX,nnY,useFP16,useNHWC),
+     gpoolToBiasMul(cudaHandles,&desc->gpoolToBiasMul,useFP16),
+     normActConv2(cudaHandles,manager,&desc->midBN,&desc->midActivation,&desc->finalConv,nnX,nnY,useFP16,useNHWC),
+     nnXLen(nnX),
+     nnYLen(nnY),
+     regularChannels(desc->regularConv.outChannels),
+     gpoolChannels(desc->gpoolConv.outChannels),
+     usingFP16(useFP16),
+     usingNHWC(useNHWC)
+  {
+  }
+
+  ~GlobalPoolingResidualBlock() {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+    b = regularConv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = gpoolConv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = gpoolToBiasMul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = sizeof(float)*batchSize*gpoolChannels*nnXLen*nnYLen;
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* trunkBuf,
+    void* trunkScratchBuf,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> regularOut(scratch->allocator, scratch->getBufSizeXY(regularChannels));
+    SizedBuf<void*> regularScratch(scratch->allocator, scratch->getBufSizeXY(regularChannels));
+    SizedBuf<void*> gpoolOut(scratch->allocator, scratch->getBufSizeXY(gpoolChannels));
+    SizedBuf<void*> gpoolOut2(scratch->allocator, scratch->getBufSizeXY(gpoolChannels));
+    SizedBuf<void*> gpoolConcat(scratch->allocator, scratch->getBufSize(gpoolChannels*3));
+    SizedBuf<void*> gpoolBias(scratch->allocator, scratch->getBufSize(regularChannels));
+
+    preBN.apply(cudaHandles,batchSize,trunkBuf,maskBuf,trunkScratchBuf);
+    regularConv.apply(cudaHandles,batchSize,false,trunkScratchBuf,regularOut.buf,workspaceBuf,workspaceBytes);
+    gpoolConv.apply(cudaHandles,batchSize,false,trunkScratchBuf,gpoolOut.buf,workspaceBuf,workspaceBytes);
+    gpoolBN.apply(cudaHandles,batchSize,gpoolOut.buf,maskBuf,gpoolOut2.buf);
+
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaPoolRowsGPoolNCHW((const float*)gpoolOut2.buf,(float*)gpoolConcat.buf,batchSize,gpoolChannels,nnXLen*nnYLen,(const float*)maskBuf,maskSumBuf);
+      else
+        customCudaPoolRowsGPoolNHWC((const float*)gpoolOut2.buf,(float*)gpoolConcat.buf,batchSize,nnXLen*nnYLen,gpoolChannels,(const float*)maskBuf,maskSumBuf);
+    }
+    else {
+      if(!usingNHWC)
+        customCudaPoolRowsGPoolNCHW((const half*)gpoolOut2.buf,(half*)gpoolConcat.buf,batchSize,gpoolChannels,nnXLen*nnYLen,(const half*)maskBuf,maskSumBuf);
+      else
+        customCudaPoolRowsGPoolNHWC((const half*)gpoolOut2.buf,(half*)gpoolConcat.buf,batchSize,nnXLen*nnYLen,gpoolChannels,(const half*)maskBuf,maskSumBuf);
+    }
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    gpoolToBiasMul.apply(cudaHandles,scratch,batchSize,gpoolConcat.buf,gpoolBias.buf,workspaceBuf,workspaceBytes);
+
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaAddNCBiasInplaceNCHW((float*)regularOut.buf,(const float*)gpoolBias.buf,batchSize,regularChannels,nnXLen*nnYLen);
+      else
+        customCudaAddNCBiasInplaceNHWC((float*)regularOut.buf,(const float*)gpoolBias.buf,batchSize,nnXLen*nnYLen,regularChannels);
+    }
+    else {
+      if(!usingNHWC)
+        customCudaAddNCBiasInplaceNCHW((half*)regularOut.buf,(const half*)gpoolBias.buf,batchSize,regularChannels,nnXLen*nnYLen);
+      else
+        customCudaAddNCBiasInplaceNHWC((half*)regularOut.buf,(const half*)gpoolBias.buf,batchSize,nnXLen*nnYLen,regularChannels);
+    }
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    normActConv2.apply(cudaHandles,batchSize,true,regularOut.buf,regularScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+struct BlockStack {
+  const int numBlocks;
+  const int trunkNumChannels;
+  const int nnXLen;
+  const int nnYLen;
+  const bool usingFP16;
+  const bool usingNHWC;
+  vector<pair<int,unique_ptr_void>> blocks;
+
+  BlockStack() = delete;
+  BlockStack(const BlockStack&) = delete;
+  BlockStack& operator=(const BlockStack&) = delete;
+
+  BlockStack(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    int nBlocks,
+    int trunkChannels,
+    const std::vector<std::pair<int, unique_ptr_void>>& descBlocks,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  );
+  ~BlockStack();
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const;
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* trunkBuf,
+    void* trunkScratchBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const;
+
+};
+
+//------------------------------------------------------------------------------
+
+struct NestedBottleneckResidualBlock {
+  const string name;
+  const NormActConv normActConv1;
+  const BlockStack blocks;
+  const NormActConv normActConv2;
+
+  NestedBottleneckResidualBlock() = delete;
+  NestedBottleneckResidualBlock(const NestedBottleneckResidualBlock&) = delete;
+  NestedBottleneckResidualBlock& operator=(const NestedBottleneckResidualBlock&) = delete;
+
+  NestedBottleneckResidualBlock(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const NestedBottleneckResidualBlockDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ): name(desc->name),
+     normActConv1(cudaHandles,manager,&desc->preBN,&desc->preActivation,&desc->preConv,nnX,nnY,useFP16,useNHWC),
+     blocks(cudaHandles,manager,desc->numBlocks,desc->preConv.outChannels,desc->blocks,nnX,nnY,useFP16,useNHWC),
+     normActConv2(cudaHandles,manager,&desc->postBN,&desc->postActivation,&desc->postConv,nnX,nnY,useFP16,useNHWC)
+  {
+  }
+
+  ~NestedBottleneckResidualBlock()
+  {}
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+    b = normActConv1.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = blocks.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* trunkBuf,
+    void* trunkScratchBuf,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> mid(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
+    SizedBuf<void*> midScratch(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
+    assert(normActConv1.outChannels == normActConv2.inChannels);
+    normActConv1.apply(cudaHandles,batchSize,false,trunkBuf,trunkScratchBuf,mid.buf,maskBuf,workspaceBuf,workspaceBytes);
+    blocks.apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      maskBuf,
+      maskSumBuf,
+      mid.buf,
+      midScratch.buf,
+      workspaceBuf,
+      workspaceBytes
+    );
+    normActConv2.apply(cudaHandles,batchSize,true,mid.buf,midScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+BlockStack::BlockStack(
+  CudaHandles* cudaHandles,
+  CudnnManager* manager,
+  int nBlocks,
+  int trunkChannels,
+  const std::vector<std::pair<int, unique_ptr_void>>& descBlocks,
+  int nnX,
+  int nnY,
+  bool useFP16,
+  bool useNHWC
+) :
+  numBlocks(nBlocks),
+  trunkNumChannels(trunkChannels),
+  nnXLen(nnX),
+  nnYLen(nnY),
+  usingFP16(useFP16),
+  usingNHWC(useNHWC)
+{
+  assert(numBlocks == descBlocks.size());
+  for(int i = 0; i<numBlocks; i++) {
+    if(descBlocks[i].first == ORDINARY_BLOCK_KIND) {
+      ResidualBlockDesc* blockDesc = (ResidualBlockDesc*)descBlocks[i].second.get();
+      unique_ptr_void blockPtr = make_unique_void(
+        new ResidualBlock(
+          cudaHandles,
+          manager,
+          blockDesc,
+          nnXLen,
+          nnYLen,
+          useFP16,
+          useNHWC
+        )
+      );
+      blocks.push_back(make_pair(ORDINARY_BLOCK_KIND,std::move(blockPtr)));
+    }
+    else if(descBlocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
+      GlobalPoolingResidualBlockDesc* blockDesc = (GlobalPoolingResidualBlockDesc*)descBlocks[i].second.get();
+      unique_ptr_void blockPtr = make_unique_void(
+        new GlobalPoolingResidualBlock(
+          cudaHandles,
+          manager,
+          blockDesc,
+          nnXLen,
+          nnYLen,
+          useFP16,
+          useNHWC
+        )
+      );
+      blocks.push_back(make_pair(GLOBAL_POOLING_BLOCK_KIND,std::move(blockPtr)));
+    }
+    else if(descBlocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
+      NestedBottleneckResidualBlockDesc* blockDesc = (NestedBottleneckResidualBlockDesc*)descBlocks[i].second.get();
+      unique_ptr_void blockPtr = make_unique_void(
+        new NestedBottleneckResidualBlock(
+          cudaHandles,
+          manager,
+          blockDesc,
+          nnXLen,
+          nnYLen,
+          useFP16,
+          useNHWC
+        )
+      );
+      blocks.push_back(make_pair(NESTED_BOTTLENECK_BLOCK_KIND,std::move(blockPtr)));
+    }
+    else {
+      ASSERT_UNREACHABLE;
+    }
+  }
+}
+BlockStack::~BlockStack() {
+}
+
+size_t BlockStack::requiredWorkspaceBytes(
+  CudaHandles* cudaHandles,
+  int batchSize
+) const {
+  size_t bytes = 0;
+  size_t b;
+
+  for(int i = 0; i<blocks.size(); i++) {
+    if(blocks[i].first == ORDINARY_BLOCK_KIND) {
+      ResidualBlock* block = (ResidualBlock*)blocks[i].second.get();
+      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+    else if(blocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
+      GlobalPoolingResidualBlock* block = (GlobalPoolingResidualBlock*)blocks[i].second.get();
+      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+    else if(blocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
+      NestedBottleneckResidualBlock* block = (NestedBottleneckResidualBlock*)blocks[i].second.get();
+      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+    else {
+      ASSERT_UNREACHABLE;
+    }
+  }
+  return bytes;
+}
+
+void BlockStack::apply(
+  CudaHandles* cudaHandles,
+  ScratchBuffers* scratch,
+  int batchSize,
+  void* maskBuf,
+  float* maskSumBuf,
+  void* trunkBuf,
+  void* trunkScratchBuf,
+  void* workspaceBuf,
+  size_t workspaceBytes
+) const {
+
+  for(int i = 0; i<blocks.size(); i++) {
+#ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("Blockstack before block " + Global::intToString(i)), trunkBuf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
+#endif
+
+    if(blocks[i].first == ORDINARY_BLOCK_KIND) {
+      ResidualBlock* block = (ResidualBlock*)blocks[i].second.get();
+      block->apply(
+        cudaHandles,
+        scratch,
+        batchSize,
+        trunkBuf,
+        trunkScratchBuf,
+        maskBuf,
+        workspaceBuf,
+        workspaceBytes
+      );
+    }
+    else if(blocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
+      GlobalPoolingResidualBlock* block = (GlobalPoolingResidualBlock*)blocks[i].second.get();
+      block->apply(
+        cudaHandles,
+        scratch,
+        batchSize,
+        trunkBuf,
+        trunkScratchBuf,
+        maskBuf,
+        maskSumBuf,
+        workspaceBuf,
+        workspaceBytes
+      );
+    }
+    else if(blocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
+      NestedBottleneckResidualBlock* block = (NestedBottleneckResidualBlock*)blocks[i].second.get();
+      block->apply(
+        cudaHandles,
+        scratch,
+        batchSize,
+        trunkBuf,
+        trunkScratchBuf,
+        maskBuf,
+        maskSumBuf,
+        workspaceBuf,
+        workspaceBytes
+      );
+    }
+    else {
+      ASSERT_UNREACHABLE;
+    }
+  }
+}
+//------------------------------------------------------------------------------
+
+struct SGFMetadataEncoder {
+  const string name;
+
+  const bool usingFP16;
+
+  const MatMulLayer mul1;
+  const MatBiasLayer bias1;
+  const MatMulLayer mul2;
+  const MatBiasLayer bias2;
+  const MatMulLayer mul3;
+
+  SGFMetadataEncoder() = delete;
+  SGFMetadataEncoder(const SGFMetadataEncoder&) = delete;
+  SGFMetadataEncoder& operator=(const SGFMetadataEncoder&) = delete;
+
+  SGFMetadataEncoder(
+    CudaHandles* cudaHandles,
+    const SGFMetadataEncoderDesc* desc,
+    bool useFP16
+  ) :
+    name(desc->name),
+    usingFP16(useFP16),
+    mul1(cudaHandles,&desc->mul1,useFP16),
+    bias1(cudaHandles,&desc->bias1,useFP16,desc->act1.activation),
+    mul2(cudaHandles,&desc->mul2,useFP16),
+    bias2(cudaHandles,&desc->bias2,useFP16,desc->act2.activation),
+    mul3(cudaHandles,&desc->mul3,useFP16)
+  {
+  }
+
+  ~SGFMetadataEncoder()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    (void)batchSize;
+    size_t bytes = 0;
+    size_t b;
+
+    b = mul1.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = mul2.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = mul3.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* inputBuf,
+    void* outputBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> internalBuf1(scratch->allocator, scratch->getBufSizeFloat(std::max(mul1.outChannels,mul2.outChannels)));
+    SizedBuf<void*> internalBuf2(scratch->allocator, scratch->getBufSizeFloat(std::max(mul1.outChannels,mul2.outChannels)));
+
+    mul1.apply(cudaHandles,scratch,batchSize,inputBuf,internalBuf1.buf,workspaceBuf,workspaceBytes);
+    bias1.apply(cudaHandles,batchSize,internalBuf1.buf);
+    mul2.apply(cudaHandles,scratch,batchSize,internalBuf1.buf,internalBuf2.buf,workspaceBuf,workspaceBytes);
+    bias2.apply(cudaHandles,batchSize,internalBuf2.buf);
+    mul3.apply(cudaHandles,scratch,batchSize,internalBuf2.buf,outputBuf,workspaceBuf,workspaceBytes);
+  }
+
+};
+
+
+//----------------------------------------------------------------------------
+
+struct Trunk {
+  const string name;
+  const int modelVersion;
+  const int numBlocks;
+  const int trunkNumChannels;
+
+  const int nnXLen;
+  const int nnYLen;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  std::unique_ptr<ConvLayer> initialConv;
+  std::unique_ptr<MatMulLayer> initialMatMul;
+  std::unique_ptr<SGFMetadataEncoder> sgfMetadataEncoder;
+  const BlockStack blocks;
+  std::unique_ptr<BatchNormLayer> trunkTipBN;
+
+  Trunk() = delete;
+  Trunk(const Trunk&) = delete;
+  Trunk& operator=(const Trunk&) = delete;
+
+  Trunk(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const TrunkDesc* desc,
+    int nnX,
+    int nnY,
+    bool inputsUseNHWC,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    modelVersion(desc->modelVersion),
+    numBlocks(desc->numBlocks),
+    trunkNumChannels(desc->trunkNumChannels),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC),
+    blocks(cudaHandles,manager,desc->numBlocks,desc->trunkNumChannels,desc->blocks,nnX,nnY,useFP16,useNHWC)
+  {
+    int midNumChannels = desc->midNumChannels;
+    int regularNumChannels = desc->regularNumChannels;
+    int gpoolNumChannels = desc->gpoolNumChannels;
+
+    int maxBatchSize = manager->maxBatchSize;
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,trunkNumChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,midNumChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,regularNumChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,gpoolNumChannels);
+
+    initialConv = std::make_unique<ConvLayer>(cudaHandles,manager,&desc->initialConv,useFP16,inputsUseNHWC,useNHWC);
+    initialMatMul = std::make_unique<MatMulLayer>(cudaHandles,&desc->initialMatMul,useFP16);
+    if(desc->metaEncoderVersion > 0) {
+      sgfMetadataEncoder = std::make_unique<SGFMetadataEncoder>(cudaHandles,&desc->sgfMetadataEncoder,useFP16);
+      testAssert(sgfMetadataEncoder->mul3.outChannels == initialMatMul->outChannels);
+    }
+
+    trunkTipBN = std::make_unique<BatchNormLayer>(cudaHandles,&desc->trunkTipBN,&desc->trunkTipActivation,nnXLen,nnYLen,useFP16,useNHWC);
+    assert(desc->blocks.size() == numBlocks);
+  }
+
+  ~Trunk()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+
+    b = initialConv->requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+
+    b = initialMatMul->requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+
+    if(sgfMetadataEncoder != nullptr) {
+      b = sgfMetadataEncoder->requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+
+    b = blocks.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* inputBuf,
+    void* inputGlobalBuf,
+    void* inputMetaBuf,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* trunkBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+
+    SizedBuf<void*> trunkScratch(scratch->allocator, scratch->getBufSizeXY(trunkNumChannels));
+
+    //Feed the conv into trunkScratch.buf, not trunkBuf
+    initialConv->apply(cudaHandles,batchSize,false,inputBuf,trunkScratch.buf,workspaceBuf,workspaceBytes);
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("After initial conv"), trunkScratch.buf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    #endif
+
+    //Feed the matmul into trunkBuf
+    initialMatMul->apply(cudaHandles,scratch,batchSize,inputGlobalBuf,trunkBuf,workspaceBuf,workspaceBytes);
+    //Then accumulate it into trunkScratch.buf, broadcasting during the process
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaAddNCBiasInplaceNCHW((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
+      else
+        customCudaAddNCBiasInplaceNHWC((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
+    }
+    else {
+      if(!usingNHWC)
+        customCudaAddNCBiasInplaceNCHW((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
+      else
+        customCudaAddNCBiasInplaceNHWC((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
+    }
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    if(sgfMetadataEncoder != nullptr) {
+      testAssert(inputMetaBuf != NULL);
+      //Feed the result into trunkBuf
+      sgfMetadataEncoder->apply(cudaHandles,scratch,batchSize,inputMetaBuf,trunkBuf,workspaceBuf,workspaceBytes);
+      //Then accumulate it into trunkScratch.buf, broadcasting during the process
+      if(!usingFP16) {
+        if(!usingNHWC)
+          customCudaAddNCBiasInplaceNCHW((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
+        else
+          customCudaAddNCBiasInplaceNHWC((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
+      }
+      else {
+        if(!usingNHWC)
+          customCudaAddNCBiasInplaceNCHW((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
+        else
+          customCudaAddNCBiasInplaceNHWC((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
+      }
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+    else {
+      testAssert(inputMetaBuf == NULL);
+    }
+
+    //Flip trunkBuf and trunkScratch.buf so that the result gets accumulated in trunkScratch.buf
+    blocks.apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      maskBuf,
+      maskSumBuf,
+      trunkScratch.buf,
+      trunkBuf,
+      workspaceBuf,
+      workspaceBytes
+    );
+
+    //And now with the final BN port it from trunkScratch.buf to trunkBuf.
+    trunkTipBN->apply(cudaHandles,batchSize,trunkScratch.buf,maskBuf,trunkBuf);
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("Trunk tip"), trunkBuf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    #endif
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+static void fillMaskFloatBufAndMaskSumBuf(void* maskBuf, float*& maskFloatBuf, float*& maskSumBuf, bool usingFP16, int batchSize, int nnXLen, int nnYLen) {
+  if(!usingFP16) {
+    maskFloatBuf = (float*)maskBuf;
+    customCudaPoolRowsSumNCHW((const float*)maskFloatBuf,maskSumBuf,batchSize,1,nnXLen*nnYLen,1.0);
+    CUDA_ERR("sumMask",hipPeekAtLastError());
+  }
+  else {
+    customCudaCopyFromHalf((const half*)maskBuf,maskFloatBuf,batchSize*nnXLen*nnYLen);
+    CUDA_ERR("copyMaskFromHalf",hipPeekAtLastError());
+    customCudaPoolRowsSumNCHW((const float*)maskFloatBuf,maskSumBuf,batchSize,1,nnXLen*nnYLen,1.0);
+    CUDA_ERR("sumMask",hipPeekAtLastError());
+  }
+}
+
+
+//------------------------------------------------------------------------------
+
+struct PolicyHead {
+  const string name;
+  const int modelVersion;
+  const int nnXLen;
+  const int nnYLen;
+  const int p1Channels;
+  const int g1Channels;
+  const int p2Channels;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  const ConvLayer p1Conv;
+  const ConvLayer g1Conv;
+  const BatchNormLayer g1BN;
+  const MatMulLayer gpoolToBiasMul;
+  const BatchNormLayer p1BN;
+  const ConvLayer p2Conv;
+  const MatMulLayer gpoolToPassMul;
+  const MatBiasLayer gpoolToPassBias;
+  const MatMulLayer gpoolToPassMul2;
+
+  PolicyHead() = delete;
+  PolicyHead(const PolicyHead&) = delete;
+  PolicyHead& operator=(const PolicyHead&) = delete;
+
+  PolicyHead(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const PolicyHeadDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    modelVersion(desc->modelVersion),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    p1Channels(desc->p1Conv.outChannels),
+    g1Channels(desc->g1Conv.outChannels),
+    p2Channels(desc->p2Conv.outChannels),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC),
+    p1Conv(cudaHandles,manager,&desc->p1Conv,useFP16,useNHWC),
+    g1Conv(cudaHandles,manager,&desc->g1Conv,useFP16,useNHWC),
+    g1BN(cudaHandles,&desc->g1BN,&desc->g1Activation,nnX,nnY,useFP16,useNHWC),
+    gpoolToBiasMul(cudaHandles,&desc->gpoolToBiasMul,false),
+    p1BN(cudaHandles,&desc->p1BN,&desc->p1Activation,nnX,nnY,false,useNHWC),
+    p2Conv(cudaHandles,manager,&desc->p2Conv,false,useNHWC),
+    gpoolToPassMul(cudaHandles,&desc->gpoolToPassMul,false),
+    gpoolToPassBias(cudaHandles,&desc->gpoolToPassBias,false,desc->passActivation.activation),
+    gpoolToPassMul2(cudaHandles,&desc->gpoolToPassMul2,false)
+  {
+  }
+
+  ~PolicyHead()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+
+    b = p1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = g1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = gpoolToBiasMul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = p2Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = gpoolToPassMul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = gpoolToPassMul2.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = sizeof(float)*batchSize*g1Channels*nnXLen*nnYLen;
+    bytes = std::max(bytes,b);
+
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* maskBuf,
+    float* maskFloatBuf,
+    float* maskSumBuf,
+    void* trunkBuf,
+    float* policyPassBuf,
+    float* policyBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+
+    SizedBuf<void*> p1Out(scratch->allocator, scratch->getBufSizeXYFloat(p1Channels)); //Need to hold floats, not just halfs
+    SizedBuf<void*> p1Out2(scratch->allocator, scratch->getBufSizeXYFloat(p1Channels)); //Need to hold floats, not just halfs
+    SizedBuf<void*> g1Out(scratch->allocator, scratch->getBufSizeXY(g1Channels));
+    SizedBuf<void*> g1Out2(scratch->allocator, scratch->getBufSizeXY(g1Channels));
+    SizedBuf<void*> g1Concat(scratch->allocator, scratch->getBufSizeFloat(g1Channels*3));
+    SizedBuf<void*> g1Bias(scratch->allocator, scratch->getBufSizeFloat(p1Channels));
+    SizedBuf<void*> p1Pass(scratch->allocator, scratch->getBufSizeFloat(p1Channels));
+
+    p1Conv.apply(cudaHandles,batchSize,false,trunkBuf,p1Out.buf,workspaceBuf,workspaceBytes);
+    g1Conv.apply(cudaHandles,batchSize,false,trunkBuf,g1Out.buf,workspaceBuf,workspaceBytes);
+    g1BN.apply(cudaHandles,batchSize,g1Out.buf,maskBuf,g1Out2.buf);
+
+    if(!usingFP16) {
+      if(!usingNHWC)
+        customCudaPoolRowsGPoolNCHW((const float*)g1Out2.buf,(float*)g1Concat.buf,batchSize,g1Channels,nnXLen*nnYLen,maskFloatBuf,maskSumBuf);
+      else
+        customCudaPoolRowsGPoolNHWC((const float*)g1Out2.buf,(float*)g1Concat.buf,batchSize,nnXLen*nnYLen,g1Channels,maskFloatBuf,maskSumBuf);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+    else {
+      customCudaCopyFromHalf((const half*)g1Out2.buf,(float*)workspaceBuf,batchSize*g1Channels*nnXLen*nnYLen);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+      if(!usingNHWC)
+        customCudaPoolRowsGPoolNCHW((const float*)workspaceBuf,(float*)g1Concat.buf,batchSize,g1Channels,nnXLen*nnYLen,maskFloatBuf,maskSumBuf);
+      else
+        customCudaPoolRowsGPoolNHWC((const float*)workspaceBuf,(float*)g1Concat.buf,batchSize,nnXLen*nnYLen,g1Channels,maskFloatBuf,maskSumBuf);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+    }
+
+    gpoolToBiasMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,g1Bias.buf,workspaceBuf,workspaceBytes);
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("p1 pre-gpool-sum"), p1Out.buf, batchSize, p1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    CudaUtils::debugPrint4D(string("g1 pre-gpool"), g1Out.buf, batchSize, g1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    CudaUtils::debugPrint2D(string("g1 pooled"), g1Concat.buf, batchSize, g1Channels*3, false);
+    CudaUtils::debugPrint2D(string("g1 biases"), g1Bias.buf, batchSize, p1Channels, false);
+    #endif
+
+    float* p1OutBufA;
+    float* p1OutBufB;
+    if(!usingFP16) {
+      p1OutBufA = (float*)p1Out.buf;
+      p1OutBufB = (float*)p1Out2.buf;
+    }
+    else {
+      customCudaCopyFromHalf((const half*)p1Out.buf,(float*)p1Out2.buf,batchSize*p1Channels*nnXLen*nnYLen);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+      p1OutBufA = (float*)p1Out2.buf;
+      p1OutBufB = (float*)p1Out.buf;
+    }
+
+    if(!usingNHWC)
+      customCudaAddNCBiasInplaceNCHW(p1OutBufA,(float*)g1Bias.buf,batchSize,p1Channels,nnXLen*nnYLen);
+    else
+      customCudaAddNCBiasInplaceNHWC(p1OutBufA,(float*)g1Bias.buf,batchSize,nnXLen*nnYLen,p1Channels);
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    p1BN.apply(cudaHandles,batchSize,p1OutBufA,maskFloatBuf,p1OutBufB);
+    p2Conv.apply(cudaHandles,batchSize,false,p1OutBufB,(float*)policyBuf,workspaceBuf,workspaceBytes);
+
+    if(modelVersion >= 15) {
+      gpoolToPassMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,p1Pass.buf,workspaceBuf,workspaceBytes);
+      gpoolToPassBias.apply(cudaHandles,batchSize,p1Pass.buf);
+      gpoolToPassMul2.apply(cudaHandles,scratch,batchSize,p1Pass.buf,policyPassBuf,workspaceBuf,workspaceBytes);
+    }
+    else {
+      gpoolToPassMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,policyPassBuf,workspaceBuf,workspaceBytes);
+    }
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("p1 after-gpool-sum"), p1OutBufA, batchSize, p1Channels, nnXLen, nnYLen, usingNHWC, false);
+    CudaUtils::debugPrint2D(string("policypass"), policyPassBuf, batchSize, 1, false);
+    CudaUtils::debugPrint4D(string("policy"), policyBuf, batchSize, p2Channels, nnXLen, nnYLen, usingNHWC, false);
+    #endif
+
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+struct ValueHead {
+  const string name;
+  const int modelVersion;
+  const int nnXLen;
+  const int nnYLen;
+  const int v1Channels;
+  const int v2Channels;
+  const int valueChannels;
+  const int scoreValueChannels;
+  const int ownershipChannels;
+  const bool usingFP16;
+  const bool usingNHWC;
+
+  const ConvLayer v1Conv;
+  const BatchNormLayer v1BN;
+  const MatMulLayer v2Mul;
+  const MatBiasLayer v2Bias;
+  const MatMulLayer v3Mul;
+  const MatBiasLayer v3Bias;
+  const MatMulLayer sv3Mul;
+  const MatBiasLayer sv3Bias;
+  const ConvLayer vOwnershipConv;
+
+  ValueHead() = delete;
+  ValueHead(const ValueHead&) = delete;
+  ValueHead& operator=(const ValueHead&) = delete;
+
+  ValueHead(
+    CudaHandles* cudaHandles,
+    CudnnManager* manager,
+    const ValueHeadDesc* desc,
+    int nnX,
+    int nnY,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    modelVersion(desc->modelVersion),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    v1Channels(desc->v1Conv.outChannels),
+    v2Channels(desc->v2Mul.outChannels),
+    valueChannels(desc->v3Mul.outChannels),
+    scoreValueChannels(desc->sv3Mul.outChannels),
+    ownershipChannels(desc->vOwnershipConv.outChannels),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC),
+    v1Conv(cudaHandles,manager,&desc->v1Conv,useFP16,useNHWC),
+    v1BN(cudaHandles,&desc->v1BN,&desc->v1Activation,nnX,nnY,useFP16,useNHWC),
+    v2Mul(cudaHandles,&desc->v2Mul,false),
+    v2Bias(cudaHandles,&desc->v2Bias,false,desc->v2Activation.activation),
+    v3Mul(cudaHandles,&desc->v3Mul,false),
+    v3Bias(cudaHandles,&desc->v3Bias,false,ACTIVATION_IDENTITY),
+    sv3Mul(cudaHandles,&desc->sv3Mul,false),
+    sv3Bias(cudaHandles,&desc->sv3Bias,false,ACTIVATION_IDENTITY),
+    vOwnershipConv(cudaHandles,manager,&desc->vOwnershipConv,useFP16,useNHWC)
+  {
+  }
+
+  ~ValueHead()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+
+    b = v1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = v2Mul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = v3Mul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = sizeof(float)*batchSize*v1Channels*nnXLen*nnYLen;
+    bytes = std::max(bytes,b);
+
+    b = sv3Mul.requiredWorkspaceBytes(cudaHandles);
+    bytes = std::max(bytes,b);
+    b = vOwnershipConv.requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = sizeof(float)*batchSize*ownershipChannels*nnXLen*nnYLen;
+    bytes = std::max(bytes,b);
+
+    return bytes;
+  }
+
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    void* maskBuf,
+    float* maskSumBuf,
+    void* trunkBuf,
+    float* valueBuf,
+    float* scoreValueBuf,
+    void* ownershipBuf,
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> v1Out(scratch->allocator, scratch->getBufSizeXY(v1Channels));
+    SizedBuf<void*> v1Out2(scratch->allocator, scratch->getBufSizeXY(v1Channels));
+    SizedBuf<void*> v1Mean(scratch->allocator, scratch->getBufSizeFloat(v1Channels*3));
+    SizedBuf<void*> v2Out(scratch->allocator, scratch->getBufSizeFloat(v2Channels));
+    SizedBuf<void*> ownershipScratch(scratch->allocator, scratch->getBufSizeXYFloat(ownershipChannels));
+
+    v1Conv.apply(cudaHandles,batchSize,false,trunkBuf,v1Out.buf,workspaceBuf,workspaceBytes);
+    v1BN.apply(cudaHandles,batchSize,v1Out.buf,maskBuf,v1Out2.buf);
+
+    void* bufToBePooled = v1Out2.buf;
+    if(usingFP16) {
+      customCudaCopyFromHalf((const half*)v1Out2.buf,(float*)workspaceBuf,batchSize*v1Channels*nnXLen*nnYLen);
+      CUDA_ERR(name.c_str(),hipPeekAtLastError());
+      bufToBePooled = workspaceBuf;
+    }
+
+    if(!usingNHWC)
+      customCudaValueHeadPoolNCHW((float*)bufToBePooled,(float*)v1Mean.buf,batchSize,v1Channels,nnXLen*nnYLen,maskSumBuf);
+    else
+      customCudaValueHeadPoolNHWC((const float*)bufToBePooled,(float*)v1Mean.buf,batchSize,nnXLen*nnYLen,v1Channels,maskSumBuf);
+    CUDA_ERR(name.c_str(),hipPeekAtLastError());
+
+    v2Mul.apply(cudaHandles,scratch,batchSize,v1Mean.buf,v2Out.buf,workspaceBuf,workspaceBytes);
+    v2Bias.apply(cudaHandles,batchSize,v2Out.buf);
+    v3Mul.apply(cudaHandles,scratch,batchSize,v2Out.buf,valueBuf,workspaceBuf,workspaceBytes);
+    v3Bias.apply(cudaHandles,batchSize,valueBuf);
+
+    sv3Mul.apply(cudaHandles,scratch,batchSize,v2Out.buf,scoreValueBuf,workspaceBuf,workspaceBytes);
+    sv3Bias.apply(cudaHandles,batchSize,scoreValueBuf);
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("v1"), v1Out.buf, batchSize, v1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
+    CudaUtils::debugPrint2D(string("v1 pooled"), v1Mean.buf, batchSize, v1Channels, false);
+    CudaUtils::debugPrint2D(string("v2"), v2Out.buf, batchSize, v1Channels, false);
+    #endif
+
+    if(!usingFP16) {
+      vOwnershipConv.apply(cudaHandles,batchSize,false,v1Out2.buf,ownershipBuf,workspaceBuf,workspaceBytes);
+    }
+    else {
+      vOwnershipConv.apply(cudaHandles,batchSize,false,v1Out2.buf,ownershipScratch.buf,workspaceBuf,workspaceBytes);
+      customCudaCopyFromHalf((const half*)ownershipScratch.buf,(float*)ownershipBuf,batchSize*ownershipChannels*nnXLen*nnYLen);
+      CUDA_ERR("vOwnership copy",hipPeekAtLastError());
+    }
+
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+struct Model {
+  const string name;
+  const int modelVersion;
+  const int maxBatchSize;
+  const int nnXLen;
+  const int nnYLen;
+  const int numInputChannels;
+  const int numInputGlobalChannels;
+  const int numInputMetaChannels;
+  const int numPolicyChannels;
+  const int numValueChannels;
+  const int numScoreValueChannels;
+  const int numOwnershipChannels;
+  const bool usingFP16;
+  const bool usingNHWC;
+  const bool inputsUsingNHWC;
+
+  std::unique_ptr<Trunk> trunk;
+  std::unique_ptr<PolicyHead> policyHead;
+  std::unique_ptr<ValueHead> valueHead;
+  std::unique_ptr<CudnnManager> manager;
+
+  Model() = delete;
+  Model(const Model&) = delete;
+  Model& operator=(const Model&) = delete;
+
+  Model(
+    CudaHandles* cudaHandles,
+    const ModelDesc* desc,
+    int maxBatchSz,
+    int nnX,
+    int nnY,
+    bool inputsUseNHWC,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    name(desc->name),
+    modelVersion(desc->modelVersion),
+    maxBatchSize(maxBatchSz),
+    nnXLen(nnX),
+    nnYLen(nnY),
+    numInputChannels(desc->numInputChannels),
+    numInputGlobalChannels(desc->numInputGlobalChannels),
+    numInputMetaChannels(desc->numInputMetaChannels),
+    numPolicyChannels(desc->numPolicyChannels),
+    numValueChannels(desc->numValueChannels),
+    numScoreValueChannels(desc->numScoreValueChannels),
+    numOwnershipChannels(desc->numOwnershipChannels),
+    usingFP16(useFP16),
+    usingNHWC(useNHWC),
+    inputsUsingNHWC(inputsUseNHWC)
+  {
+    if(nnXLen > NNPos::MAX_BOARD_LEN)
+      throw StringError(Global::strprintf("nnXLen (%d) is greater than NNPos::MAX_BOARD_LEN (%d)",
+        nnXLen, NNPos::MAX_BOARD_LEN
+      ));
+    if(nnYLen > NNPos::MAX_BOARD_LEN)
+      throw StringError(Global::strprintf("nnYLen (%d) is greater than NNPos::MAX_BOARD_LEN (%d)",
+        nnYLen, NNPos::MAX_BOARD_LEN
+      ));
+
+    int numFeatures = NNModelVersion::getNumSpatialFeatures(modelVersion);
+    if(numInputChannels != numFeatures)
+      throw StringError(Global::strprintf("Neural net numInputChannels (%d) was not the expected number based on version (%d)",
+        numInputChannels, numFeatures
+      ));
+    int numGlobalFeatures = NNModelVersion::getNumGlobalFeatures(modelVersion);
+    if(numInputGlobalChannels != numGlobalFeatures)
+      throw StringError(Global::strprintf("Neural net numInputGlobalChannels (%d) was not the expected number based on version (%d)",
+        numInputGlobalChannels, numGlobalFeatures
+      ));
+    if(numInputMetaChannels > 0) {
+      if(numInputMetaChannels != SGFMetadata::METADATA_INPUT_NUM_CHANNELS)
+        throw StringError(Global::strprintf("Neural net numInputMetaChannels (%d) was not the expected number (%d)",
+          numInputMetaChannels, SGFMetadata::METADATA_INPUT_NUM_CHANNELS
+        ));
+    }
+
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputGlobalChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputMetaChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numPolicyChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numValueChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numScoreValueChannels);
+    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numOwnershipChannels);
+
+    manager = std::make_unique<CudnnManager>(name, maxBatchSize, nnXLen, nnYLen);
+    trunk = std::make_unique<Trunk>(cudaHandles,manager.get(),&desc->trunk,nnXLen,nnYLen,inputsUseNHWC,useFP16,useNHWC);
+    policyHead = std::make_unique<PolicyHead>(cudaHandles,manager.get(),&desc->policyHead,nnXLen,nnYLen,useFP16,useNHWC);
+    valueHead = std::make_unique<ValueHead>(cudaHandles,manager.get(),&desc->valueHead,nnXLen,nnYLen,useFP16,useNHWC);
+  }
+
+  ~Model()
+  {
+  }
+
+  size_t requiredWorkspaceBytes(
+    CudaHandles* cudaHandles,
+    int batchSize
+  ) const {
+    size_t bytes = 0;
+    size_t b;
+
+    b = trunk->requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = policyHead->requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+    b = valueHead->requiredWorkspaceBytes(cudaHandles,batchSize);
+    bytes = std::max(bytes,b);
+
+    return bytes;
+  }
+
+  void apply(
+    CudaHandles* cudaHandles,
+    ScratchBuffers* scratch,
+    int batchSize,
+    bool requireExactNNLen,
+
+    void* inputBuf,
+    void* inputGlobalBuf,
+    void* inputMetaBuf,
+
+    float* policyPassBuf,
+    float* policyBuf,
+
+    float* valueBuf,
+    float* scoreValueBuf,
+    void* ownershipBuf,
+
+    void* workspaceBuf,
+    size_t workspaceBytes
+  ) const {
+    SizedBuf<void*> mask(scratch->allocator, scratch->getBufSizeXY(1));
+    SizedBuf<void*> maskFloat(scratch->allocator, scratch->getBufSizeXYFloat(1));
+    SizedBuf<void*> maskSum(scratch->allocator, scratch->getBufSizeFloat(1));
+
+    void* maskBuf = mask.buf;
+    float* maskFloatBuf = (float*)maskFloat.buf;
+    float* maskSumBuf = (float*)maskSum.buf;
+
+    if(!usingFP16) {
+      if(inputsUsingNHWC)
+        customCudaChannel0ExtractNHWC((const float*)inputBuf, (float*)maskBuf, batchSize, nnXLen*nnYLen, numInputChannels);
+      else
+        customCudaChannel0ExtractNCHW((const float*)inputBuf, (float*)maskBuf, batchSize, numInputChannels, nnXLen*nnYLen);
+      CUDA_ERR("modelExtractMask",hipPeekAtLastError());
+    }
+    else {
+      if(inputsUsingNHWC)
+        customCudaChannel0ExtractNHWC((const half*)inputBuf, (half*)maskBuf, batchSize, nnXLen*nnYLen, numInputChannels);
+      else
+        customCudaChannel0ExtractNCHW((const half*)inputBuf, (half*)maskBuf, batchSize, numInputChannels, nnXLen*nnYLen);
+      CUDA_ERR("modelExtractMask",hipPeekAtLastError());
+    }
+
+    fillMaskFloatBufAndMaskSumBuf(maskBuf,maskFloatBuf,maskSumBuf,usingFP16,batchSize,nnXLen,nnYLen);
+
+    //Don't do any masking if we know the board is exactly the desired size
+    if(requireExactNNLen) {
+      //Set to NULL to signal downstream that this buf doesn't need to be used
+      maskBuf = NULL;
+      maskFloatBuf = NULL;
+      //The global pooling structures need this no matter what, for normalizing based on this and its sqrt.
+      //maskSumBuf = NULL;
+    }
+
+    #ifdef DEBUG_INTERMEDIATE_VALUES
+    CudaUtils::debugPrint4D(string("Initial bin features"), inputBuf, batchSize, trunk->initialConv->inChannels, nnXLen, nnYLen, inputsUsingNHWC, usingFP16);
+    CudaUtils::debugPrint2D(string("Initial global features"), inputGlobalBuf, batchSize, trunk->initialMatMul->inChannels, usingFP16);
+    if(trunk->sgfMetadataEncoder != nullptr) {
+      assert(inputMetaBuf != NULL);
+      CudaUtils::debugPrint2D(string("Initial meta features"), inputMetaBuf, batchSize, trunk->sgfMetadataEncoder->mul1.inChannels, usingFP16);
+    }
+    #endif
+
+    SizedBuf<void*> trunkBuf(scratch->allocator, scratch->getBufSizeXY(trunk->trunkNumChannels));
+
+    trunk->apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      inputBuf,
+      inputGlobalBuf,
+      inputMetaBuf,
+      maskBuf,
+      maskSumBuf,
+      trunkBuf.buf,
+      workspaceBuf,
+      workspaceBytes
+    );
+    policyHead->apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      maskBuf,
+      maskFloatBuf,
+      maskSumBuf,
+      trunkBuf.buf,
+      policyPassBuf,
+      policyBuf,
+      workspaceBuf,
+      workspaceBytes
+    );
+    valueHead->apply(
+      cudaHandles,
+      scratch,
+      batchSize,
+      maskBuf,
+      maskSumBuf,
+      trunkBuf.buf,
+      valueBuf,
+      scoreValueBuf,
+      ownershipBuf,
+      workspaceBuf,
+      workspaceBytes
+    );
+  }
+
+};
+
+
+//------------------------------------------------------------------------------
+
+struct LoadedModel {
+  ModelDesc modelDesc;
+
+  LoadedModel(const string& fileName, const string& expectedSha256) {
+    ModelDesc::loadFromFileMaybeGZipped(fileName,modelDesc,expectedSha256);
+    modelDesc.applyScale8ToReduceActivations();
+  }
+
+  LoadedModel() = delete;
+  LoadedModel(const LoadedModel&) = delete;
+  LoadedModel& operator=(const LoadedModel&) = delete;
+};
+
+LoadedModel* NeuralNet::loadModelFile(const string& file, const string& expectedSha256) {
+  LoadedModel* loadedModel = new LoadedModel(file,expectedSha256);
+  return loadedModel;
+}
+
+void NeuralNet::freeLoadedModel(LoadedModel* loadedModel) {
+  delete loadedModel;
+}
+
+const ModelDesc& NeuralNet::getModelDesc(const LoadedModel* loadedModel) {
+  return loadedModel->modelDesc;
+}
+
+//------------------------------------------------------------------------------
+
+struct Buffers {
+  //All of these are device pointers
+
+  float* inputBufFloat;
+  void* inputBuf;
+  float* inputGlobalBufFloat;
+  void* inputGlobalBuf;
+  float* inputMetaBufFloat;
+  void* inputMetaBuf;
+  size_t inputBufBytesFloat;
+  size_t inputBufBytes;
+  size_t inputGlobalBufBytesFloat;
+  size_t inputGlobalBufBytes;
+  size_t inputMetaBufBytesFloat;
+  size_t inputMetaBufBytes;
+
+  float* policyPassBuf;
+  size_t policyPassBufBytes;
+  float* policyBuf;
+  size_t policyBufBytes;
+
+  float* valueBuf;
+  size_t valueBufBytes;
+  float* scoreValueBuf;
+  size_t scoreValueBufBytes;
+  void* ownershipBuf;
+  size_t ownershipBufBytes;
+
+  void* workspaceBuf;
+  size_t workspaceBytes;
+
+  Buffers() = delete;
+  Buffers(const Buffers&) = delete;
+  Buffers& operator=(const Buffers&) = delete;
+
+  Buffers(CudaHandles* cudaHandles, const Model& m, const ScratchBuffers& scratch) {
+    size_t batchXYFloatBytes = (size_t)scratch.batchXYFloatBytes;
+    size_t batchFloatBytes = (size_t)scratch.batchFloatBytes;
+    size_t batchXYBytes = (size_t)scratch.batchXYBytes;
+    size_t batchBytes = (size_t)scratch.batchBytes;
+
+    inputBufBytesFloat = m.numInputChannels * batchXYFloatBytes;
+    inputBufBytes = m.numInputChannels * batchXYBytes;
+    inputGlobalBufBytesFloat = m.numInputGlobalChannels * batchFloatBytes;
+    inputGlobalBufBytes = m.numInputGlobalChannels * batchBytes;
+    inputMetaBufBytesFloat = m.numInputMetaChannels * batchFloatBytes;
+    inputMetaBufBytes = m.numInputMetaChannels * batchBytes;
+
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputBufFloat), inputBufBytesFloat));
+    CUDA_ERR("Buffers",hipMalloc(&inputBuf, inputBufBytes));
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputGlobalBufFloat), inputGlobalBufBytesFloat));
+    CUDA_ERR("Buffers",hipMalloc(&inputGlobalBuf, inputGlobalBufBytes));
+    if(m.numInputMetaChannels > 0) {
+      CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputMetaBufFloat), inputMetaBufBytesFloat));
+      CUDA_ERR("Buffers",hipMalloc(&inputMetaBuf, inputMetaBufBytes));
+    }
+    else {
+      inputMetaBufFloat = NULL;
+      inputMetaBuf = NULL;
+    }
+
+    if(m.modelVersion >= 16)
+      testAssert(m.policyHead->p2Channels == 4);
+    else if(m.modelVersion >= 12)
+      testAssert(m.policyHead->p2Channels == 2);
+    else
+      testAssert(m.policyHead->p2Channels == 1);
+
+    policyPassBufBytes = m.policyHead->p2Channels * batchFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&policyPassBuf), policyPassBufBytes));
+    policyBufBytes = m.policyHead->p2Channels * batchXYFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&policyBuf), policyBufBytes));
+
+    valueBufBytes = m.valueHead->valueChannels * batchFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&valueBuf), valueBufBytes));
+
+    scoreValueBufBytes = m.valueHead->scoreValueChannels * batchFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&scoreValueBuf), scoreValueBufBytes));
+
+    //This buf is used for both an intermdiate fp16 result in fp16 mode, and ALSO the final fp32 output, so always must be fp32-sized
+    ownershipBufBytes = m.valueHead->ownershipChannels * batchXYFloatBytes;
+    CUDA_ERR("Buffers",hipMalloc(&ownershipBuf, ownershipBufBytes));
+
+    //In theory the requiredWorkspaceBytes calls could give us values non-monotone in batch size
+    //such as if the convolution algorithm changes between batch size 1 and larger.
+    //So we call it for all the batch sizes.
+    size_t bytes = 0;
+    size_t b;
+    for(int batchSize = 1; batchSize <= m.maxBatchSize; batchSize++) {
+      b = m.requiredWorkspaceBytes(cudaHandles,batchSize);
+      bytes = std::max(bytes,b);
+    }
+
+    CUDA_ERR("Buffers",hipMalloc(&workspaceBuf, bytes));
+    workspaceBytes = bytes;
+  }
+
+  ~Buffers() {
+    hipFree(inputBufFloat);
+    hipFree(inputBuf);
+    hipFree(inputGlobalBufFloat);
+    hipFree(inputGlobalBuf);
+    if(inputMetaBufFloat != NULL)
+      hipFree(inputMetaBufFloat);
+    if(inputMetaBuf != NULL)
+      hipFree(inputMetaBuf);
+
+    hipFree(policyPassBuf);
+    hipFree(policyBuf);
+
+    hipFree(valueBuf);
+    hipFree(scoreValueBuf);
+    hipFree(ownershipBuf);
+
+    hipFree(workspaceBuf);
+  }
+
+};
+
+//------------------------------------------------------------------------------
+
+struct ComputeContext {
+  int nnXLen;
+  int nnYLen;
+  enabled_t useFP16Mode;
+  enabled_t useNHWCMode;
+};
+
+ComputeContext* NeuralNet::createComputeContext(
+  const std::vector<int>& gpuIdxs,
+  Logger* logger,
+  int nnXLen,
+  int nnYLen,
+  const string& openCLTunerFile,
+  const string& homeDataDirOverride,
+  bool openCLReTunePerBoardSize,
+  enabled_t useFP16Mode,
+  enabled_t useNHWCMode,
+  const LoadedModel* loadedModel
+) {
+  (void)gpuIdxs;
+  (void)logger;
+  (void)openCLTunerFile;
+  (void)homeDataDirOverride;
+  (void)openCLReTunePerBoardSize;
+  (void)loadedModel;
+
+  ComputeContext* context = new ComputeContext();
+  context->nnXLen = nnXLen;
+  context->nnYLen = nnYLen;
+  context->useFP16Mode = useFP16Mode;
+  context->useNHWCMode = useNHWCMode;
+  return context;
+}
+
+void NeuralNet::freeComputeContext(ComputeContext* computeContext) {
+  delete computeContext;
+}
+
+//------------------------------------------------------------------------------
+
+struct ComputeHandle {
+  std::unique_ptr<CudaHandles> cudaHandles;
+  std::unique_ptr<Model> model;
+  std::unique_ptr<ScratchBuffers> scratch;
+  std::unique_ptr<Buffers> buffers;
+  const bool usingFP16;
+  const int nnXLen;
+  const int nnYLen;
+  const bool requireExactNNLen;
+  const bool inputsUseNHWC;
+  const bool usingNHWC;
+
+  ComputeHandle(
+    const ComputeContext* context,
+    const LoadedModel* loadedModel,
+    int majorComputeCapability,
+    int minorComputeCapability,
+    int maxBatchSize,
+    bool requireExactNNLen_,
+    bool inputsUseNHWC_,
+    bool useFP16,
+    bool useNHWC
+  ) :
+    usingFP16(useFP16),
+    nnXLen(context->nnXLen),
+    nnYLen(context->nnYLen),
+    requireExactNNLen(requireExactNNLen_),
+    inputsUseNHWC(inputsUseNHWC_),
+    usingNHWC(useNHWC)
+  {
+    cudaHandles = std::make_unique<CudaHandles>(majorComputeCapability,minorComputeCapability);
+    model = std::make_unique<Model>(
+      cudaHandles.get(), &(loadedModel->modelDesc), maxBatchSize,
+      nnXLen, nnYLen, inputsUseNHWC, useFP16, useNHWC
+    );
+    scratch = std::make_unique<ScratchBuffers>(maxBatchSize, nnXLen, nnYLen, useFP16);
+    buffers = std::make_unique<Buffers>(cudaHandles.get(), *model, *scratch);
+
+    //Synchronize after creating buffers and copying all the weights, just in case
+    CUDA_ERR("ComputeHandle", hipDeviceSynchronize());
+  }
+  ~ComputeHandle() {
+  }
+
+  ComputeHandle() = delete;
+  ComputeHandle(const ComputeHandle&) = delete;
+  ComputeHandle& operator=(const ComputeHandle&) = delete;
+};
+
+ComputeHandle* NeuralNet::createComputeHandle(
+  ComputeContext* context,
+  const LoadedModel* loadedModel,
+  Logger* logger,
+  int maxBatchSize,
+  bool requireExactNNLen,
+  bool inputsUseNHWC,
+  int gpuIdxForThisThread,
+  int serverThreadIdx
+) {
+  //Use whatever CUDA believes GPU 0 to be.
+  if(gpuIdxForThisThread == -1)
+    gpuIdxForThisThread = 0;
+
+  CUDA_ERR("createComputeHandle",hipSetDevice(gpuIdxForThisThread));
+
+  hipDeviceProp_t prop;
+  hipGetDeviceProperties(&prop,gpuIdxForThisThread);
+
+  bool useFP16 = false;
+  bool useNHWC = false;
+  if(context->useFP16Mode == enabled_t::True || context->useFP16Mode == enabled_t::Auto)
+    useFP16 = true;
+
+  if(logger != NULL) {
+    logger->write(
+      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Found GPU " + string(prop.name)
+      + " memory " + Global::uint64ToString(prop.totalGlobalMem)
+      + " compute capability major " + Global::intToString(prop.major)
+      + " minor " + Global::intToString(prop.minor)
+    );
+    logger->write(
+      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Model version " + Global::intToString(loadedModel->modelDesc.modelVersion) +
+      " useFP16 = " + Global::boolToString(useFP16) +
+      " useNHWC = " + Global::boolToString(useNHWC)
+    );
+    logger->write(
+      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Model name: " + loadedModel->modelDesc.name
+    );
+    logger->write(
+      "MIOpen finding convolution algorithms for GPU " + string(prop.name) + ". This may take a while, please wait............"
+    );
+  }
+
+  ComputeHandle* gpuHandle = new ComputeHandle(
+    context,loadedModel,prop.major,prop.minor,maxBatchSize,requireExactNNLen,inputsUseNHWC,useFP16,useNHWC
+  );
+  return gpuHandle;
+}
+
+void NeuralNet::freeComputeHandle(ComputeHandle* gpuHandle) {
+  delete gpuHandle;
+}
+
+bool NeuralNet::isUsingFP16(const ComputeHandle* handle) {
+  return handle->usingFP16;
+}
+
+//------------------------------------------------------------------------------
+
+void NeuralNet::printDevices() {
+  int numDevices = 0;
+  hipGetDeviceCount(&numDevices);
+  for(int i = 0; i<numDevices; i++) {
+    hipDeviceProp_t prop;
+    hipGetDeviceProperties(&prop, i);
+    cout << "Found ROCm device " << i << ": " << prop.name << endl;
+  }
+}
+
+
+//------------------------------------------------------------------------------
+
+struct InputBuffers {
+  int maxBatchSize;
+
+  size_t singleInputElts;
+  size_t singleInputBytes;
+  size_t singleInputGlobalElts;
+  size_t singleInputGlobalBytes;
+  size_t singleInputMetaElts;
+  size_t singleInputMetaBytes;
+  size_t singlePolicyPassResultElts;
+  size_t singlePolicyPassResultBytes;
+  size_t singlePolicyResultElts;
+  size_t singlePolicyResultBytes;
+  size_t singleValueResultElts;
+  size_t singleValueResultBytes;
+  size_t singleScoreValueResultElts;
+  size_t singleScoreValueResultBytes;
+  size_t singleOwnershipResultElts;
+  size_t singleOwnershipResultBytes;
+
+  size_t userInputBufferBytes;
+  size_t userInputGlobalBufferBytes;
+  size_t userInputMetaBufferBytes;
+  size_t policyPassResultBufferBytes;
+  size_t policyResultBufferBytes;
+  size_t valueResultBufferBytes;
+  size_t scoreValueResultBufferBytes;
+  size_t ownershipResultBufferBytes;
+
+  float* userInputBuffer; //Host pointer
+  float* userInputGlobalBuffer; //Host pointer
+  float* userInputMetaBuffer; //Host pointer
+
+  float* policyPassResults; //Host pointer
+  float* policyResults; //Host pointer
+  float* valueResults; //Host pointer
+  float* scoreValueResults; //Host pointer
+  float* ownershipResults; //Host pointer
+
+  InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen) {
+    const ModelDesc& m = loadedModel->modelDesc;
+
+    maxBatchSize = maxBatchSz;
+    singleInputElts = (size_t)m.numInputChannels * nnXLen * nnYLen;
+    singleInputBytes = (size_t)m.numInputChannels * nnXLen * nnYLen * sizeof(float);
+    singleInputGlobalElts = (size_t)m.numInputGlobalChannels;
+    singleInputGlobalBytes = (size_t)m.numInputGlobalChannels * sizeof(float);
+    singleInputMetaElts = (size_t)m.numInputMetaChannels;
+    singleInputMetaBytes = (size_t)m.numInputMetaChannels * sizeof(float);
+    singlePolicyPassResultElts = (size_t)(m.numPolicyChannels);
+    singlePolicyPassResultBytes = (size_t)(m.numPolicyChannels) * sizeof(float);
+    singlePolicyResultElts = (size_t)(m.numPolicyChannels * nnXLen * nnYLen);
+    singlePolicyResultBytes = (size_t)(m.numPolicyChannels * nnXLen * nnYLen) * sizeof(float);
+    singleValueResultElts = (size_t)m.numValueChannels;
+    singleValueResultBytes = (size_t)m.numValueChannels * sizeof(float);
+    singleScoreValueResultElts = (size_t)m.numScoreValueChannels;
+    singleScoreValueResultBytes = (size_t)m.numScoreValueChannels * sizeof(float);
+    singleOwnershipResultElts = (size_t)m.numOwnershipChannels * nnXLen * nnYLen;
+    singleOwnershipResultBytes = (size_t)m.numOwnershipChannels * nnXLen * nnYLen * sizeof(float);
+
+    assert(NNModelVersion::getNumSpatialFeatures(m.modelVersion) == m.numInputChannels);
+    assert(NNModelVersion::getNumGlobalFeatures(m.modelVersion) == m.numInputGlobalChannels);
+    if(m.numInputMetaChannels > 0) {
+      assert(SGFMetadata::METADATA_INPUT_NUM_CHANNELS == m.numInputMetaChannels);
+    }
+
+    userInputBufferBytes = (size_t)m.numInputChannels * maxBatchSize * nnXLen * nnYLen * sizeof(float);
+    userInputGlobalBufferBytes = (size_t)m.numInputGlobalChannels * maxBatchSize * sizeof(float);
+    userInputMetaBufferBytes = (size_t)m.numInputMetaChannels * maxBatchSize * sizeof(float);
+    policyPassResultBufferBytes = (size_t)maxBatchSize * m.numPolicyChannels * sizeof(float);
+    policyResultBufferBytes = (size_t)maxBatchSize * m.numPolicyChannels * nnXLen * nnYLen * sizeof(float);
+    valueResultBufferBytes = (size_t)maxBatchSize * m.numValueChannels * sizeof(float);
+    scoreValueResultBufferBytes = (size_t)maxBatchSize * m.numScoreValueChannels * sizeof(float);
+    ownershipResultBufferBytes = (size_t)maxBatchSize * nnXLen * nnYLen * m.numOwnershipChannels * sizeof(float);
+
+    userInputBuffer = new float[(size_t)m.numInputChannels * maxBatchSize * nnXLen * nnYLen];
+    userInputGlobalBuffer = new float[(size_t)m.numInputGlobalChannels * maxBatchSize];
+    if(m.numInputMetaChannels > 0)
+      userInputMetaBuffer = new float[(size_t)m.numInputMetaChannels * maxBatchSize];
+    else
+      userInputMetaBuffer = NULL;
+
+    policyPassResults = new float[(size_t)maxBatchSize * m.numPolicyChannels];
+    policyResults = new float[(size_t)maxBatchSize * m.numPolicyChannels * nnXLen * nnYLen];
+    valueResults = new float[(size_t)maxBatchSize * m.numValueChannels];
+
+    scoreValueResults = new float[(size_t)maxBatchSize * m.numScoreValueChannels];
+    ownershipResults = new float[(size_t)maxBatchSize * nnXLen * nnYLen * m.numOwnershipChannels];
+  }
+
+  ~InputBuffers() {
+    delete[] userInputBuffer;
+    delete[] userInputGlobalBuffer;
+    if(userInputMetaBuffer != NULL)
+      delete[] userInputMetaBuffer;
+    delete[] policyPassResults;
+    delete[] policyResults;
+    delete[] valueResults;
+    delete[] scoreValueResults;
+    delete[] ownershipResults;
+  }
+
+  InputBuffers() = delete;
+  InputBuffers(const InputBuffers&) = delete;
+  InputBuffers& operator=(const InputBuffers&) = delete;
+
+};
+
+InputBuffers* NeuralNet::createInputBuffers(const LoadedModel* loadedModel, int maxBatchSize, int nnXLen, int nnYLen) {
+  return new InputBuffers(loadedModel,maxBatchSize,nnXLen,nnYLen);
+}
+void NeuralNet::freeInputBuffers(InputBuffers* inputBuffers) {
+  delete inputBuffers;
+}
+
+//---------------------------------------------------------------------------------------
+
+
+void NeuralNet::getOutput(
+  ComputeHandle* gpuHandle,
+  InputBuffers* inputBuffers,
+  int numBatchEltsFilled,
+  NNResultBuf** inputBufs,
+  vector<NNOutput*>& outputs
+) {
+  assert(numBatchEltsFilled <= inputBuffers->maxBatchSize);
+  assert(numBatchEltsFilled > 0);
+  const int batchSize = numBatchEltsFilled;
+  const int nnXLen = gpuHandle->nnXLen;
+  const int nnYLen = gpuHandle->nnYLen;
+  const int modelVersion = gpuHandle->model->modelVersion;
+
+  const int numSpatialFeatures = NNModelVersion::getNumSpatialFeatures(modelVersion);
+  const int numGlobalFeatures = NNModelVersion::getNumGlobalFeatures(modelVersion);
+  const int numMetaFeatures = inputBuffers->singleInputMetaElts;
+  assert(numSpatialFeatures == gpuHandle->model->numInputChannels);
+  assert(numSpatialFeatures * nnXLen * nnYLen == inputBuffers->singleInputElts);
+  assert(numGlobalFeatures == inputBuffers->singleInputGlobalElts);
+  const int numPolicyChannels = gpuHandle->model->numPolicyChannels;
+
+  for(int nIdx = 0; nIdx<batchSize; nIdx++) {
+    float* rowSpatialInput = inputBuffers->userInputBuffer + (inputBuffers->singleInputElts * nIdx);
+    float* rowGlobalInput = inputBuffers->userInputGlobalBuffer + (inputBuffers->singleInputGlobalElts * nIdx);
+    float* rowMetaInput = inputBuffers->userInputMetaBuffer + (inputBuffers->singleInputMetaElts * nIdx);
+
+    const float* rowGlobal = inputBufs[nIdx]->rowGlobalBuf.data();
+    const float* rowSpatial = inputBufs[nIdx]->rowSpatialBuf.data();
+    const float* rowMeta = inputBufs[nIdx]->rowMetaBuf.data();
+    bool hasRowMeta = inputBufs[nIdx]->hasRowMeta;
+    std::copy(rowGlobal,rowGlobal+numGlobalFeatures,rowGlobalInput);
+    if(numMetaFeatures > 0) {
+      testAssert(rowMeta != NULL);
+      testAssert(hasRowMeta);
+      std::copy(rowMeta,rowMeta+numMetaFeatures,rowMetaInput);
+    }
+    else {
+      testAssert(!hasRowMeta);
+    }
+    SymmetryHelpers::copyInputsWithSymmetry(rowSpatial, rowSpatialInput, 1, nnYLen, nnXLen, numSpatialFeatures, gpuHandle->inputsUseNHWC, inputBufs[nIdx]->symmetry);
+  }
+
+  Buffers* buffers = gpuHandle->buffers.get();
+  ScratchBuffers* scratch = gpuHandle->scratch.get();
+
+  if(!gpuHandle->usingFP16) {
+    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytes);
+    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytes);
+    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytes);
+    assert(inputBuffers->policyPassResultBufferBytes == buffers->policyPassBufBytes);
+    assert(inputBuffers->policyResultBufferBytes == buffers->policyBufBytes);
+    assert(inputBuffers->valueResultBufferBytes == buffers->valueBufBytes);
+    assert(inputBuffers->singleInputBytes == inputBuffers->singleInputElts*4);
+    assert(inputBuffers->singleInputGlobalBytes == inputBuffers->singleInputGlobalElts*4);
+    assert(inputBuffers->singleInputMetaBytes == inputBuffers->singleInputMetaElts*4);
+    assert(inputBuffers->singlePolicyPassResultElts == numPolicyChannels);
+    assert(inputBuffers->singlePolicyPassResultBytes == numPolicyChannels * sizeof(float));
+    assert(inputBuffers->singlePolicyResultElts == numPolicyChannels*nnXLen*nnYLen);
+    assert(inputBuffers->singlePolicyResultBytes == numPolicyChannels*nnXLen*nnYLen * sizeof(float));
+    assert(inputBuffers->scoreValueResultBufferBytes == buffers->scoreValueBufBytes);
+    assert(inputBuffers->ownershipResultBufferBytes == buffers->ownershipBufBytes);
+    assert(inputBuffers->singleOwnershipResultElts == nnXLen*nnYLen);
+    assert(inputBuffers->singleOwnershipResultBytes == nnXLen*nnYLen * sizeof(float));
+
+    CUDA_ERR("getOutput",hipMemcpy(buffers->inputBuf, inputBuffers->userInputBuffer, inputBuffers->singleInputBytes*batchSize, hipMemcpyHostToDevice));
+    CUDA_ERR("getOutput",hipMemcpy(buffers->inputGlobalBuf, inputBuffers->userInputGlobalBuffer, inputBuffers->singleInputGlobalBytes*batchSize, hipMemcpyHostToDevice));
+    if(numMetaFeatures > 0) {
+      CUDA_ERR("getOutput",hipMemcpy(buffers->inputMetaBuf, inputBuffers->userInputMetaBuffer, inputBuffers->singleInputMetaBytes*batchSize, hipMemcpyHostToDevice));
+    }
+  }
+  else {
+    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytesFloat);
+    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytesFloat);
+    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytesFloat);
+    assert(inputBuffers->policyResultBufferBytes == buffers->policyBufBytes);
+    assert(inputBuffers->valueResultBufferBytes == buffers->valueBufBytes);
+    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytes*2);
+    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytes*2);
+    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytes*2);
+    assert(inputBuffers->singleInputBytes == inputBuffers->singleInputElts*4);
+    assert(inputBuffers->singleInputGlobalBytes == inputBuffers->singleInputGlobalElts*4);
+    assert(inputBuffers->singleInputMetaBytes == inputBuffers->singleInputMetaElts*4);
+    assert(inputBuffers->singlePolicyPassResultElts == numPolicyChannels);
+    assert(inputBuffers->singlePolicyPassResultBytes == numPolicyChannels * sizeof(float));
+    assert(inputBuffers->singlePolicyResultElts == numPolicyChannels*nnXLen*nnYLen);
+    assert(inputBuffers->singlePolicyResultBytes == numPolicyChannels*nnXLen*nnYLen * sizeof(float));
+    assert(inputBuffers->scoreValueResultBufferBytes == buffers->scoreValueBufBytes);
+    assert(inputBuffers->ownershipResultBufferBytes == buffers->ownershipBufBytes);
+    assert(inputBuffers->singleOwnershipResultElts == nnXLen*nnYLen);
+    assert(inputBuffers->singleOwnershipResultBytes == nnXLen*nnYLen * sizeof(float));
+
+    CUDA_ERR("getOutput",hipMemcpy(buffers->inputBufFloat, inputBuffers->userInputBuffer, inputBuffers->singleInputBytes*batchSize, hipMemcpyHostToDevice));
+    CUDA_ERR("getOutput",hipMemcpy(buffers->inputGlobalBufFloat, inputBuffers->userInputGlobalBuffer, inputBuffers->singleInputGlobalBytes*batchSize, hipMemcpyHostToDevice));
+    if(numMetaFeatures > 0) {
+      CUDA_ERR("getOutput",hipMemcpy(buffers->inputMetaBufFloat, inputBuffers->userInputMetaBuffer, inputBuffers->singleInputMetaBytes*batchSize, hipMemcpyHostToDevice));
+    }
+
+    customCudaCopyToHalf((const float*)buffers->inputBufFloat,(half*)buffers->inputBuf,inputBuffers->singleInputElts*batchSize);
+    CUDA_ERR("getOutput",hipPeekAtLastError());
+    customCudaCopyToHalf((const float*)buffers->inputGlobalBufFloat,(half*)buffers->inputGlobalBuf,inputBuffers->singleInputGlobalElts*batchSize);
+    CUDA_ERR("getOutput",hipPeekAtLastError());
+    if(numMetaFeatures > 0) {
+      customCudaCopyToHalf((const float*)buffers->inputMetaBufFloat,(half*)buffers->inputMetaBuf,inputBuffers->singleInputMetaElts*batchSize);
+      CUDA_ERR("getOutput",hipPeekAtLastError());
+    }
+  }
+
+  gpuHandle->model->apply(
+    gpuHandle->cudaHandles.get(),
+    scratch,
+    batchSize,
+    gpuHandle->requireExactNNLen,
+
+    buffers->inputBuf,
+    buffers->inputGlobalBuf,
+    buffers->inputMetaBuf,
+
+    buffers->policyPassBuf,
+    buffers->policyBuf,
+
+    buffers->valueBuf,
+    buffers->scoreValueBuf,
+    buffers->ownershipBuf,
+
+    buffers->workspaceBuf,
+    buffers->workspaceBytes
+  );
+
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->policyPassResults, buffers->policyPassBuf, inputBuffers->singlePolicyPassResultBytes*batchSize, hipMemcpyDeviceToHost));
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->policyResults, buffers->policyBuf, inputBuffers->singlePolicyResultBytes*batchSize, hipMemcpyDeviceToHost));
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->valueResults, buffers->valueBuf, inputBuffers->singleValueResultBytes*batchSize, hipMemcpyDeviceToHost));
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->scoreValueResults, buffers->scoreValueBuf, inputBuffers->singleScoreValueResultBytes*batchSize, hipMemcpyDeviceToHost));
+  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->ownershipResults, buffers->ownershipBuf, inputBuffers->singleOwnershipResultBytes*batchSize, hipMemcpyDeviceToHost));
+
+  assert(outputs.size() == batchSize);
+
+  float policyProbsTmp[NNPos::MAX_NN_POLICY_SIZE];
+
+  for(int row = 0; row < batchSize; row++) {
+    NNOutput* output = outputs[row];
+    assert(output->nnXLen == nnXLen);
+    assert(output->nnYLen == nnYLen);
+    float policyOptimism = (float)inputBufs[row]->policyOptimism;
+
+    const float* policyPassSrcBuf = inputBuffers->policyPassResults + row * numPolicyChannels;
+    const float* policySrcBuf = inputBuffers->policyResults + row * numPolicyChannels * nnXLen * nnYLen;
+    float* policyProbs = output->policyProbs;
+
+    // These are in logits, the client does the postprocessing to turn them into
+    // policy probabilities and white game outcome probabilities
+    // Also we don't fill in the nnHash here either
+    // Handle version >= 12 policy optimism
+    if(numPolicyChannels == 2 || (numPolicyChannels == 4 && modelVersion >= 16)) {
+       if(gpuHandle->usingNHWC) {
+        for(int i = 0; i<nnXLen*nnYLen; i++) {
+          float p = policySrcBuf[i*numPolicyChannels];
+          float pOpt = policySrcBuf[i*numPolicyChannels+1];
+          policyProbsTmp[i] = p + (pOpt-p) * policyOptimism;
+        }
+        SymmetryHelpers::copyOutputsWithSymmetry(policyProbsTmp, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+        policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0] + (policyPassSrcBuf[1] - policyPassSrcBuf[0]) * policyOptimism;
+      }
+      else {
+        for(int i = 0; i<nnXLen*nnYLen; i++) {
+          float p = policySrcBuf[i];
+          float pOpt = policySrcBuf[i+nnXLen*nnYLen];
+          policyProbsTmp[i] = p + (pOpt-p) * policyOptimism;
+        }
+        SymmetryHelpers::copyOutputsWithSymmetry(policyProbsTmp, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+        policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0] + (policyPassSrcBuf[1] - policyPassSrcBuf[0]) * policyOptimism;
+      }
+    }
+    else {
+      assert(numPolicyChannels == 1);
+      SymmetryHelpers::copyOutputsWithSymmetry(policySrcBuf, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+      policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0];
+    }
+
+    int numValueChannels = gpuHandle->model->numValueChannels;
+    assert(numValueChannels == 3);
+    output->whiteWinProb = inputBuffers->valueResults[row * numValueChannels];
+    output->whiteLossProb = inputBuffers->valueResults[row * numValueChannels + 1];
+    output->whiteNoResultProb = inputBuffers->valueResults[row * numValueChannels + 2];
+
+    //As above, these are NOT actually from white's perspective, but rather the player to move.
+    //As usual the client does the postprocessing.
+    if(output->whiteOwnerMap != NULL) {
+      const float* ownershipSrcBuf = inputBuffers->ownershipResults + row * nnXLen * nnYLen;
+      assert(gpuHandle->model->numOwnershipChannels == 1);
+      SymmetryHelpers::copyOutputsWithSymmetry(ownershipSrcBuf, output->whiteOwnerMap, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+    }
+
+    if(modelVersion >= 9) {
+      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
+      assert(numScoreValueChannels == 6);
+      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
+      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
+      output->whiteLead = inputBuffers->scoreValueResults[row * numScoreValueChannels + 2];
+      output->varTimeLeft = inputBuffers->scoreValueResults[row * numScoreValueChannels + 3];
+      output->shorttermWinlossError = inputBuffers->scoreValueResults[row * numScoreValueChannels + 4];
+      output->shorttermScoreError = inputBuffers->scoreValueResults[row * numScoreValueChannels + 5];
+    }
+    else if(modelVersion >= 8) {
+      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
+      assert(numScoreValueChannels == 4);
+      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
+      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
+      output->whiteLead = inputBuffers->scoreValueResults[row * numScoreValueChannels + 2];
+      output->varTimeLeft = inputBuffers->scoreValueResults[row * numScoreValueChannels + 3];
+      output->shorttermWinlossError = 0;
+      output->shorttermScoreError = 0;
+    }
+    else if(modelVersion >= 4) {
+      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
+      assert(numScoreValueChannels == 2);
+      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
+      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
+      output->whiteLead = output->whiteScoreMean;
+      output->varTimeLeft = 0;
+      output->shorttermWinlossError = 0;
+      output->shorttermScoreError = 0;
+    }
+    else if(modelVersion >= 3) {
+      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
+      assert(numScoreValueChannels == 1);
+      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
+      //Version 3 neural nets don't have any second moment output, implicitly already folding it in, so we just use the mean squared
+      output->whiteScoreMeanSq = output->whiteScoreMean * output->whiteScoreMean;
+      output->whiteLead = output->whiteScoreMean;
+      output->varTimeLeft = 0;
+      output->shorttermWinlossError = 0;
+      output->shorttermScoreError = 0;
+    }
+    else {
+      ASSERT_UNREACHABLE;
+    }
+  }
+
+}
+
+//TESTING ----------------------------------------------------------------------------------
+
+
+bool NeuralNet::testEvaluateConv(
+  const ConvLayerDesc* desc,
+  int desiredBatchSize,
+  int nnXLen,
+  int nnYLen,
+  bool useFP16,
+  bool useNHWC,
+  const vector<float>& inputBuffer,
+  vector<float>& outputBuffer
+) {
+  hipDeviceSynchronize();
+  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
+
+  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->inChannels;
+  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->outChannels;
+  if(numInputFloats != inputBuffer.size())
+    throw StringError("testEvaluateConv: unexpected input buffer size");
+
+  void* deviceInput;
+  void* deviceOutput;
+  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
+  CudaUtils::mallocOnDevice("deviceOutput", numOutputFloats, deviceOutput, useFP16);
+
+  int maxBatchSize = desiredBatchSize;
+
+  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
+  ConvLayer* convLayer = new ConvLayer(cudaHandles,manager,desc,useFP16,useNHWC);
+
+  size_t workspaceBytes =
+    convLayer->requiredWorkspaceBytes(cudaHandles,desiredBatchSize);
+  void* deviceWorkspace;
+  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
+
+
+  bool accumulate = false;
+  convLayer->apply(
+    cudaHandles,
+    desiredBatchSize,
+    accumulate,
+    deviceInput,
+    deviceOutput,
+    deviceWorkspace,
+    workspaceBytes
+  );
+
+  outputBuffer.resize(numOutputFloats);
+  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceOutput, useFP16);
+
+  hipFree(deviceWorkspace);
+
+  delete convLayer;
+  delete manager;
+  hipFree(deviceInput);
+  hipFree(deviceOutput);
+  delete cudaHandles;
+
+  return true;
+}
+
+
+bool NeuralNet::testEvaluateBatchNorm(
+  const BatchNormLayerDesc* desc,
+  int desiredBatchSize,
+  int nnXLen,
+  int nnYLen,
+  bool useFP16,
+  bool useNHWC,
+  const vector<float>& inputBuffer,
+  const vector<float>& maskBuffer,
+  vector<float>& outputBuffer
+) {
+  hipDeviceSynchronize();
+  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
+
+  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->numChannels;
+  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
+  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->numChannels;
+  if(numInputFloats != inputBuffer.size())
+    throw StringError("testEvaluateBatchNorm: unexpected input buffer size");
+  if(numMaskFloats != maskBuffer.size())
+    throw StringError("testEvaluateBatchNorm: unexpected mask buffer size");
+
+  ActivationLayerDesc actDesc;
+  actDesc.activation = ACTIVATION_IDENTITY;
+
+  void* deviceInput;
+  void* deviceMask;
+  void* deviceOutput;
+  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
+  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
+  CudaUtils::mallocOnDevice("deviceOutput", numOutputFloats, deviceOutput, useFP16);
+
+  BatchNormLayer* batchNormLayer = new BatchNormLayer(cudaHandles,desc,&actDesc,nnXLen,nnYLen,useFP16,useNHWC);
+
+  batchNormLayer->apply(
+    cudaHandles,
+    desiredBatchSize,
+    deviceInput,
+    deviceMask,
+    deviceOutput
+  );
+
+  outputBuffer.resize(numOutputFloats);
+  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceOutput, useFP16);
+
+  delete batchNormLayer;
+
+  hipFree(deviceInput);
+  hipFree(deviceMask);
+  hipFree(deviceOutput);
+  delete cudaHandles;
+
+  return true;
+}
+
+
+bool NeuralNet::testEvaluateResidualBlock(
+  const ResidualBlockDesc* desc,
+  int desiredBatchSize,
+  int nnXLen,
+  int nnYLen,
+  bool useFP16,
+  bool useNHWC,
+  const vector<float>& inputBuffer,
+  const vector<float>& maskBuffer,
+  vector<float>& outputBuffer
+) {
+  hipDeviceSynchronize();
+  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
+
+  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->preBN.numChannels;
+  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
+  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->finalConv.outChannels;
+  if(numInputFloats != inputBuffer.size())
+    throw StringError("testEvaluateResidualBlock: unexpected input buffer size");
+  if(numMaskFloats != maskBuffer.size())
+    throw StringError("testEvaluateResidualBlock: unexpected mask buffer size");
+
+  ScratchBuffers* scratch = new ScratchBuffers(desiredBatchSize, nnXLen, nnYLen, useFP16);
+
+  void* deviceInput;
+  void* deviceMask;
+  void* deviceScratch;
+  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
+  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
+  CudaUtils::mallocOnDevice("deviceScratch", numInputFloats, deviceScratch, useFP16);
+
+  int maxBatchSize = desiredBatchSize;
+
+  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
+  ResidualBlock* residualBlock = new ResidualBlock(cudaHandles,manager,desc,nnXLen,nnYLen,useFP16,useNHWC);
+
+  size_t workspaceBytes =
+    residualBlock->requiredWorkspaceBytes(cudaHandles,desiredBatchSize);
+  void* deviceWorkspace;
+  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
+
+  residualBlock->apply(
+    cudaHandles,
+    scratch,
+    desiredBatchSize,
+    deviceInput,
+    deviceScratch,
+    deviceMask,
+    deviceWorkspace,
+    workspaceBytes
+  );
+
+  outputBuffer.resize(numOutputFloats);
+  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceInput, useFP16);
+
+  hipFree(deviceWorkspace);
+
+  delete residualBlock;
+  delete manager;
+  hipFree(deviceInput);
+  hipFree(deviceMask);
+  hipFree(deviceScratch);
+  delete scratch;
+  delete cudaHandles;
+
+  return true;
+}
+
+bool NeuralNet::testEvaluateGlobalPoolingResidualBlock(
+  const GlobalPoolingResidualBlockDesc* desc,
+  int desiredBatchSize,
+  int nnXLen,
+  int nnYLen,
+  bool useFP16,
+  bool useNHWC,
+  const vector<float>& inputBuffer,
+  const vector<float>& maskBuffer,
+  vector<float>& outputBuffer
+) {
+  hipDeviceSynchronize();
+  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
+
+  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->preBN.numChannels;
+  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
+  size_t numMaskSumFloats = (size_t)desiredBatchSize;
+  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->finalConv.outChannels;
+
+  if(numInputFloats != inputBuffer.size())
+    throw StringError("testEvaluateGlobalPoolingResidualBlock: unexpected input buffer size");
+  if(numMaskFloats != maskBuffer.size())
+    throw StringError("testEvaluateGlobalPoolingResidualBlock: unexpected mask buffer size");
+
+  ScratchBuffers* scratch = new ScratchBuffers(desiredBatchSize, nnXLen, nnYLen, useFP16);
+
+  void* deviceInput;
+  void* deviceMask;
+  float* deviceMaskFloatOrig;
+  float* deviceMaskFloat;
+  float* deviceMaskSum;
+  void* deviceScratch;
+
+  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
+  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
+  CUDA_ERR("deviceMaskFloat",hipMalloc(reinterpret_cast<void**>(&deviceMaskFloat), numMaskFloats * sizeof(float)));
+  CUDA_ERR("deviceMaskSum",hipMalloc(reinterpret_cast<void**>(&deviceMaskSum), numMaskSumFloats * sizeof(float)));
+  deviceMaskFloatOrig = deviceMaskFloat;
+  CudaUtils::mallocOnDevice("deviceScratch", numInputFloats, deviceScratch, useFP16);
+
+  fillMaskFloatBufAndMaskSumBuf(deviceMask, deviceMaskFloat, deviceMaskSum, useFP16, desiredBatchSize, nnXLen, nnYLen);
+
+  int maxBatchSize = desiredBatchSize;
+
+  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
+  GlobalPoolingResidualBlock* residualBlock = new GlobalPoolingResidualBlock(
+    cudaHandles,manager,desc,nnXLen,nnYLen,useFP16,useNHWC
+  );
+
+  size_t workspaceBytes =
+    residualBlock->requiredWorkspaceBytes(
+      cudaHandles,desiredBatchSize
+    );
+
+  void* deviceWorkspace;
+  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
+
+  residualBlock->apply(
+    cudaHandles,
+    scratch,
+    desiredBatchSize,
+    deviceInput,
+    deviceScratch,
+    deviceMask,
+    deviceMaskSum,
+    deviceWorkspace,
+    workspaceBytes
+  );
+
+  outputBuffer.resize(numOutputFloats);
+  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceInput, useFP16);
+
+  hipFree(deviceWorkspace);
+
+  delete residualBlock;
+  delete manager;
+
+  hipFree(deviceInput);
+  hipFree(deviceMask);
+  hipFree(deviceMaskFloatOrig);
+  hipFree(deviceMaskSum);
+  hipFree(deviceScratch);
+  delete scratch;
+  delete cudaHandles;
+
+  return true;
+}
+
+
+#endif  // USE_ROCM_BACKEND

From c1a09cf343054b1ad9ba5f93d56ac723bf4aadcc Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 2 Aug 2025 02:44:33 +0200
Subject: [PATCH 09/24] Update

---
 cpp/neuralnet/rocmbackend_new.cpp | 55 ++++++++++++++-----------------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/cpp/neuralnet/rocmbackend_new.cpp b/cpp/neuralnet/rocmbackend_new.cpp
index af1164f19..af6bee51e 100644
--- a/cpp/neuralnet/rocmbackend_new.cpp
+++ b/cpp/neuralnet/rocmbackend_new.cpp
@@ -346,36 +346,30 @@ struct ConvLayer {
     CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
 
     for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
-      // if(useFP16 && dilationX <= 1 && dilationY <= 1) {
-      //   (*convolutionAlgorithms)[batchSize].fwd_algo = miopenConvolutionFwdAlgoGEMM;
-      // }
-      // else {
-        const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
-        const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
-        const int requestedAlgoCount = 8;
-        int returnedAlgoCount = -1;
-        miopenConvAlgoPerf_t results[2 * requestedAlgoCount];
-        CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
-            cudaHandles->cudnn,
-            inputDescriptor,
-            inputTmp,
-            filterDescriptor,
-            filterBuf,
-            convolutionDescriptor,
-            outputDescriptor,
-            outputTmp,
-            requestedAlgoCount,
-            &returnedAlgoCount,
-            results,
-            workspaceTmp,
-            workspaceBytes,
-            true
-          ));
-        if(returnedAlgoCount <= 0)
-          throw StringError("miopenFindConvolutionForwardAlgorithm returned no algorithms?");
-        (*convolutionAlgorithms)[batchSize] = results[0];
-        printf("%d / %d\n", batchSize, maxBatchSize);
-      // }
+      const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
+      const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
+      const int requestedAlgoCount = 8;
+      int returnedAlgoCount = -1;
+      miopenConvAlgoPerf_t results[2 * requestedAlgoCount];
+      CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
+          cudaHandles->cudnn,
+          inputDescriptor,
+          inputTmp,
+          filterDescriptor,
+          filterBuf,
+          convolutionDescriptor,
+          outputDescriptor,
+          outputTmp,
+          requestedAlgoCount,
+          &returnedAlgoCount,
+          results,
+          workspaceTmp,
+          workspaceBytes,
+          false
+        ));
+      if(returnedAlgoCount <= 0)
+        throw StringError("miopenFindConvolutionForwardAlgorithm returned no algorithms?");
+      (*convolutionAlgorithms)[batchSize] = results[0];
     }
 
     assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
@@ -418,6 +412,7 @@ struct ConvLayer {
     void* workspaceBuf,
     size_t workspaceBytes
   ) const {
+    accumulate = false;
     const float alpha = 1.0f;
     const float beta = accumulate ? 1.0f : 0.0f;
     CUDNN_ERR(name.c_str(), miopenConvolutionForward(

From 0957b88b53515c33c9670a0ac00796557bbc3332 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 2 Aug 2025 03:50:20 +0200
Subject: [PATCH 10/24] Test finished

---
 cpp/neuralnet/rocmbackend.cpp     |  305 ++-
 cpp/neuralnet/rocmbackend_new.cpp | 3011 -----------------------------
 2 files changed, 245 insertions(+), 3071 deletions(-)
 delete mode 100644 cpp/neuralnet/rocmbackend_new.cpp

diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index 260c2c72e..539f0b91a 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -255,8 +255,11 @@ struct ConvLayer {
   ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
   miopenTensorDescriptor_t filterDescriptor;
   miopenConvolutionDescriptor_t convolutionDescriptor;
-  ByBatchSize<miopenConvSolution_t>* convolutionAlgorithms; //array of one for each batch size
+  ByBatchSize<miopenConvAlgoPerf_t>* convolutionAlgorithms; //array of one for each batch size
   void* filterBuf;
+  void* inputTmp;
+  void* outputTmp;
+  void* workspaceTmp;
 
   ConvLayer() = delete;
   ConvLayer(const ConvLayer&) = delete;
@@ -296,6 +299,8 @@ struct ConvLayer {
     inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
     outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
     int maxBatchSize = manager->maxBatchSize;
+    int xLen = manager->nnXLen;
+    int yLen = manager->nnYLen;
 
     bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
 
@@ -329,74 +334,54 @@ struct ConvLayer {
       CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
     }
 
-    convolutionAlgorithms = new ByBatchSize<miopenConvSolution_t>(maxBatchSize);
+    convolutionAlgorithms = new ByBatchSize<miopenConvAlgoPerf_t>(maxBatchSize);
+
+    size_t inBytes  = maxBatchSize * inChannels  * xLen * yLen;
+    size_t outBytes = maxBatchSize * outChannels * xLen * yLen;
+    size_t workspaceBytes = requiredWorkspaceBytes(cudaHandles, maxBatchSize) + 10305856; //1661440; 
+    
+    CudaUtils::mallocOnDevice(name, inBytes, inputTmp, useFP16);
+    CudaUtils::mallocOnDevice(name, outBytes, outputTmp, useFP16);
+    CudaUtils::mallocOnDevice(name, workspaceBytes, workspaceTmp, useFP16);
+    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
 
     for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
-      // if(useFP16 && dilationX <= 1 && dilationY <= 1) {
-      //   (*convolutionAlgorithms)[batchSize].solution_id = 0;
-      //   continue;
-      // }
-      // else {
-        const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
-        const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
-        size_t requestedAlgoCount = 8;
-        size_t returnedAlgoCount = -1;
-        miopenConvSolution_t solutions[2 * requestedAlgoCount];
-        CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
+      const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
+      const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
+      const int requestedAlgoCount = 8;
+      int returnedAlgoCount = -1;
+      miopenConvAlgoPerf_t results[2 * requestedAlgoCount];
+      CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
           cudaHandles->cudnn,
-          filterDescriptor,
           inputDescriptor,
-          convolutionDescriptor,
-          outputDescriptor,
-          &requestedAlgoCount
-        ));
-        CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolution(
-            cudaHandles->cudnn,
-            filterDescriptor,
-            inputDescriptor,
-            convolutionDescriptor,
-            outputDescriptor,
-            requestedAlgoCount,
-            &returnedAlgoCount,
-            solutions
-          ));
-        if(returnedAlgoCount <= 0)
-          throw StringError("miopenConvolutionForwardGetSolution returned no algorithms?");
-        (*convolutionAlgorithms)[batchSize] = solutions[0];
-        CUDNN_ERR(name.c_str(),miopenConvolutionForwardCompileSolution(
-          cudaHandles->cudnn,
+          inputTmp,
           filterDescriptor,
-          inputDescriptor,
+          filterBuf,
           convolutionDescriptor,
           outputDescriptor,
-          (*convolutionAlgorithms)[batchSize].solution_id
+          outputTmp,
+          requestedAlgoCount,
+          &returnedAlgoCount,
+          results,
+          workspaceTmp,
+          workspaceBytes,
+          false
         ));
-      // }
+      if(returnedAlgoCount <= 0)
+        throw StringError("miopenFindConvolutionForwardAlgorithm returned no algorithms?");
+      (*convolutionAlgorithms)[batchSize] = results[0];
     }
 
     assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
 
-    if(filterNHWC) {
-      vector<float> weightsTransposed(desc->weights.size());
-      for(int y = 0; y < convYSize; y++) {
-        for(int x = 0; x < convXSize; x++) {
-          for(int ic = 0; ic < inChannels; ic++) {
-            for(int oc = 0; oc < outChannels; oc++) {
-              weightsTransposed[((oc*convYSize + y)*convXSize + x)*inChannels + ic] =
-                desc->weights[((oc*inChannels + ic)*convYSize + y)*convXSize + x];
-            }
-          }
-        }
-      }
-      CudaUtils::mallocAndCopyToDevice(name,weightsTransposed,filterBuf,useFP16);
-      hipDeviceSynchronize();
-    }
-    else
-      CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
   }
 
   ~ConvLayer() {
     hipFree(filterBuf);
+    hipFree(inputTmp);
+    hipFree(outputTmp);
+    hipFree(workspaceTmp);
     miopenDestroyTensorDescriptor(filterDescriptor);
     miopenDestroyConvolutionDescriptor(convolutionDescriptor);
     delete convolutionAlgorithms;
@@ -407,13 +392,12 @@ struct ConvLayer {
     int batchSize
   ) const {
     size_t workspaceBytes = 0;
-    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionWorkspaceSize(
+    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetWorkSpaceSize(
       cudaHandles->cudnn,
       filterDescriptor,
       inputDescriptors[batchSize],
       convolutionDescriptor,
       outputDescriptors[batchSize],
-      (*convolutionAlgorithms)[batchSize].solution_id,
       &workspaceBytes
     ));
     return workspaceBytes;
@@ -428,25 +412,223 @@ struct ConvLayer {
     void* workspaceBuf,
     size_t workspaceBytes
   ) const {
+    accumulate = false;
     const float alpha = 1.0f;
     const float beta = accumulate ? 1.0f : 0.0f;
-    CUDNN_ERR(name.c_str(), miopenConvolutionForwardImmediate(
+    CUDNN_ERR(name.c_str(), miopenConvolutionForward(
       cudaHandles->cudnn,
-      filterDescriptor,
-      filterBuf,
+      &alpha,
       inputDescriptors[batchSize],
       inputBuf,
+      filterDescriptor,
+      filterBuf,
       convolutionDescriptor,
+      (*convolutionAlgorithms)[batchSize].fwd_algo,
+      &beta,
       outputDescriptors[batchSize],
       outputBuf,
       workspaceBuf,
-      workspaceBytes,
-      (*convolutionAlgorithms)[batchSize].solution_id
+      workspaceBytes
     ));
   }
 
 };
 
+// New ConvLayer structure with MIOpen API
+
+// struct ConvLayer {
+//   const string name;
+//   const int inChannels;
+//   const int outChannels;
+//   ByBatchSizeView<miopenTensorDescriptor_t> inputDescriptors;
+//   ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
+//   miopenTensorDescriptor_t filterDescriptor;
+//   miopenConvolutionDescriptor_t convolutionDescriptor;
+//   ByBatchSize<miopenConvSolution_t>* convolutionAlgorithms; //array of one for each batch size
+//   void* filterBuf;
+
+//   ConvLayer() = delete;
+//   ConvLayer(const ConvLayer&) = delete;
+//   ConvLayer& operator=(const ConvLayer&) = delete;
+
+//   ConvLayer(
+//     CudaHandles* cudaHandles,
+//     CudnnManager* manager,
+//     const ConvLayerDesc* desc,
+//     bool useFP16,
+//     bool useNHWC
+//   ) : ConvLayer(cudaHandles, manager, desc, useFP16, useNHWC, useNHWC)
+//   {}
+
+//   ConvLayer(
+//     CudaHandles* cudaHandles,
+//     CudnnManager* manager,
+//     const ConvLayerDesc* desc,
+//     bool useFP16,
+//     bool useNHWCIn,
+//     bool useNHWCOut
+//   ) :
+//     name(desc->name),
+//     inChannels(desc->inChannels),
+//     outChannels(desc->outChannels)
+//   {
+//     int convYSize = desc->convYSize;
+//     int convXSize = desc->convXSize;
+//     int dilationY = desc->dilationY;
+//     int dilationX = desc->dilationX;
+//     int paddingX = (convXSize / 2) * dilationX;
+//     int paddingY = (convYSize / 2) * dilationY;
+
+//     assert(convXSize % 2 == 1);
+//     assert(convYSize % 2 == 1);
+
+//     inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
+//     outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
+//     int maxBatchSize = manager->maxBatchSize;
+
+//     bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
+
+//     CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&filterDescriptor));
+//     CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
+//       filterDescriptor,
+//       (useFP16 ? miopenHalf : miopenFloat),
+//       outChannels,
+//       inChannels,
+//       convYSize,
+//       convXSize
+//     ));
+
+//     int yStride = 1;
+//     int xStride = 1;
+
+
+//     CUDNN_ERR(name.c_str(),miopenCreateConvolutionDescriptor(&convolutionDescriptor));
+//     CUDNN_ERR(name.c_str(),miopenInitConvolutionDescriptor(
+//       convolutionDescriptor,
+//       miopenConvolution,
+//       paddingY,
+//       paddingX,
+//       yStride,
+//       xStride,
+//       dilationY,
+//       dilationX
+//     ));
+//     if(useFP16) {
+//       int alt = 1; // non‑zero enables alt‑impl on MI2xx+ GPUs
+//       CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
+//     }
+
+//     convolutionAlgorithms = new ByBatchSize<miopenConvSolution_t>(maxBatchSize);
+
+//     for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
+//       const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
+//       const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
+//       size_t requestedAlgoCount = 8;
+//       size_t returnedAlgoCount = -1;
+//       miopenConvSolution_t solutions[2 * requestedAlgoCount];
+//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
+//         cudaHandles->cudnn,
+//         filterDescriptor,
+//         inputDescriptor,
+//         convolutionDescriptor,
+//         outputDescriptor,
+//         &requestedAlgoCount
+//       ));
+//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolution(
+//           cudaHandles->cudnn,
+//           filterDescriptor,
+//           inputDescriptor,
+//           convolutionDescriptor,
+//           outputDescriptor,
+//           requestedAlgoCount,
+//           &returnedAlgoCount,
+//           solutions
+//         ));
+//       if(returnedAlgoCount <= 0)
+//         throw StringError("miopenConvolutionForwardGetSolution returned no algorithms?");
+//       (*convolutionAlgorithms)[batchSize] = solutions[0];
+//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardCompileSolution(
+//         cudaHandles->cudnn,
+//         filterDescriptor,
+//         inputDescriptor,
+//         convolutionDescriptor,
+//         outputDescriptor,
+//         (*convolutionAlgorithms)[batchSize].solution_id
+//       ));
+//     }
+
+//     assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
+
+//     if(filterNHWC) {
+//       vector<float> weightsTransposed(desc->weights.size());
+//       for(int y = 0; y < convYSize; y++) {
+//         for(int x = 0; x < convXSize; x++) {
+//           for(int ic = 0; ic < inChannels; ic++) {
+//             for(int oc = 0; oc < outChannels; oc++) {
+//               weightsTransposed[((oc*convYSize + y)*convXSize + x)*inChannels + ic] =
+//                 desc->weights[((oc*inChannels + ic)*convYSize + y)*convXSize + x];
+//             }
+//           }
+//         }
+//       }
+//       CudaUtils::mallocAndCopyToDevice(name,weightsTransposed,filterBuf,useFP16);
+//       hipDeviceSynchronize();
+//     }
+//     else
+//       CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+//   }
+
+//   ~ConvLayer() {
+//     hipFree(filterBuf);
+//     miopenDestroyTensorDescriptor(filterDescriptor);
+//     miopenDestroyConvolutionDescriptor(convolutionDescriptor);
+//     delete convolutionAlgorithms;
+//   }
+
+//   size_t requiredWorkspaceBytes(
+//     CudaHandles* cudaHandles,
+//     int batchSize
+//   ) const {
+//     size_t workspaceBytes = 0;
+//     CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionWorkspaceSize(
+//       cudaHandles->cudnn,
+//       filterDescriptor,
+//       inputDescriptors[batchSize],
+//       convolutionDescriptor,
+//       outputDescriptors[batchSize],
+//       (*convolutionAlgorithms)[batchSize].solution_id,
+//       &workspaceBytes
+//     ));
+//     return workspaceBytes;
+//   }
+
+//   void apply(
+//     CudaHandles* cudaHandles,
+//     int batchSize,
+//     bool accumulate,
+//     void* inputBuf,
+//     void* outputBuf,
+//     void* workspaceBuf,
+//     size_t workspaceBytes
+//   ) const {
+//     const float alpha = 1.0f;
+//     const float beta = accumulate ? 1.0f : 0.0f;
+//     CUDNN_ERR(name.c_str(), miopenConvolutionForwardImmediate(
+//       cudaHandles->cudnn,
+//       filterDescriptor,
+//       filterBuf,
+//       inputDescriptors[batchSize],
+//       inputBuf,
+//       convolutionDescriptor,
+//       outputDescriptors[batchSize],
+//       outputBuf,
+//       workspaceBuf,
+//       workspaceBytes,
+//       (*convolutionAlgorithms)[batchSize].solution_id
+//     ));
+//   }
+
+// };
 
 //---------------------------------------------------------------------------------
 
@@ -2352,6 +2534,9 @@ ComputeHandle* NeuralNet::createComputeHandle(
     logger->write(
       "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Model name: " + loadedModel->modelDesc.name
     );
+    logger->write(
+      "MIOpen finding convolution algorithms for GPU " + string(prop.name) + ". This may take a while, please wait......"
+    );
   }
 
   ComputeHandle* gpuHandle = new ComputeHandle(
diff --git a/cpp/neuralnet/rocmbackend_new.cpp b/cpp/neuralnet/rocmbackend_new.cpp
deleted file mode 100644
index af6bee51e..000000000
--- a/cpp/neuralnet/rocmbackend_new.cpp
+++ /dev/null
@@ -1,3011 +0,0 @@
-#ifdef USE_ROCM_BACKEND
-
-#include "../neuralnet/rocmerrorcheck.h"
-#include "../neuralnet/rocmincludes.h"
-
-#include "../neuralnet/rocmhelpers.h"
-#include "../neuralnet/rocmutils.h"
-#include "../neuralnet/modelversion.h"
-#include "../neuralnet/nninterface.h"
-#include "../neuralnet/nninputs.h"
-#include "../neuralnet/sgfmetadata.h"
-#include "../neuralnet/nneval.h"
-#include "../neuralnet/desc.h"
-
-#include "../core/simpleallocator.h"
-#include "../core/test.h"
-
-#include "../external/half-2.2.0/include/half.hpp"
-
-//------------------------
-#include "../core/using.h"
-//------------------------
-
-using half_t = half_float::half;
-
-//Define this to print out some of the intermediate values of the neural net
-//#define DEBUG_INTERMEDIATE_VALUES
-
-void NeuralNet::globalInitialize() {
-  //Empty for cudnn backend
-}
-
-void NeuralNet::globalCleanup() {
-  hipDeviceReset();
-}
-
-struct CudaHandles {
-  hipblasHandle_t cublas;
-  miopenHandle_t cudnn;
-  const int majorComputeCapability;
-  const int minorComputeCapability;
-
-  CudaHandles(int major, int minor)
-    : majorComputeCapability(major),
-      minorComputeCapability(minor)
-  {
-    CUBLAS_ERR("CudaHandles",hipblasCreate(&cublas));
-    CUDNN_ERR("CudaHandles",miopenCreate(&cudnn));
-  }
-
-  ~CudaHandles() {
-    hipblasDestroy(cublas);
-    miopenDestroy(cudnn);
-  }
-
-  static CudaHandles* cudaHandlesTesting() {
-    const int gpuIdxForThisThread = 0;
-    hipDeviceProp_t prop;
-    hipGetDeviceProperties(&prop,gpuIdxForThisThread);
-    return new CudaHandles(prop.major, prop.minor);
-  }
-
-  CudaHandles(const CudaHandles&) = delete;
-  CudaHandles& operator=(const CudaHandles&) = delete;
-};
-
-//---------------------------------------------------------------------------------
-
-template<typename T>
-struct ByBatchSize {
-  const int maxBatchSize;
-  T* data;
-  miopenStatus_t (*destroyFunc)(T);
-
-  ByBatchSize()
-    : maxBatchSize(0), data(nullptr), destroyFunc(nullptr)
-  {}
-
-  ByBatchSize(
-    int maxBatchSize_
-  ) : maxBatchSize(maxBatchSize_), data(nullptr), destroyFunc(nullptr) {
-    data = new T[maxBatchSize];
-  }
-
-  ByBatchSize(const ByBatchSize&) = delete;
-  ByBatchSize& operator=(const ByBatchSize&) = delete;
-
-  ~ByBatchSize() {
-    if(destroyFunc != nullptr && data != nullptr) {
-      for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
-        (*destroyFunc)(data[batchSize-1]);
-      }
-    }
-    if(data != nullptr) {
-      delete[] data;
-      data = nullptr;
-    }
-  }
-  T& operator[](int batchSize) {
-    return data[batchSize-1];
-  }
-  const T& operator[](int batchSize) const {
-    return data[batchSize-1];
-  }
-};
-
-template<typename T>
-struct ByBatchSizeView {
-  int maxBatchSize;
-  T* data;
-
-  ByBatchSizeView()
-    : maxBatchSize(0), data(nullptr)
-  {}
-
-  ByBatchSizeView(const ByBatchSize<T>& toView)
-    : maxBatchSize(toView.maxBatchSize), data(toView.data)
-  {}
-  ByBatchSizeView& operator=(const ByBatchSize<T>& toView) {
-    maxBatchSize = toView.maxBatchSize;
-    data = toView.data;
-  }
-
-  ~ByBatchSizeView() {
-  }
-  T& operator[](int batchSize) {
-    return data[batchSize-1];
-  }
-  const T& operator[](int batchSize) const {
-    return data[batchSize-1];
-  }
-};
-
-//---------------------------------------------------------------------------------
-
-
-//channels, useFP16, useNHWC
-typedef std::tuple<int, bool, bool> CudnnTensorDesc4DKey;
-
-struct CudnnManager {
-  const string name;
-  const int maxBatchSize;
-  const int nnXLen;
-  const int nnYLen;
-  std::map<CudnnTensorDesc4DKey, ByBatchSize<miopenTensorDescriptor_t>*> tensorDesc4DByBatchSizeByKey;
-
-  CudnnManager(string name_, int maxBatchSize_, int nnXLen_, int nnYLen_)
-    :name(name_),
-     maxBatchSize(maxBatchSize_),
-     nnXLen(nnXLen_),
-     nnYLen(nnYLen_),
-     tensorDesc4DByBatchSizeByKey()
-  {
-  }
-
-  ~CudnnManager() {
-    for(auto& iter: tensorDesc4DByBatchSizeByKey) {
-      delete iter.second;
-    }
-  }
-
-  ByBatchSizeView<miopenTensorDescriptor_t> getTensorDesc4DByBatchSize(
-    int channels, bool useFP16, bool useNHWC
-  ) {
-    auto iter = tensorDesc4DByBatchSizeByKey.find({channels, useFP16, useNHWC});
-    if(iter != tensorDesc4DByBatchSizeByKey.end()) {
-      return ByBatchSizeView<miopenTensorDescriptor_t>(*(iter->second));
-    }
-    ByBatchSize<miopenTensorDescriptor_t>* descs = new ByBatchSize<miopenTensorDescriptor_t>(maxBatchSize);
-    for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
-      miopenTensorDescriptor_t& desc = (*descs)[batchSize];
-      CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&desc));
-      CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
-                  desc,
-                  (useFP16 ? miopenHalf : miopenFloat),
-                  batchSize,
-                  channels,
-                  nnYLen,
-                  nnXLen
-                ));
-    }
-    descs->destroyFunc = miopenDestroyTensorDescriptor;
-    tensorDesc4DByBatchSizeByKey[{channels, useFP16, useNHWC}] = descs;
-    return ByBatchSizeView<miopenTensorDescriptor_t>(*descs);
-  }
-};
-
-//---------------------------------------------------------------------------------
-
-struct ScratchBuffers {
-
-  const size_t batchXYFloatBytes;
-  const size_t batchFloatBytes;
-  const size_t batchXYBytes;
-  const size_t batchBytes;
-
-  SimpleAllocator<void*>* allocator;
-
-  // Not scratch, but convenient to have here
-  void* zeroBuf;
-  void* oneBuf;
-
-  ScratchBuffers() = delete;
-  ScratchBuffers(const ScratchBuffers&) = delete;
-  ScratchBuffers& operator=(const ScratchBuffers&) = delete;
-
-  ScratchBuffers(int maxBatchSize, int nnXLen, int nnYLen, bool useFP16)
-    : batchXYFloatBytes((size_t)maxBatchSize * nnXLen * nnYLen * sizeof(float)),
-      batchFloatBytes((size_t)maxBatchSize * sizeof(float)),
-      batchXYBytes((size_t)maxBatchSize * nnXLen * nnYLen * (useFP16 ? sizeof(half_t) : sizeof(float))),
-      batchBytes((size_t)maxBatchSize * (useFP16 ? sizeof(half_t) : sizeof(float)))
-  {
-    std::function<void*(size_t)> allocateFunc = [](size_t size) {
-      void* buf;
-      CUDA_ERR("ScratchBuffers",hipMalloc(&buf, size));
-      return buf;
-    };
-    std::function<void(void*)> releaseFunc = [](void* buf) {
-      hipFree(buf);
-    };
-
-    allocator = new SimpleAllocator<void*>(allocateFunc, releaseFunc);
-
-    CudaUtils::hostMallocZeroOneBufs(zeroBuf, oneBuf, useFP16);
-  }
-  ~ScratchBuffers() {
-    delete allocator;
-    free(zeroBuf);
-    free(oneBuf);
-  }
-
-  size_t getBufSizeXY(int channels) const {
-    return channels * batchXYBytes;
-  }
-  size_t getBufSizeXYFloat(int channels) const {
-    return channels * batchXYFloatBytes;
-  }
-  size_t getBufSizeFloat(int channels) const {
-    return channels * batchFloatBytes;
-  }
-  size_t getBufSize(int channels) const {
-    return channels * batchBytes;
-  }
-
-};
-
-
-//---------------------------------------------------------------------------------
-
-struct ConvLayer {
-  const string name;
-  const int inChannels;
-  const int outChannels;
-  ByBatchSizeView<miopenTensorDescriptor_t> inputDescriptors;
-  ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
-  miopenTensorDescriptor_t filterDescriptor;
-  miopenConvolutionDescriptor_t convolutionDescriptor;
-  ByBatchSize<miopenConvAlgoPerf_t>* convolutionAlgorithms; //array of one for each batch size
-  void* filterBuf;
-  void* inputTmp;
-  void* outputTmp;
-  void* workspaceTmp;
-
-  ConvLayer() = delete;
-  ConvLayer(const ConvLayer&) = delete;
-  ConvLayer& operator=(const ConvLayer&) = delete;
-
-  ConvLayer(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const ConvLayerDesc* desc,
-    bool useFP16,
-    bool useNHWC
-  ) : ConvLayer(cudaHandles, manager, desc, useFP16, useNHWC, useNHWC)
-  {}
-
-  ConvLayer(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const ConvLayerDesc* desc,
-    bool useFP16,
-    bool useNHWCIn,
-    bool useNHWCOut
-  ) :
-    name(desc->name),
-    inChannels(desc->inChannels),
-    outChannels(desc->outChannels)
-  {
-    int convYSize = desc->convYSize;
-    int convXSize = desc->convXSize;
-    int dilationY = desc->dilationY;
-    int dilationX = desc->dilationX;
-    int paddingX = (convXSize / 2) * dilationX;
-    int paddingY = (convYSize / 2) * dilationY;
-
-    assert(convXSize % 2 == 1);
-    assert(convYSize % 2 == 1);
-
-    inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
-    outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
-    int maxBatchSize = manager->maxBatchSize;
-    int xLen = manager->nnXLen;
-    int yLen = manager->nnYLen;
-
-    bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
-
-    CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&filterDescriptor));
-    CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
-      filterDescriptor,
-      (useFP16 ? miopenHalf : miopenFloat),
-      outChannels,
-      inChannels,
-      convYSize,
-      convXSize
-    ));
-
-    int yStride = 1;
-    int xStride = 1;
-
-
-    CUDNN_ERR(name.c_str(),miopenCreateConvolutionDescriptor(&convolutionDescriptor));
-    CUDNN_ERR(name.c_str(),miopenInitConvolutionDescriptor(
-      convolutionDescriptor,
-      miopenConvolution,
-      paddingY,
-      paddingX,
-      yStride,
-      xStride,
-      dilationY,
-      dilationX
-    ));
-    if(useFP16) {
-      int alt = 1; // non‑zero enables alt‑impl on MI2xx+ GPUs
-      CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
-    }
-
-    convolutionAlgorithms = new ByBatchSize<miopenConvAlgoPerf_t>(maxBatchSize);
-
-    size_t inBytes  = maxBatchSize * inChannels  * xLen * yLen;
-    size_t outBytes = maxBatchSize * outChannels * xLen * yLen;
-    size_t workspaceBytes = requiredWorkspaceBytes(cudaHandles, maxBatchSize); 
-    
-    CudaUtils::mallocOnDevice(name, inBytes, inputTmp, useFP16);
-    CudaUtils::mallocOnDevice(name, outBytes, outputTmp, useFP16);
-    CudaUtils::mallocOnDevice(name, workspaceBytes, workspaceTmp, useFP16);
-    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
-
-    for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
-      const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
-      const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
-      const int requestedAlgoCount = 8;
-      int returnedAlgoCount = -1;
-      miopenConvAlgoPerf_t results[2 * requestedAlgoCount];
-      CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
-          cudaHandles->cudnn,
-          inputDescriptor,
-          inputTmp,
-          filterDescriptor,
-          filterBuf,
-          convolutionDescriptor,
-          outputDescriptor,
-          outputTmp,
-          requestedAlgoCount,
-          &returnedAlgoCount,
-          results,
-          workspaceTmp,
-          workspaceBytes,
-          false
-        ));
-      if(returnedAlgoCount <= 0)
-        throw StringError("miopenFindConvolutionForwardAlgorithm returned no algorithms?");
-      (*convolutionAlgorithms)[batchSize] = results[0];
-    }
-
-    assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
-
-    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
-  }
-
-  ~ConvLayer() {
-    hipFree(filterBuf);
-    hipFree(inputTmp);
-    hipFree(outputTmp);
-    hipFree(workspaceTmp);
-    miopenDestroyTensorDescriptor(filterDescriptor);
-    miopenDestroyConvolutionDescriptor(convolutionDescriptor);
-    delete convolutionAlgorithms;
-  }
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t workspaceBytes = 0;
-    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetWorkSpaceSize(
-      cudaHandles->cudnn,
-      filterDescriptor,
-      inputDescriptors[batchSize],
-      convolutionDescriptor,
-      outputDescriptors[batchSize],
-      &workspaceBytes
-    ));
-    return workspaceBytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    int batchSize,
-    bool accumulate,
-    void* inputBuf,
-    void* outputBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    accumulate = false;
-    const float alpha = 1.0f;
-    const float beta = accumulate ? 1.0f : 0.0f;
-    CUDNN_ERR(name.c_str(), miopenConvolutionForward(
-      cudaHandles->cudnn,
-      &alpha,
-      inputDescriptors[batchSize],
-      inputBuf,
-      filterDescriptor,
-      filterBuf,
-      convolutionDescriptor,
-      (*convolutionAlgorithms)[batchSize].fwd_algo,
-      &beta,
-      outputDescriptors[batchSize],
-      outputBuf,
-      workspaceBuf,
-      workspaceBytes
-    ));
-  }
-
-};
-
-
-//---------------------------------------------------------------------------------
-
-struct BatchNormLayer {
-  const string name;
-  const int numChannels;
-  const float epsilon;
-  const int activation;
-  const int nnXLen;
-  const int nnYLen;
-
-  const bool usingFP16;
-  const bool usingNHWC;
-
-  void* mergedScaleBuf;
-  void* mergedBiasBuf;
-
-  BatchNormLayer() = delete;
-  BatchNormLayer(const BatchNormLayer&) = delete;
-  BatchNormLayer& operator=(const BatchNormLayer&) = delete;
-
-  BatchNormLayer(
-    CudaHandles* cudaHandles,
-    const BatchNormLayerDesc* desc,
-    const ActivationLayerDesc* actDesc,
-    int nnX,
-    int nnY,
-    bool useFP16,
-    bool useNHWC
-  ) :
-    name(desc->name),
-    numChannels(desc->numChannels),
-    epsilon(desc->epsilon),
-    activation(actDesc->activation),
-    nnXLen(nnX),
-    nnYLen(nnY),
-    usingFP16(useFP16),
-    usingNHWC(useNHWC)
-  {
-    (void)cudaHandles;
-
-    assert(desc->mean.size() == numChannels);
-    assert(desc->variance.size() == numChannels);
-    assert(desc->scale.size() == numChannels);
-    assert(desc->bias.size() == numChannels);
-    assert(desc->mergedScale.size() == numChannels);
-    assert(desc->mergedBias.size() == numChannels);
-    CudaUtils::mallocAndCopyToDevice(name,desc->mergedScale,mergedScaleBuf,useFP16);
-    CudaUtils::mallocAndCopyToDevice(name,desc->mergedBias,mergedBiasBuf,useFP16);
-  }
-  ~BatchNormLayer() {
-    hipFree(mergedScaleBuf);
-    hipFree(mergedBiasBuf);
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    int batchSize,
-    void* inputBuf,
-    const void* maskBuf, //ok to be null
-    void* outputBuf
-  ) const {
-    (void)cudaHandles;
-    if(!usingFP16) {
-      if(!usingNHWC)
-        customCudaApplyCScaleBiasNCHW((const float*)inputBuf,(float*)outputBuf,(const float*)mergedScaleBuf,(const float*)mergedBiasBuf,
-                                      (const float*)maskBuf,
-                                      batchSize,numChannels,nnXLen*nnYLen,activation);
-      else
-        customCudaApplyCScaleBiasNHWC((const float*)inputBuf,(float*)outputBuf,(const float*)mergedScaleBuf,(const float*)mergedBiasBuf,
-                                      (const float*)maskBuf,
-                                      batchSize,nnXLen*nnYLen,numChannels,activation);
-    }
-    else {
-      if(!usingNHWC)
-        customCudaApplyCScaleBiasNCHW((const half*)inputBuf,(half*)outputBuf,(const half*)mergedScaleBuf,(const half*)mergedBiasBuf,
-                                      (const half*)maskBuf,
-                                      batchSize,numChannels,nnXLen*nnYLen,activation);
-      else
-        customCudaApplyCScaleBiasNHWC((const half*)inputBuf,(half*)outputBuf,(const half*)mergedScaleBuf,(const half*)mergedBiasBuf,
-                                      (const half*)maskBuf,
-                                      batchSize,nnXLen*nnYLen,numChannels,activation);
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-    }
-
-  }
-
-};
-
-
-//---------------------------------------------------------------------------------
-
-struct MatMulLayer {
-  const string name;
-  const int inChannels;
-  const int outChannels;
-  const bool usingFP16;
-  void* matBuf;
-
-  MatMulLayer() = delete;
-  MatMulLayer(const MatMulLayer&) = delete;
-  MatMulLayer& operator=(const MatMulLayer&) = delete;
-
-  MatMulLayer(
-    CudaHandles* cudaHandles,
-    const MatMulLayerDesc* desc,
-    bool useFP16
-  ) :
-    name(desc->name),
-    inChannels(desc->inChannels),
-    outChannels(desc->outChannels),
-    usingFP16(useFP16)
-  {
-    (void)cudaHandles;
-
-    if(inChannels > 0 && outChannels > 0) {
-      assert(desc->weights.size() == inChannels * outChannels);
-      CudaUtils::mallocAndCopyToDevice(name,desc->weights,matBuf,useFP16);
-    }
-    else {
-      matBuf = NULL;
-    }
-  }
-
-  ~MatMulLayer() {
-    if(inChannels > 0 && outChannels > 0)
-      hipFree(matBuf);
-  }
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles
-  ) const {
-    (void)cudaHandles;
-    size_t workspaceBytes = 0;
-    return workspaceBytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* inputBuf,
-    void* outputBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    (void)workspaceBuf;
-    (void)workspaceBytes;
-    assert(inChannels > 0 && outChannels > 0);
-
-    if(!usingFP16) {
-      const float alpha = 1.0f;
-      const float beta = 0.0f;
-      CUBLAS_ERR(name.c_str(),hipblasSgemm(
-        cudaHandles->cublas,
-        HIPBLAS_OP_N,
-        HIPBLAS_OP_N,
-        outChannels,
-        batchSize,
-        inChannels,
-        &alpha,
-        (const float*)matBuf,outChannels,
-        (const float*)inputBuf,inChannels,
-        &beta,
-        (float*)outputBuf,outChannels
-      ));
-    }
-    else {
-      const hipblasHalf* alpha = (const hipblasHalf*)scratch->oneBuf;
-      const hipblasHalf* beta = (const hipblasHalf*)scratch->zeroBuf;
-      CUBLAS_ERR(name.c_str(),hipblasHgemm(
-        cudaHandles->cublas,
-        HIPBLAS_OP_N,
-        HIPBLAS_OP_N,
-        outChannels,
-        batchSize,
-        inChannels,
-        alpha,
-        (const hipblasHalf*)matBuf,outChannels,
-        (const hipblasHalf*)inputBuf,inChannels,
-        beta,
-        (hipblasHalf*)outputBuf,outChannels
-      ));
-    }
-
-  }
-
-};
-
-//---------------------------------------------------------------------------------
-
-struct MatBiasLayer {
-  const string name;
-  const int numChannels;
-  const bool usingFP16;
-  const int activation;
-
-  void* biasBuf;
-
-  MatBiasLayer() = delete;
-  MatBiasLayer(const MatBiasLayer&) = delete;
-  MatBiasLayer& operator=(const MatBiasLayer&) = delete;
-
-  MatBiasLayer(
-    CudaHandles* cudaHandles,
-    const MatBiasLayerDesc* desc,
-    bool useFP16,
-    int activation_
-  ) :
-    name(desc->name),
-    numChannels(desc->numChannels),
-    usingFP16(useFP16),
-    activation(activation_)
-  {
-    (void)cudaHandles;
-    if(numChannels > 0) {
-      assert(desc->weights.size() == numChannels);
-      CudaUtils::mallocAndCopyToDevice(name,desc->weights,biasBuf,useFP16);
-    }
-    else
-      biasBuf = NULL;
-  }
-
-  ~MatBiasLayer() {
-    if(numChannels > 0)
-      hipFree(biasBuf);
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    int batchSize,
-    void* matBuf
-  ) const {
-    (void)cudaHandles;
-    assert(numChannels > 0);
-    if(!usingFP16) {
-      customCudaAddCBiasInplaceNC((float*)matBuf,(const float*)biasBuf,batchSize,numChannels,activation);
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-    }
-    else {
-      customCudaAddCBiasInplaceNC((half*)matBuf,(const half*)biasBuf,batchSize,numChannels,activation);
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-    }
-  }
-
-};
-
-//---------------------------------------------------------------------------------
-
-struct NormActConv {
-  const BatchNormLayer norm;
-  const ConvLayer conv;
-
-  const int inChannels;
-  const int outChannels;
-  const int nnXLen;
-  const int nnYLen;
-  const bool usingFP16;
-  const bool usingNHWC;
-
-  NormActConv() = delete;
-  NormActConv(const NormActConv&) = delete;
-  NormActConv& operator=(const NormActConv&) = delete;
-
-  NormActConv(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const BatchNormLayerDesc* normDesc,
-    const ActivationLayerDesc* actDesc,
-    const ConvLayerDesc* convDesc,
-    int nnX,
-    int nnY,
-    bool useFP16,
-    bool useNHWC
-  ): norm(cudaHandles,normDesc,actDesc,nnX,nnY,useFP16,useNHWC),
-     conv(cudaHandles,manager,convDesc,useFP16,useNHWC),
-     inChannels(norm.numChannels),
-     outChannels(conv.outChannels),
-     nnXLen(nnX),
-     nnYLen(nnY),
-     usingFP16(useFP16),
-     usingNHWC(useNHWC)
-  {
-    assert(norm.numChannels == conv.inChannels);
-  }
-
-  ~NormActConv()
-  {}
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t bytes = 0;
-    size_t b;
-    b = conv.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    return bytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    int batchSize,
-    bool accumulate,
-    void* inBuf,
-    void* inScratchBuf,
-    void* outBuf,
-    void* maskBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    norm.apply(cudaHandles,batchSize,inBuf,maskBuf,inScratchBuf);
-#ifdef DEBUG_INTERMEDIATE_VALUES
-    CudaUtils::debugPrint4D(string("AFTER NORM "), inScratchBuf, batchSize, inChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
-#endif
-    conv.apply(cudaHandles,batchSize,accumulate,inScratchBuf,outBuf,workspaceBuf,workspaceBytes);
-  }
-
-};
-
-
-//---------------------------------------------------------------------------------
-
-struct ResidualBlock {
-  const string name;
-  const NormActConv normActConv1;
-  const NormActConv normActConv2;
-
-  ResidualBlock() = delete;
-  ResidualBlock(const ResidualBlock&) = delete;
-  ResidualBlock& operator=(const ResidualBlock&) = delete;
-
-  ResidualBlock(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const ResidualBlockDesc* desc,
-    int nnX,
-    int nnY,
-    bool useFP16,
-    bool useNHWC
-  ): name(desc->name),
-     normActConv1(cudaHandles,manager,&desc->preBN,&desc->preActivation,&desc->regularConv,nnX,nnY,useFP16,useNHWC),
-     normActConv2(cudaHandles,manager,&desc->midBN,&desc->midActivation,&desc->finalConv,nnX,nnY,useFP16,useNHWC)
-  {
-  }
-
-  ~ResidualBlock()
-  {}
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t bytes = 0;
-    size_t b;
-    b = normActConv1.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    return bytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* trunkBuf,
-    void* trunkScratchBuf,
-    void* maskBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    SizedBuf<void*> midIn(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
-    SizedBuf<void*> midScratch(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
-    normActConv1.apply(cudaHandles,batchSize,false,trunkBuf,trunkScratchBuf,midIn.buf,maskBuf,workspaceBuf,workspaceBytes);
-    normActConv2.apply(cudaHandles,batchSize,true,midIn.buf,midScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
-  }
-
-};
-
-
-//----------------------------------------------------------------------------
-
-
-struct GlobalPoolingResidualBlock {
-  const string name;
-  const BatchNormLayer preBN;
-  const ConvLayer regularConv;
-  const ConvLayer gpoolConv;
-  const BatchNormLayer gpoolBN;
-  const MatMulLayer gpoolToBiasMul;
-  const NormActConv normActConv2;
-
-  const int nnXLen;
-  const int nnYLen;
-  const int regularChannels;
-  const int gpoolChannels;
-  const bool usingFP16;
-  const bool usingNHWC;
-
-  GlobalPoolingResidualBlock() = delete;
-  GlobalPoolingResidualBlock(const GlobalPoolingResidualBlock&) = delete;
-  GlobalPoolingResidualBlock& operator=(const GlobalPoolingResidualBlock&) = delete;
-
-  GlobalPoolingResidualBlock(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const GlobalPoolingResidualBlockDesc* desc,
-    int nnX,
-    int nnY,
-    bool useFP16,
-    bool useNHWC
-  ): name(desc->name),
-     preBN(cudaHandles,&desc->preBN,&desc->preActivation,nnX,nnY,useFP16,useNHWC),
-     regularConv(cudaHandles,manager,&desc->regularConv,useFP16,useNHWC),
-     gpoolConv(cudaHandles,manager,&desc->gpoolConv,useFP16,useNHWC),
-     gpoolBN(cudaHandles,&desc->gpoolBN,&desc->gpoolActivation,nnX,nnY,useFP16,useNHWC),
-     gpoolToBiasMul(cudaHandles,&desc->gpoolToBiasMul,useFP16),
-     normActConv2(cudaHandles,manager,&desc->midBN,&desc->midActivation,&desc->finalConv,nnX,nnY,useFP16,useNHWC),
-     nnXLen(nnX),
-     nnYLen(nnY),
-     regularChannels(desc->regularConv.outChannels),
-     gpoolChannels(desc->gpoolConv.outChannels),
-     usingFP16(useFP16),
-     usingNHWC(useNHWC)
-  {
-  }
-
-  ~GlobalPoolingResidualBlock() {
-  }
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t bytes = 0;
-    size_t b;
-    b = regularConv.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = gpoolConv.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = gpoolToBiasMul.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = sizeof(float)*batchSize*gpoolChannels*nnXLen*nnYLen;
-    bytes = std::max(bytes,b);
-    return bytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* trunkBuf,
-    void* trunkScratchBuf,
-    void* maskBuf,
-    float* maskSumBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    SizedBuf<void*> regularOut(scratch->allocator, scratch->getBufSizeXY(regularChannels));
-    SizedBuf<void*> regularScratch(scratch->allocator, scratch->getBufSizeXY(regularChannels));
-    SizedBuf<void*> gpoolOut(scratch->allocator, scratch->getBufSizeXY(gpoolChannels));
-    SizedBuf<void*> gpoolOut2(scratch->allocator, scratch->getBufSizeXY(gpoolChannels));
-    SizedBuf<void*> gpoolConcat(scratch->allocator, scratch->getBufSize(gpoolChannels*3));
-    SizedBuf<void*> gpoolBias(scratch->allocator, scratch->getBufSize(regularChannels));
-
-    preBN.apply(cudaHandles,batchSize,trunkBuf,maskBuf,trunkScratchBuf);
-    regularConv.apply(cudaHandles,batchSize,false,trunkScratchBuf,regularOut.buf,workspaceBuf,workspaceBytes);
-    gpoolConv.apply(cudaHandles,batchSize,false,trunkScratchBuf,gpoolOut.buf,workspaceBuf,workspaceBytes);
-    gpoolBN.apply(cudaHandles,batchSize,gpoolOut.buf,maskBuf,gpoolOut2.buf);
-
-    if(!usingFP16) {
-      if(!usingNHWC)
-        customCudaPoolRowsGPoolNCHW((const float*)gpoolOut2.buf,(float*)gpoolConcat.buf,batchSize,gpoolChannels,nnXLen*nnYLen,(const float*)maskBuf,maskSumBuf);
-      else
-        customCudaPoolRowsGPoolNHWC((const float*)gpoolOut2.buf,(float*)gpoolConcat.buf,batchSize,nnXLen*nnYLen,gpoolChannels,(const float*)maskBuf,maskSumBuf);
-    }
-    else {
-      if(!usingNHWC)
-        customCudaPoolRowsGPoolNCHW((const half*)gpoolOut2.buf,(half*)gpoolConcat.buf,batchSize,gpoolChannels,nnXLen*nnYLen,(const half*)maskBuf,maskSumBuf);
-      else
-        customCudaPoolRowsGPoolNHWC((const half*)gpoolOut2.buf,(half*)gpoolConcat.buf,batchSize,nnXLen*nnYLen,gpoolChannels,(const half*)maskBuf,maskSumBuf);
-    }
-    CUDA_ERR(name.c_str(),hipPeekAtLastError());
-
-    gpoolToBiasMul.apply(cudaHandles,scratch,batchSize,gpoolConcat.buf,gpoolBias.buf,workspaceBuf,workspaceBytes);
-
-    if(!usingFP16) {
-      if(!usingNHWC)
-        customCudaAddNCBiasInplaceNCHW((float*)regularOut.buf,(const float*)gpoolBias.buf,batchSize,regularChannels,nnXLen*nnYLen);
-      else
-        customCudaAddNCBiasInplaceNHWC((float*)regularOut.buf,(const float*)gpoolBias.buf,batchSize,nnXLen*nnYLen,regularChannels);
-    }
-    else {
-      if(!usingNHWC)
-        customCudaAddNCBiasInplaceNCHW((half*)regularOut.buf,(const half*)gpoolBias.buf,batchSize,regularChannels,nnXLen*nnYLen);
-      else
-        customCudaAddNCBiasInplaceNHWC((half*)regularOut.buf,(const half*)gpoolBias.buf,batchSize,nnXLen*nnYLen,regularChannels);
-    }
-    CUDA_ERR(name.c_str(),hipPeekAtLastError());
-
-    normActConv2.apply(cudaHandles,batchSize,true,regularOut.buf,regularScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
-  }
-
-};
-
-//------------------------------------------------------------------------------
-
-struct BlockStack {
-  const int numBlocks;
-  const int trunkNumChannels;
-  const int nnXLen;
-  const int nnYLen;
-  const bool usingFP16;
-  const bool usingNHWC;
-  vector<pair<int,unique_ptr_void>> blocks;
-
-  BlockStack() = delete;
-  BlockStack(const BlockStack&) = delete;
-  BlockStack& operator=(const BlockStack&) = delete;
-
-  BlockStack(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    int nBlocks,
-    int trunkChannels,
-    const std::vector<std::pair<int, unique_ptr_void>>& descBlocks,
-    int nnX,
-    int nnY,
-    bool useFP16,
-    bool useNHWC
-  );
-  ~BlockStack();
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const;
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* maskBuf,
-    float* maskSumBuf,
-    void* trunkBuf,
-    void* trunkScratchBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const;
-
-};
-
-//------------------------------------------------------------------------------
-
-struct NestedBottleneckResidualBlock {
-  const string name;
-  const NormActConv normActConv1;
-  const BlockStack blocks;
-  const NormActConv normActConv2;
-
-  NestedBottleneckResidualBlock() = delete;
-  NestedBottleneckResidualBlock(const NestedBottleneckResidualBlock&) = delete;
-  NestedBottleneckResidualBlock& operator=(const NestedBottleneckResidualBlock&) = delete;
-
-  NestedBottleneckResidualBlock(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const NestedBottleneckResidualBlockDesc* desc,
-    int nnX,
-    int nnY,
-    bool useFP16,
-    bool useNHWC
-  ): name(desc->name),
-     normActConv1(cudaHandles,manager,&desc->preBN,&desc->preActivation,&desc->preConv,nnX,nnY,useFP16,useNHWC),
-     blocks(cudaHandles,manager,desc->numBlocks,desc->preConv.outChannels,desc->blocks,nnX,nnY,useFP16,useNHWC),
-     normActConv2(cudaHandles,manager,&desc->postBN,&desc->postActivation,&desc->postConv,nnX,nnY,useFP16,useNHWC)
-  {
-  }
-
-  ~NestedBottleneckResidualBlock()
-  {}
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t bytes = 0;
-    size_t b;
-    b = normActConv1.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = blocks.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = normActConv2.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    return bytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* trunkBuf,
-    void* trunkScratchBuf,
-    void* maskBuf,
-    float* maskSumBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    SizedBuf<void*> mid(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
-    SizedBuf<void*> midScratch(scratch->allocator, scratch->getBufSizeXY(normActConv1.outChannels));
-    assert(normActConv1.outChannels == normActConv2.inChannels);
-    normActConv1.apply(cudaHandles,batchSize,false,trunkBuf,trunkScratchBuf,mid.buf,maskBuf,workspaceBuf,workspaceBytes);
-    blocks.apply(
-      cudaHandles,
-      scratch,
-      batchSize,
-      maskBuf,
-      maskSumBuf,
-      mid.buf,
-      midScratch.buf,
-      workspaceBuf,
-      workspaceBytes
-    );
-    normActConv2.apply(cudaHandles,batchSize,true,mid.buf,midScratch.buf,trunkBuf,maskBuf,workspaceBuf,workspaceBytes);
-  }
-
-};
-
-//------------------------------------------------------------------------------
-
-BlockStack::BlockStack(
-  CudaHandles* cudaHandles,
-  CudnnManager* manager,
-  int nBlocks,
-  int trunkChannels,
-  const std::vector<std::pair<int, unique_ptr_void>>& descBlocks,
-  int nnX,
-  int nnY,
-  bool useFP16,
-  bool useNHWC
-) :
-  numBlocks(nBlocks),
-  trunkNumChannels(trunkChannels),
-  nnXLen(nnX),
-  nnYLen(nnY),
-  usingFP16(useFP16),
-  usingNHWC(useNHWC)
-{
-  assert(numBlocks == descBlocks.size());
-  for(int i = 0; i<numBlocks; i++) {
-    if(descBlocks[i].first == ORDINARY_BLOCK_KIND) {
-      ResidualBlockDesc* blockDesc = (ResidualBlockDesc*)descBlocks[i].second.get();
-      unique_ptr_void blockPtr = make_unique_void(
-        new ResidualBlock(
-          cudaHandles,
-          manager,
-          blockDesc,
-          nnXLen,
-          nnYLen,
-          useFP16,
-          useNHWC
-        )
-      );
-      blocks.push_back(make_pair(ORDINARY_BLOCK_KIND,std::move(blockPtr)));
-    }
-    else if(descBlocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
-      GlobalPoolingResidualBlockDesc* blockDesc = (GlobalPoolingResidualBlockDesc*)descBlocks[i].second.get();
-      unique_ptr_void blockPtr = make_unique_void(
-        new GlobalPoolingResidualBlock(
-          cudaHandles,
-          manager,
-          blockDesc,
-          nnXLen,
-          nnYLen,
-          useFP16,
-          useNHWC
-        )
-      );
-      blocks.push_back(make_pair(GLOBAL_POOLING_BLOCK_KIND,std::move(blockPtr)));
-    }
-    else if(descBlocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
-      NestedBottleneckResidualBlockDesc* blockDesc = (NestedBottleneckResidualBlockDesc*)descBlocks[i].second.get();
-      unique_ptr_void blockPtr = make_unique_void(
-        new NestedBottleneckResidualBlock(
-          cudaHandles,
-          manager,
-          blockDesc,
-          nnXLen,
-          nnYLen,
-          useFP16,
-          useNHWC
-        )
-      );
-      blocks.push_back(make_pair(NESTED_BOTTLENECK_BLOCK_KIND,std::move(blockPtr)));
-    }
-    else {
-      ASSERT_UNREACHABLE;
-    }
-  }
-}
-BlockStack::~BlockStack() {
-}
-
-size_t BlockStack::requiredWorkspaceBytes(
-  CudaHandles* cudaHandles,
-  int batchSize
-) const {
-  size_t bytes = 0;
-  size_t b;
-
-  for(int i = 0; i<blocks.size(); i++) {
-    if(blocks[i].first == ORDINARY_BLOCK_KIND) {
-      ResidualBlock* block = (ResidualBlock*)blocks[i].second.get();
-      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
-      bytes = std::max(bytes,b);
-    }
-    else if(blocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
-      GlobalPoolingResidualBlock* block = (GlobalPoolingResidualBlock*)blocks[i].second.get();
-      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
-      bytes = std::max(bytes,b);
-    }
-    else if(blocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
-      NestedBottleneckResidualBlock* block = (NestedBottleneckResidualBlock*)blocks[i].second.get();
-      b = block->requiredWorkspaceBytes(cudaHandles,batchSize);
-      bytes = std::max(bytes,b);
-    }
-    else {
-      ASSERT_UNREACHABLE;
-    }
-  }
-  return bytes;
-}
-
-void BlockStack::apply(
-  CudaHandles* cudaHandles,
-  ScratchBuffers* scratch,
-  int batchSize,
-  void* maskBuf,
-  float* maskSumBuf,
-  void* trunkBuf,
-  void* trunkScratchBuf,
-  void* workspaceBuf,
-  size_t workspaceBytes
-) const {
-
-  for(int i = 0; i<blocks.size(); i++) {
-#ifdef DEBUG_INTERMEDIATE_VALUES
-    CudaUtils::debugPrint4D(string("Blockstack before block " + Global::intToString(i)), trunkBuf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
-#endif
-
-    if(blocks[i].first == ORDINARY_BLOCK_KIND) {
-      ResidualBlock* block = (ResidualBlock*)blocks[i].second.get();
-      block->apply(
-        cudaHandles,
-        scratch,
-        batchSize,
-        trunkBuf,
-        trunkScratchBuf,
-        maskBuf,
-        workspaceBuf,
-        workspaceBytes
-      );
-    }
-    else if(blocks[i].first == GLOBAL_POOLING_BLOCK_KIND) {
-      GlobalPoolingResidualBlock* block = (GlobalPoolingResidualBlock*)blocks[i].second.get();
-      block->apply(
-        cudaHandles,
-        scratch,
-        batchSize,
-        trunkBuf,
-        trunkScratchBuf,
-        maskBuf,
-        maskSumBuf,
-        workspaceBuf,
-        workspaceBytes
-      );
-    }
-    else if(blocks[i].first == NESTED_BOTTLENECK_BLOCK_KIND) {
-      NestedBottleneckResidualBlock* block = (NestedBottleneckResidualBlock*)blocks[i].second.get();
-      block->apply(
-        cudaHandles,
-        scratch,
-        batchSize,
-        trunkBuf,
-        trunkScratchBuf,
-        maskBuf,
-        maskSumBuf,
-        workspaceBuf,
-        workspaceBytes
-      );
-    }
-    else {
-      ASSERT_UNREACHABLE;
-    }
-  }
-}
-//------------------------------------------------------------------------------
-
-struct SGFMetadataEncoder {
-  const string name;
-
-  const bool usingFP16;
-
-  const MatMulLayer mul1;
-  const MatBiasLayer bias1;
-  const MatMulLayer mul2;
-  const MatBiasLayer bias2;
-  const MatMulLayer mul3;
-
-  SGFMetadataEncoder() = delete;
-  SGFMetadataEncoder(const SGFMetadataEncoder&) = delete;
-  SGFMetadataEncoder& operator=(const SGFMetadataEncoder&) = delete;
-
-  SGFMetadataEncoder(
-    CudaHandles* cudaHandles,
-    const SGFMetadataEncoderDesc* desc,
-    bool useFP16
-  ) :
-    name(desc->name),
-    usingFP16(useFP16),
-    mul1(cudaHandles,&desc->mul1,useFP16),
-    bias1(cudaHandles,&desc->bias1,useFP16,desc->act1.activation),
-    mul2(cudaHandles,&desc->mul2,useFP16),
-    bias2(cudaHandles,&desc->bias2,useFP16,desc->act2.activation),
-    mul3(cudaHandles,&desc->mul3,useFP16)
-  {
-  }
-
-  ~SGFMetadataEncoder()
-  {
-  }
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    (void)batchSize;
-    size_t bytes = 0;
-    size_t b;
-
-    b = mul1.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = mul2.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = mul3.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-
-    return bytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* inputBuf,
-    void* outputBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    SizedBuf<void*> internalBuf1(scratch->allocator, scratch->getBufSizeFloat(std::max(mul1.outChannels,mul2.outChannels)));
-    SizedBuf<void*> internalBuf2(scratch->allocator, scratch->getBufSizeFloat(std::max(mul1.outChannels,mul2.outChannels)));
-
-    mul1.apply(cudaHandles,scratch,batchSize,inputBuf,internalBuf1.buf,workspaceBuf,workspaceBytes);
-    bias1.apply(cudaHandles,batchSize,internalBuf1.buf);
-    mul2.apply(cudaHandles,scratch,batchSize,internalBuf1.buf,internalBuf2.buf,workspaceBuf,workspaceBytes);
-    bias2.apply(cudaHandles,batchSize,internalBuf2.buf);
-    mul3.apply(cudaHandles,scratch,batchSize,internalBuf2.buf,outputBuf,workspaceBuf,workspaceBytes);
-  }
-
-};
-
-
-//----------------------------------------------------------------------------
-
-struct Trunk {
-  const string name;
-  const int modelVersion;
-  const int numBlocks;
-  const int trunkNumChannels;
-
-  const int nnXLen;
-  const int nnYLen;
-  const bool usingFP16;
-  const bool usingNHWC;
-
-  std::unique_ptr<ConvLayer> initialConv;
-  std::unique_ptr<MatMulLayer> initialMatMul;
-  std::unique_ptr<SGFMetadataEncoder> sgfMetadataEncoder;
-  const BlockStack blocks;
-  std::unique_ptr<BatchNormLayer> trunkTipBN;
-
-  Trunk() = delete;
-  Trunk(const Trunk&) = delete;
-  Trunk& operator=(const Trunk&) = delete;
-
-  Trunk(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const TrunkDesc* desc,
-    int nnX,
-    int nnY,
-    bool inputsUseNHWC,
-    bool useFP16,
-    bool useNHWC
-  ) :
-    name(desc->name),
-    modelVersion(desc->modelVersion),
-    numBlocks(desc->numBlocks),
-    trunkNumChannels(desc->trunkNumChannels),
-    nnXLen(nnX),
-    nnYLen(nnY),
-    usingFP16(useFP16),
-    usingNHWC(useNHWC),
-    blocks(cudaHandles,manager,desc->numBlocks,desc->trunkNumChannels,desc->blocks,nnX,nnY,useFP16,useNHWC)
-  {
-    int midNumChannels = desc->midNumChannels;
-    int regularNumChannels = desc->regularNumChannels;
-    int gpoolNumChannels = desc->gpoolNumChannels;
-
-    int maxBatchSize = manager->maxBatchSize;
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,trunkNumChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,midNumChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,regularNumChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,gpoolNumChannels);
-
-    initialConv = std::make_unique<ConvLayer>(cudaHandles,manager,&desc->initialConv,useFP16,inputsUseNHWC,useNHWC);
-    initialMatMul = std::make_unique<MatMulLayer>(cudaHandles,&desc->initialMatMul,useFP16);
-    if(desc->metaEncoderVersion > 0) {
-      sgfMetadataEncoder = std::make_unique<SGFMetadataEncoder>(cudaHandles,&desc->sgfMetadataEncoder,useFP16);
-      testAssert(sgfMetadataEncoder->mul3.outChannels == initialMatMul->outChannels);
-    }
-
-    trunkTipBN = std::make_unique<BatchNormLayer>(cudaHandles,&desc->trunkTipBN,&desc->trunkTipActivation,nnXLen,nnYLen,useFP16,useNHWC);
-    assert(desc->blocks.size() == numBlocks);
-  }
-
-  ~Trunk()
-  {
-  }
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t bytes = 0;
-    size_t b;
-
-    b = initialConv->requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-
-    b = initialMatMul->requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-
-    if(sgfMetadataEncoder != nullptr) {
-      b = sgfMetadataEncoder->requiredWorkspaceBytes(cudaHandles,batchSize);
-      bytes = std::max(bytes,b);
-    }
-
-    b = blocks.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    return bytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* inputBuf,
-    void* inputGlobalBuf,
-    void* inputMetaBuf,
-    void* maskBuf,
-    float* maskSumBuf,
-    void* trunkBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-
-    SizedBuf<void*> trunkScratch(scratch->allocator, scratch->getBufSizeXY(trunkNumChannels));
-
-    //Feed the conv into trunkScratch.buf, not trunkBuf
-    initialConv->apply(cudaHandles,batchSize,false,inputBuf,trunkScratch.buf,workspaceBuf,workspaceBytes);
-
-    #ifdef DEBUG_INTERMEDIATE_VALUES
-    CudaUtils::debugPrint4D(string("After initial conv"), trunkScratch.buf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
-    #endif
-
-    //Feed the matmul into trunkBuf
-    initialMatMul->apply(cudaHandles,scratch,batchSize,inputGlobalBuf,trunkBuf,workspaceBuf,workspaceBytes);
-    //Then accumulate it into trunkScratch.buf, broadcasting during the process
-    if(!usingFP16) {
-      if(!usingNHWC)
-        customCudaAddNCBiasInplaceNCHW((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
-      else
-        customCudaAddNCBiasInplaceNHWC((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
-    }
-    else {
-      if(!usingNHWC)
-        customCudaAddNCBiasInplaceNCHW((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
-      else
-        customCudaAddNCBiasInplaceNHWC((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
-    }
-    CUDA_ERR(name.c_str(),hipPeekAtLastError());
-
-    if(sgfMetadataEncoder != nullptr) {
-      testAssert(inputMetaBuf != NULL);
-      //Feed the result into trunkBuf
-      sgfMetadataEncoder->apply(cudaHandles,scratch,batchSize,inputMetaBuf,trunkBuf,workspaceBuf,workspaceBytes);
-      //Then accumulate it into trunkScratch.buf, broadcasting during the process
-      if(!usingFP16) {
-        if(!usingNHWC)
-          customCudaAddNCBiasInplaceNCHW((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
-        else
-          customCudaAddNCBiasInplaceNHWC((float*)trunkScratch.buf,(const float*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
-      }
-      else {
-        if(!usingNHWC)
-          customCudaAddNCBiasInplaceNCHW((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,trunkNumChannels,nnXLen*nnYLen);
-        else
-          customCudaAddNCBiasInplaceNHWC((half*)trunkScratch.buf,(const half*)trunkBuf,batchSize,nnXLen*nnYLen,trunkNumChannels);
-      }
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-    }
-    else {
-      testAssert(inputMetaBuf == NULL);
-    }
-
-    //Flip trunkBuf and trunkScratch.buf so that the result gets accumulated in trunkScratch.buf
-    blocks.apply(
-      cudaHandles,
-      scratch,
-      batchSize,
-      maskBuf,
-      maskSumBuf,
-      trunkScratch.buf,
-      trunkBuf,
-      workspaceBuf,
-      workspaceBytes
-    );
-
-    //And now with the final BN port it from trunkScratch.buf to trunkBuf.
-    trunkTipBN->apply(cudaHandles,batchSize,trunkScratch.buf,maskBuf,trunkBuf);
-
-    #ifdef DEBUG_INTERMEDIATE_VALUES
-    CudaUtils::debugPrint4D(string("Trunk tip"), trunkBuf, batchSize, trunkNumChannels, nnXLen, nnYLen, usingNHWC, usingFP16);
-    #endif
-  }
-
-};
-
-//------------------------------------------------------------------------------
-
-static void fillMaskFloatBufAndMaskSumBuf(void* maskBuf, float*& maskFloatBuf, float*& maskSumBuf, bool usingFP16, int batchSize, int nnXLen, int nnYLen) {
-  if(!usingFP16) {
-    maskFloatBuf = (float*)maskBuf;
-    customCudaPoolRowsSumNCHW((const float*)maskFloatBuf,maskSumBuf,batchSize,1,nnXLen*nnYLen,1.0);
-    CUDA_ERR("sumMask",hipPeekAtLastError());
-  }
-  else {
-    customCudaCopyFromHalf((const half*)maskBuf,maskFloatBuf,batchSize*nnXLen*nnYLen);
-    CUDA_ERR("copyMaskFromHalf",hipPeekAtLastError());
-    customCudaPoolRowsSumNCHW((const float*)maskFloatBuf,maskSumBuf,batchSize,1,nnXLen*nnYLen,1.0);
-    CUDA_ERR("sumMask",hipPeekAtLastError());
-  }
-}
-
-
-//------------------------------------------------------------------------------
-
-struct PolicyHead {
-  const string name;
-  const int modelVersion;
-  const int nnXLen;
-  const int nnYLen;
-  const int p1Channels;
-  const int g1Channels;
-  const int p2Channels;
-  const bool usingFP16;
-  const bool usingNHWC;
-
-  const ConvLayer p1Conv;
-  const ConvLayer g1Conv;
-  const BatchNormLayer g1BN;
-  const MatMulLayer gpoolToBiasMul;
-  const BatchNormLayer p1BN;
-  const ConvLayer p2Conv;
-  const MatMulLayer gpoolToPassMul;
-  const MatBiasLayer gpoolToPassBias;
-  const MatMulLayer gpoolToPassMul2;
-
-  PolicyHead() = delete;
-  PolicyHead(const PolicyHead&) = delete;
-  PolicyHead& operator=(const PolicyHead&) = delete;
-
-  PolicyHead(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const PolicyHeadDesc* desc,
-    int nnX,
-    int nnY,
-    bool useFP16,
-    bool useNHWC
-  ) :
-    name(desc->name),
-    modelVersion(desc->modelVersion),
-    nnXLen(nnX),
-    nnYLen(nnY),
-    p1Channels(desc->p1Conv.outChannels),
-    g1Channels(desc->g1Conv.outChannels),
-    p2Channels(desc->p2Conv.outChannels),
-    usingFP16(useFP16),
-    usingNHWC(useNHWC),
-    p1Conv(cudaHandles,manager,&desc->p1Conv,useFP16,useNHWC),
-    g1Conv(cudaHandles,manager,&desc->g1Conv,useFP16,useNHWC),
-    g1BN(cudaHandles,&desc->g1BN,&desc->g1Activation,nnX,nnY,useFP16,useNHWC),
-    gpoolToBiasMul(cudaHandles,&desc->gpoolToBiasMul,false),
-    p1BN(cudaHandles,&desc->p1BN,&desc->p1Activation,nnX,nnY,false,useNHWC),
-    p2Conv(cudaHandles,manager,&desc->p2Conv,false,useNHWC),
-    gpoolToPassMul(cudaHandles,&desc->gpoolToPassMul,false),
-    gpoolToPassBias(cudaHandles,&desc->gpoolToPassBias,false,desc->passActivation.activation),
-    gpoolToPassMul2(cudaHandles,&desc->gpoolToPassMul2,false)
-  {
-  }
-
-  ~PolicyHead()
-  {
-  }
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t bytes = 0;
-    size_t b;
-
-    b = p1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = g1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = gpoolToBiasMul.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = p2Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = gpoolToPassMul.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = gpoolToPassMul2.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = sizeof(float)*batchSize*g1Channels*nnXLen*nnYLen;
-    bytes = std::max(bytes,b);
-
-    return bytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* maskBuf,
-    float* maskFloatBuf,
-    float* maskSumBuf,
-    void* trunkBuf,
-    float* policyPassBuf,
-    float* policyBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-
-    SizedBuf<void*> p1Out(scratch->allocator, scratch->getBufSizeXYFloat(p1Channels)); //Need to hold floats, not just halfs
-    SizedBuf<void*> p1Out2(scratch->allocator, scratch->getBufSizeXYFloat(p1Channels)); //Need to hold floats, not just halfs
-    SizedBuf<void*> g1Out(scratch->allocator, scratch->getBufSizeXY(g1Channels));
-    SizedBuf<void*> g1Out2(scratch->allocator, scratch->getBufSizeXY(g1Channels));
-    SizedBuf<void*> g1Concat(scratch->allocator, scratch->getBufSizeFloat(g1Channels*3));
-    SizedBuf<void*> g1Bias(scratch->allocator, scratch->getBufSizeFloat(p1Channels));
-    SizedBuf<void*> p1Pass(scratch->allocator, scratch->getBufSizeFloat(p1Channels));
-
-    p1Conv.apply(cudaHandles,batchSize,false,trunkBuf,p1Out.buf,workspaceBuf,workspaceBytes);
-    g1Conv.apply(cudaHandles,batchSize,false,trunkBuf,g1Out.buf,workspaceBuf,workspaceBytes);
-    g1BN.apply(cudaHandles,batchSize,g1Out.buf,maskBuf,g1Out2.buf);
-
-    if(!usingFP16) {
-      if(!usingNHWC)
-        customCudaPoolRowsGPoolNCHW((const float*)g1Out2.buf,(float*)g1Concat.buf,batchSize,g1Channels,nnXLen*nnYLen,maskFloatBuf,maskSumBuf);
-      else
-        customCudaPoolRowsGPoolNHWC((const float*)g1Out2.buf,(float*)g1Concat.buf,batchSize,nnXLen*nnYLen,g1Channels,maskFloatBuf,maskSumBuf);
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-    }
-    else {
-      customCudaCopyFromHalf((const half*)g1Out2.buf,(float*)workspaceBuf,batchSize*g1Channels*nnXLen*nnYLen);
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-      if(!usingNHWC)
-        customCudaPoolRowsGPoolNCHW((const float*)workspaceBuf,(float*)g1Concat.buf,batchSize,g1Channels,nnXLen*nnYLen,maskFloatBuf,maskSumBuf);
-      else
-        customCudaPoolRowsGPoolNHWC((const float*)workspaceBuf,(float*)g1Concat.buf,batchSize,nnXLen*nnYLen,g1Channels,maskFloatBuf,maskSumBuf);
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-    }
-
-    gpoolToBiasMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,g1Bias.buf,workspaceBuf,workspaceBytes);
-
-    #ifdef DEBUG_INTERMEDIATE_VALUES
-    CudaUtils::debugPrint4D(string("p1 pre-gpool-sum"), p1Out.buf, batchSize, p1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
-    CudaUtils::debugPrint4D(string("g1 pre-gpool"), g1Out.buf, batchSize, g1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
-    CudaUtils::debugPrint2D(string("g1 pooled"), g1Concat.buf, batchSize, g1Channels*3, false);
-    CudaUtils::debugPrint2D(string("g1 biases"), g1Bias.buf, batchSize, p1Channels, false);
-    #endif
-
-    float* p1OutBufA;
-    float* p1OutBufB;
-    if(!usingFP16) {
-      p1OutBufA = (float*)p1Out.buf;
-      p1OutBufB = (float*)p1Out2.buf;
-    }
-    else {
-      customCudaCopyFromHalf((const half*)p1Out.buf,(float*)p1Out2.buf,batchSize*p1Channels*nnXLen*nnYLen);
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-      p1OutBufA = (float*)p1Out2.buf;
-      p1OutBufB = (float*)p1Out.buf;
-    }
-
-    if(!usingNHWC)
-      customCudaAddNCBiasInplaceNCHW(p1OutBufA,(float*)g1Bias.buf,batchSize,p1Channels,nnXLen*nnYLen);
-    else
-      customCudaAddNCBiasInplaceNHWC(p1OutBufA,(float*)g1Bias.buf,batchSize,nnXLen*nnYLen,p1Channels);
-    CUDA_ERR(name.c_str(),hipPeekAtLastError());
-
-    p1BN.apply(cudaHandles,batchSize,p1OutBufA,maskFloatBuf,p1OutBufB);
-    p2Conv.apply(cudaHandles,batchSize,false,p1OutBufB,(float*)policyBuf,workspaceBuf,workspaceBytes);
-
-    if(modelVersion >= 15) {
-      gpoolToPassMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,p1Pass.buf,workspaceBuf,workspaceBytes);
-      gpoolToPassBias.apply(cudaHandles,batchSize,p1Pass.buf);
-      gpoolToPassMul2.apply(cudaHandles,scratch,batchSize,p1Pass.buf,policyPassBuf,workspaceBuf,workspaceBytes);
-    }
-    else {
-      gpoolToPassMul.apply(cudaHandles,scratch,batchSize,g1Concat.buf,policyPassBuf,workspaceBuf,workspaceBytes);
-    }
-
-    #ifdef DEBUG_INTERMEDIATE_VALUES
-    CudaUtils::debugPrint4D(string("p1 after-gpool-sum"), p1OutBufA, batchSize, p1Channels, nnXLen, nnYLen, usingNHWC, false);
-    CudaUtils::debugPrint2D(string("policypass"), policyPassBuf, batchSize, 1, false);
-    CudaUtils::debugPrint4D(string("policy"), policyBuf, batchSize, p2Channels, nnXLen, nnYLen, usingNHWC, false);
-    #endif
-
-  }
-
-};
-
-//------------------------------------------------------------------------------
-
-struct ValueHead {
-  const string name;
-  const int modelVersion;
-  const int nnXLen;
-  const int nnYLen;
-  const int v1Channels;
-  const int v2Channels;
-  const int valueChannels;
-  const int scoreValueChannels;
-  const int ownershipChannels;
-  const bool usingFP16;
-  const bool usingNHWC;
-
-  const ConvLayer v1Conv;
-  const BatchNormLayer v1BN;
-  const MatMulLayer v2Mul;
-  const MatBiasLayer v2Bias;
-  const MatMulLayer v3Mul;
-  const MatBiasLayer v3Bias;
-  const MatMulLayer sv3Mul;
-  const MatBiasLayer sv3Bias;
-  const ConvLayer vOwnershipConv;
-
-  ValueHead() = delete;
-  ValueHead(const ValueHead&) = delete;
-  ValueHead& operator=(const ValueHead&) = delete;
-
-  ValueHead(
-    CudaHandles* cudaHandles,
-    CudnnManager* manager,
-    const ValueHeadDesc* desc,
-    int nnX,
-    int nnY,
-    bool useFP16,
-    bool useNHWC
-  ) :
-    name(desc->name),
-    modelVersion(desc->modelVersion),
-    nnXLen(nnX),
-    nnYLen(nnY),
-    v1Channels(desc->v1Conv.outChannels),
-    v2Channels(desc->v2Mul.outChannels),
-    valueChannels(desc->v3Mul.outChannels),
-    scoreValueChannels(desc->sv3Mul.outChannels),
-    ownershipChannels(desc->vOwnershipConv.outChannels),
-    usingFP16(useFP16),
-    usingNHWC(useNHWC),
-    v1Conv(cudaHandles,manager,&desc->v1Conv,useFP16,useNHWC),
-    v1BN(cudaHandles,&desc->v1BN,&desc->v1Activation,nnX,nnY,useFP16,useNHWC),
-    v2Mul(cudaHandles,&desc->v2Mul,false),
-    v2Bias(cudaHandles,&desc->v2Bias,false,desc->v2Activation.activation),
-    v3Mul(cudaHandles,&desc->v3Mul,false),
-    v3Bias(cudaHandles,&desc->v3Bias,false,ACTIVATION_IDENTITY),
-    sv3Mul(cudaHandles,&desc->sv3Mul,false),
-    sv3Bias(cudaHandles,&desc->sv3Bias,false,ACTIVATION_IDENTITY),
-    vOwnershipConv(cudaHandles,manager,&desc->vOwnershipConv,useFP16,useNHWC)
-  {
-  }
-
-  ~ValueHead()
-  {
-  }
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t bytes = 0;
-    size_t b;
-
-    b = v1Conv.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = v2Mul.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = v3Mul.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = sizeof(float)*batchSize*v1Channels*nnXLen*nnYLen;
-    bytes = std::max(bytes,b);
-
-    b = sv3Mul.requiredWorkspaceBytes(cudaHandles);
-    bytes = std::max(bytes,b);
-    b = vOwnershipConv.requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = sizeof(float)*batchSize*ownershipChannels*nnXLen*nnYLen;
-    bytes = std::max(bytes,b);
-
-    return bytes;
-  }
-
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    void* maskBuf,
-    float* maskSumBuf,
-    void* trunkBuf,
-    float* valueBuf,
-    float* scoreValueBuf,
-    void* ownershipBuf,
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    SizedBuf<void*> v1Out(scratch->allocator, scratch->getBufSizeXY(v1Channels));
-    SizedBuf<void*> v1Out2(scratch->allocator, scratch->getBufSizeXY(v1Channels));
-    SizedBuf<void*> v1Mean(scratch->allocator, scratch->getBufSizeFloat(v1Channels*3));
-    SizedBuf<void*> v2Out(scratch->allocator, scratch->getBufSizeFloat(v2Channels));
-    SizedBuf<void*> ownershipScratch(scratch->allocator, scratch->getBufSizeXYFloat(ownershipChannels));
-
-    v1Conv.apply(cudaHandles,batchSize,false,trunkBuf,v1Out.buf,workspaceBuf,workspaceBytes);
-    v1BN.apply(cudaHandles,batchSize,v1Out.buf,maskBuf,v1Out2.buf);
-
-    void* bufToBePooled = v1Out2.buf;
-    if(usingFP16) {
-      customCudaCopyFromHalf((const half*)v1Out2.buf,(float*)workspaceBuf,batchSize*v1Channels*nnXLen*nnYLen);
-      CUDA_ERR(name.c_str(),hipPeekAtLastError());
-      bufToBePooled = workspaceBuf;
-    }
-
-    if(!usingNHWC)
-      customCudaValueHeadPoolNCHW((float*)bufToBePooled,(float*)v1Mean.buf,batchSize,v1Channels,nnXLen*nnYLen,maskSumBuf);
-    else
-      customCudaValueHeadPoolNHWC((const float*)bufToBePooled,(float*)v1Mean.buf,batchSize,nnXLen*nnYLen,v1Channels,maskSumBuf);
-    CUDA_ERR(name.c_str(),hipPeekAtLastError());
-
-    v2Mul.apply(cudaHandles,scratch,batchSize,v1Mean.buf,v2Out.buf,workspaceBuf,workspaceBytes);
-    v2Bias.apply(cudaHandles,batchSize,v2Out.buf);
-    v3Mul.apply(cudaHandles,scratch,batchSize,v2Out.buf,valueBuf,workspaceBuf,workspaceBytes);
-    v3Bias.apply(cudaHandles,batchSize,valueBuf);
-
-    sv3Mul.apply(cudaHandles,scratch,batchSize,v2Out.buf,scoreValueBuf,workspaceBuf,workspaceBytes);
-    sv3Bias.apply(cudaHandles,batchSize,scoreValueBuf);
-
-    #ifdef DEBUG_INTERMEDIATE_VALUES
-    CudaUtils::debugPrint4D(string("v1"), v1Out.buf, batchSize, v1Channels, nnXLen, nnYLen, usingNHWC, usingFP16);
-    CudaUtils::debugPrint2D(string("v1 pooled"), v1Mean.buf, batchSize, v1Channels, false);
-    CudaUtils::debugPrint2D(string("v2"), v2Out.buf, batchSize, v1Channels, false);
-    #endif
-
-    if(!usingFP16) {
-      vOwnershipConv.apply(cudaHandles,batchSize,false,v1Out2.buf,ownershipBuf,workspaceBuf,workspaceBytes);
-    }
-    else {
-      vOwnershipConv.apply(cudaHandles,batchSize,false,v1Out2.buf,ownershipScratch.buf,workspaceBuf,workspaceBytes);
-      customCudaCopyFromHalf((const half*)ownershipScratch.buf,(float*)ownershipBuf,batchSize*ownershipChannels*nnXLen*nnYLen);
-      CUDA_ERR("vOwnership copy",hipPeekAtLastError());
-    }
-
-  }
-
-};
-
-//------------------------------------------------------------------------------
-
-struct Model {
-  const string name;
-  const int modelVersion;
-  const int maxBatchSize;
-  const int nnXLen;
-  const int nnYLen;
-  const int numInputChannels;
-  const int numInputGlobalChannels;
-  const int numInputMetaChannels;
-  const int numPolicyChannels;
-  const int numValueChannels;
-  const int numScoreValueChannels;
-  const int numOwnershipChannels;
-  const bool usingFP16;
-  const bool usingNHWC;
-  const bool inputsUsingNHWC;
-
-  std::unique_ptr<Trunk> trunk;
-  std::unique_ptr<PolicyHead> policyHead;
-  std::unique_ptr<ValueHead> valueHead;
-  std::unique_ptr<CudnnManager> manager;
-
-  Model() = delete;
-  Model(const Model&) = delete;
-  Model& operator=(const Model&) = delete;
-
-  Model(
-    CudaHandles* cudaHandles,
-    const ModelDesc* desc,
-    int maxBatchSz,
-    int nnX,
-    int nnY,
-    bool inputsUseNHWC,
-    bool useFP16,
-    bool useNHWC
-  ) :
-    name(desc->name),
-    modelVersion(desc->modelVersion),
-    maxBatchSize(maxBatchSz),
-    nnXLen(nnX),
-    nnYLen(nnY),
-    numInputChannels(desc->numInputChannels),
-    numInputGlobalChannels(desc->numInputGlobalChannels),
-    numInputMetaChannels(desc->numInputMetaChannels),
-    numPolicyChannels(desc->numPolicyChannels),
-    numValueChannels(desc->numValueChannels),
-    numScoreValueChannels(desc->numScoreValueChannels),
-    numOwnershipChannels(desc->numOwnershipChannels),
-    usingFP16(useFP16),
-    usingNHWC(useNHWC),
-    inputsUsingNHWC(inputsUseNHWC)
-  {
-    if(nnXLen > NNPos::MAX_BOARD_LEN)
-      throw StringError(Global::strprintf("nnXLen (%d) is greater than NNPos::MAX_BOARD_LEN (%d)",
-        nnXLen, NNPos::MAX_BOARD_LEN
-      ));
-    if(nnYLen > NNPos::MAX_BOARD_LEN)
-      throw StringError(Global::strprintf("nnYLen (%d) is greater than NNPos::MAX_BOARD_LEN (%d)",
-        nnYLen, NNPos::MAX_BOARD_LEN
-      ));
-
-    int numFeatures = NNModelVersion::getNumSpatialFeatures(modelVersion);
-    if(numInputChannels != numFeatures)
-      throw StringError(Global::strprintf("Neural net numInputChannels (%d) was not the expected number based on version (%d)",
-        numInputChannels, numFeatures
-      ));
-    int numGlobalFeatures = NNModelVersion::getNumGlobalFeatures(modelVersion);
-    if(numInputGlobalChannels != numGlobalFeatures)
-      throw StringError(Global::strprintf("Neural net numInputGlobalChannels (%d) was not the expected number based on version (%d)",
-        numInputGlobalChannels, numGlobalFeatures
-      ));
-    if(numInputMetaChannels > 0) {
-      if(numInputMetaChannels != SGFMetadata::METADATA_INPUT_NUM_CHANNELS)
-        throw StringError(Global::strprintf("Neural net numInputMetaChannels (%d) was not the expected number (%d)",
-          numInputMetaChannels, SGFMetadata::METADATA_INPUT_NUM_CHANNELS
-        ));
-    }
-
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputGlobalChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numInputMetaChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numPolicyChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numValueChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numScoreValueChannels);
-    CudaUtils::checkBufferSize(maxBatchSize,nnXLen,nnYLen,numOwnershipChannels);
-
-    manager = std::make_unique<CudnnManager>(name, maxBatchSize, nnXLen, nnYLen);
-    trunk = std::make_unique<Trunk>(cudaHandles,manager.get(),&desc->trunk,nnXLen,nnYLen,inputsUseNHWC,useFP16,useNHWC);
-    policyHead = std::make_unique<PolicyHead>(cudaHandles,manager.get(),&desc->policyHead,nnXLen,nnYLen,useFP16,useNHWC);
-    valueHead = std::make_unique<ValueHead>(cudaHandles,manager.get(),&desc->valueHead,nnXLen,nnYLen,useFP16,useNHWC);
-  }
-
-  ~Model()
-  {
-  }
-
-  size_t requiredWorkspaceBytes(
-    CudaHandles* cudaHandles,
-    int batchSize
-  ) const {
-    size_t bytes = 0;
-    size_t b;
-
-    b = trunk->requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = policyHead->requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-    b = valueHead->requiredWorkspaceBytes(cudaHandles,batchSize);
-    bytes = std::max(bytes,b);
-
-    return bytes;
-  }
-
-  void apply(
-    CudaHandles* cudaHandles,
-    ScratchBuffers* scratch,
-    int batchSize,
-    bool requireExactNNLen,
-
-    void* inputBuf,
-    void* inputGlobalBuf,
-    void* inputMetaBuf,
-
-    float* policyPassBuf,
-    float* policyBuf,
-
-    float* valueBuf,
-    float* scoreValueBuf,
-    void* ownershipBuf,
-
-    void* workspaceBuf,
-    size_t workspaceBytes
-  ) const {
-    SizedBuf<void*> mask(scratch->allocator, scratch->getBufSizeXY(1));
-    SizedBuf<void*> maskFloat(scratch->allocator, scratch->getBufSizeXYFloat(1));
-    SizedBuf<void*> maskSum(scratch->allocator, scratch->getBufSizeFloat(1));
-
-    void* maskBuf = mask.buf;
-    float* maskFloatBuf = (float*)maskFloat.buf;
-    float* maskSumBuf = (float*)maskSum.buf;
-
-    if(!usingFP16) {
-      if(inputsUsingNHWC)
-        customCudaChannel0ExtractNHWC((const float*)inputBuf, (float*)maskBuf, batchSize, nnXLen*nnYLen, numInputChannels);
-      else
-        customCudaChannel0ExtractNCHW((const float*)inputBuf, (float*)maskBuf, batchSize, numInputChannels, nnXLen*nnYLen);
-      CUDA_ERR("modelExtractMask",hipPeekAtLastError());
-    }
-    else {
-      if(inputsUsingNHWC)
-        customCudaChannel0ExtractNHWC((const half*)inputBuf, (half*)maskBuf, batchSize, nnXLen*nnYLen, numInputChannels);
-      else
-        customCudaChannel0ExtractNCHW((const half*)inputBuf, (half*)maskBuf, batchSize, numInputChannels, nnXLen*nnYLen);
-      CUDA_ERR("modelExtractMask",hipPeekAtLastError());
-    }
-
-    fillMaskFloatBufAndMaskSumBuf(maskBuf,maskFloatBuf,maskSumBuf,usingFP16,batchSize,nnXLen,nnYLen);
-
-    //Don't do any masking if we know the board is exactly the desired size
-    if(requireExactNNLen) {
-      //Set to NULL to signal downstream that this buf doesn't need to be used
-      maskBuf = NULL;
-      maskFloatBuf = NULL;
-      //The global pooling structures need this no matter what, for normalizing based on this and its sqrt.
-      //maskSumBuf = NULL;
-    }
-
-    #ifdef DEBUG_INTERMEDIATE_VALUES
-    CudaUtils::debugPrint4D(string("Initial bin features"), inputBuf, batchSize, trunk->initialConv->inChannels, nnXLen, nnYLen, inputsUsingNHWC, usingFP16);
-    CudaUtils::debugPrint2D(string("Initial global features"), inputGlobalBuf, batchSize, trunk->initialMatMul->inChannels, usingFP16);
-    if(trunk->sgfMetadataEncoder != nullptr) {
-      assert(inputMetaBuf != NULL);
-      CudaUtils::debugPrint2D(string("Initial meta features"), inputMetaBuf, batchSize, trunk->sgfMetadataEncoder->mul1.inChannels, usingFP16);
-    }
-    #endif
-
-    SizedBuf<void*> trunkBuf(scratch->allocator, scratch->getBufSizeXY(trunk->trunkNumChannels));
-
-    trunk->apply(
-      cudaHandles,
-      scratch,
-      batchSize,
-      inputBuf,
-      inputGlobalBuf,
-      inputMetaBuf,
-      maskBuf,
-      maskSumBuf,
-      trunkBuf.buf,
-      workspaceBuf,
-      workspaceBytes
-    );
-    policyHead->apply(
-      cudaHandles,
-      scratch,
-      batchSize,
-      maskBuf,
-      maskFloatBuf,
-      maskSumBuf,
-      trunkBuf.buf,
-      policyPassBuf,
-      policyBuf,
-      workspaceBuf,
-      workspaceBytes
-    );
-    valueHead->apply(
-      cudaHandles,
-      scratch,
-      batchSize,
-      maskBuf,
-      maskSumBuf,
-      trunkBuf.buf,
-      valueBuf,
-      scoreValueBuf,
-      ownershipBuf,
-      workspaceBuf,
-      workspaceBytes
-    );
-  }
-
-};
-
-
-//------------------------------------------------------------------------------
-
-struct LoadedModel {
-  ModelDesc modelDesc;
-
-  LoadedModel(const string& fileName, const string& expectedSha256) {
-    ModelDesc::loadFromFileMaybeGZipped(fileName,modelDesc,expectedSha256);
-    modelDesc.applyScale8ToReduceActivations();
-  }
-
-  LoadedModel() = delete;
-  LoadedModel(const LoadedModel&) = delete;
-  LoadedModel& operator=(const LoadedModel&) = delete;
-};
-
-LoadedModel* NeuralNet::loadModelFile(const string& file, const string& expectedSha256) {
-  LoadedModel* loadedModel = new LoadedModel(file,expectedSha256);
-  return loadedModel;
-}
-
-void NeuralNet::freeLoadedModel(LoadedModel* loadedModel) {
-  delete loadedModel;
-}
-
-const ModelDesc& NeuralNet::getModelDesc(const LoadedModel* loadedModel) {
-  return loadedModel->modelDesc;
-}
-
-//------------------------------------------------------------------------------
-
-struct Buffers {
-  //All of these are device pointers
-
-  float* inputBufFloat;
-  void* inputBuf;
-  float* inputGlobalBufFloat;
-  void* inputGlobalBuf;
-  float* inputMetaBufFloat;
-  void* inputMetaBuf;
-  size_t inputBufBytesFloat;
-  size_t inputBufBytes;
-  size_t inputGlobalBufBytesFloat;
-  size_t inputGlobalBufBytes;
-  size_t inputMetaBufBytesFloat;
-  size_t inputMetaBufBytes;
-
-  float* policyPassBuf;
-  size_t policyPassBufBytes;
-  float* policyBuf;
-  size_t policyBufBytes;
-
-  float* valueBuf;
-  size_t valueBufBytes;
-  float* scoreValueBuf;
-  size_t scoreValueBufBytes;
-  void* ownershipBuf;
-  size_t ownershipBufBytes;
-
-  void* workspaceBuf;
-  size_t workspaceBytes;
-
-  Buffers() = delete;
-  Buffers(const Buffers&) = delete;
-  Buffers& operator=(const Buffers&) = delete;
-
-  Buffers(CudaHandles* cudaHandles, const Model& m, const ScratchBuffers& scratch) {
-    size_t batchXYFloatBytes = (size_t)scratch.batchXYFloatBytes;
-    size_t batchFloatBytes = (size_t)scratch.batchFloatBytes;
-    size_t batchXYBytes = (size_t)scratch.batchXYBytes;
-    size_t batchBytes = (size_t)scratch.batchBytes;
-
-    inputBufBytesFloat = m.numInputChannels * batchXYFloatBytes;
-    inputBufBytes = m.numInputChannels * batchXYBytes;
-    inputGlobalBufBytesFloat = m.numInputGlobalChannels * batchFloatBytes;
-    inputGlobalBufBytes = m.numInputGlobalChannels * batchBytes;
-    inputMetaBufBytesFloat = m.numInputMetaChannels * batchFloatBytes;
-    inputMetaBufBytes = m.numInputMetaChannels * batchBytes;
-
-    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputBufFloat), inputBufBytesFloat));
-    CUDA_ERR("Buffers",hipMalloc(&inputBuf, inputBufBytes));
-    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputGlobalBufFloat), inputGlobalBufBytesFloat));
-    CUDA_ERR("Buffers",hipMalloc(&inputGlobalBuf, inputGlobalBufBytes));
-    if(m.numInputMetaChannels > 0) {
-      CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&inputMetaBufFloat), inputMetaBufBytesFloat));
-      CUDA_ERR("Buffers",hipMalloc(&inputMetaBuf, inputMetaBufBytes));
-    }
-    else {
-      inputMetaBufFloat = NULL;
-      inputMetaBuf = NULL;
-    }
-
-    if(m.modelVersion >= 16)
-      testAssert(m.policyHead->p2Channels == 4);
-    else if(m.modelVersion >= 12)
-      testAssert(m.policyHead->p2Channels == 2);
-    else
-      testAssert(m.policyHead->p2Channels == 1);
-
-    policyPassBufBytes = m.policyHead->p2Channels * batchFloatBytes;
-    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&policyPassBuf), policyPassBufBytes));
-    policyBufBytes = m.policyHead->p2Channels * batchXYFloatBytes;
-    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&policyBuf), policyBufBytes));
-
-    valueBufBytes = m.valueHead->valueChannels * batchFloatBytes;
-    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&valueBuf), valueBufBytes));
-
-    scoreValueBufBytes = m.valueHead->scoreValueChannels * batchFloatBytes;
-    CUDA_ERR("Buffers",hipMalloc(reinterpret_cast<void**>(&scoreValueBuf), scoreValueBufBytes));
-
-    //This buf is used for both an intermdiate fp16 result in fp16 mode, and ALSO the final fp32 output, so always must be fp32-sized
-    ownershipBufBytes = m.valueHead->ownershipChannels * batchXYFloatBytes;
-    CUDA_ERR("Buffers",hipMalloc(&ownershipBuf, ownershipBufBytes));
-
-    //In theory the requiredWorkspaceBytes calls could give us values non-monotone in batch size
-    //such as if the convolution algorithm changes between batch size 1 and larger.
-    //So we call it for all the batch sizes.
-    size_t bytes = 0;
-    size_t b;
-    for(int batchSize = 1; batchSize <= m.maxBatchSize; batchSize++) {
-      b = m.requiredWorkspaceBytes(cudaHandles,batchSize);
-      bytes = std::max(bytes,b);
-    }
-
-    CUDA_ERR("Buffers",hipMalloc(&workspaceBuf, bytes));
-    workspaceBytes = bytes;
-  }
-
-  ~Buffers() {
-    hipFree(inputBufFloat);
-    hipFree(inputBuf);
-    hipFree(inputGlobalBufFloat);
-    hipFree(inputGlobalBuf);
-    if(inputMetaBufFloat != NULL)
-      hipFree(inputMetaBufFloat);
-    if(inputMetaBuf != NULL)
-      hipFree(inputMetaBuf);
-
-    hipFree(policyPassBuf);
-    hipFree(policyBuf);
-
-    hipFree(valueBuf);
-    hipFree(scoreValueBuf);
-    hipFree(ownershipBuf);
-
-    hipFree(workspaceBuf);
-  }
-
-};
-
-//------------------------------------------------------------------------------
-
-struct ComputeContext {
-  int nnXLen;
-  int nnYLen;
-  enabled_t useFP16Mode;
-  enabled_t useNHWCMode;
-};
-
-ComputeContext* NeuralNet::createComputeContext(
-  const std::vector<int>& gpuIdxs,
-  Logger* logger,
-  int nnXLen,
-  int nnYLen,
-  const string& openCLTunerFile,
-  const string& homeDataDirOverride,
-  bool openCLReTunePerBoardSize,
-  enabled_t useFP16Mode,
-  enabled_t useNHWCMode,
-  const LoadedModel* loadedModel
-) {
-  (void)gpuIdxs;
-  (void)logger;
-  (void)openCLTunerFile;
-  (void)homeDataDirOverride;
-  (void)openCLReTunePerBoardSize;
-  (void)loadedModel;
-
-  ComputeContext* context = new ComputeContext();
-  context->nnXLen = nnXLen;
-  context->nnYLen = nnYLen;
-  context->useFP16Mode = useFP16Mode;
-  context->useNHWCMode = useNHWCMode;
-  return context;
-}
-
-void NeuralNet::freeComputeContext(ComputeContext* computeContext) {
-  delete computeContext;
-}
-
-//------------------------------------------------------------------------------
-
-struct ComputeHandle {
-  std::unique_ptr<CudaHandles> cudaHandles;
-  std::unique_ptr<Model> model;
-  std::unique_ptr<ScratchBuffers> scratch;
-  std::unique_ptr<Buffers> buffers;
-  const bool usingFP16;
-  const int nnXLen;
-  const int nnYLen;
-  const bool requireExactNNLen;
-  const bool inputsUseNHWC;
-  const bool usingNHWC;
-
-  ComputeHandle(
-    const ComputeContext* context,
-    const LoadedModel* loadedModel,
-    int majorComputeCapability,
-    int minorComputeCapability,
-    int maxBatchSize,
-    bool requireExactNNLen_,
-    bool inputsUseNHWC_,
-    bool useFP16,
-    bool useNHWC
-  ) :
-    usingFP16(useFP16),
-    nnXLen(context->nnXLen),
-    nnYLen(context->nnYLen),
-    requireExactNNLen(requireExactNNLen_),
-    inputsUseNHWC(inputsUseNHWC_),
-    usingNHWC(useNHWC)
-  {
-    cudaHandles = std::make_unique<CudaHandles>(majorComputeCapability,minorComputeCapability);
-    model = std::make_unique<Model>(
-      cudaHandles.get(), &(loadedModel->modelDesc), maxBatchSize,
-      nnXLen, nnYLen, inputsUseNHWC, useFP16, useNHWC
-    );
-    scratch = std::make_unique<ScratchBuffers>(maxBatchSize, nnXLen, nnYLen, useFP16);
-    buffers = std::make_unique<Buffers>(cudaHandles.get(), *model, *scratch);
-
-    //Synchronize after creating buffers and copying all the weights, just in case
-    CUDA_ERR("ComputeHandle", hipDeviceSynchronize());
-  }
-  ~ComputeHandle() {
-  }
-
-  ComputeHandle() = delete;
-  ComputeHandle(const ComputeHandle&) = delete;
-  ComputeHandle& operator=(const ComputeHandle&) = delete;
-};
-
-ComputeHandle* NeuralNet::createComputeHandle(
-  ComputeContext* context,
-  const LoadedModel* loadedModel,
-  Logger* logger,
-  int maxBatchSize,
-  bool requireExactNNLen,
-  bool inputsUseNHWC,
-  int gpuIdxForThisThread,
-  int serverThreadIdx
-) {
-  //Use whatever CUDA believes GPU 0 to be.
-  if(gpuIdxForThisThread == -1)
-    gpuIdxForThisThread = 0;
-
-  CUDA_ERR("createComputeHandle",hipSetDevice(gpuIdxForThisThread));
-
-  hipDeviceProp_t prop;
-  hipGetDeviceProperties(&prop,gpuIdxForThisThread);
-
-  bool useFP16 = false;
-  bool useNHWC = false;
-  if(context->useFP16Mode == enabled_t::True || context->useFP16Mode == enabled_t::Auto)
-    useFP16 = true;
-
-  if(logger != NULL) {
-    logger->write(
-      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Found GPU " + string(prop.name)
-      + " memory " + Global::uint64ToString(prop.totalGlobalMem)
-      + " compute capability major " + Global::intToString(prop.major)
-      + " minor " + Global::intToString(prop.minor)
-    );
-    logger->write(
-      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Model version " + Global::intToString(loadedModel->modelDesc.modelVersion) +
-      " useFP16 = " + Global::boolToString(useFP16) +
-      " useNHWC = " + Global::boolToString(useNHWC)
-    );
-    logger->write(
-      "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Model name: " + loadedModel->modelDesc.name
-    );
-    logger->write(
-      "MIOpen finding convolution algorithms for GPU " + string(prop.name) + ". This may take a while, please wait............"
-    );
-  }
-
-  ComputeHandle* gpuHandle = new ComputeHandle(
-    context,loadedModel,prop.major,prop.minor,maxBatchSize,requireExactNNLen,inputsUseNHWC,useFP16,useNHWC
-  );
-  return gpuHandle;
-}
-
-void NeuralNet::freeComputeHandle(ComputeHandle* gpuHandle) {
-  delete gpuHandle;
-}
-
-bool NeuralNet::isUsingFP16(const ComputeHandle* handle) {
-  return handle->usingFP16;
-}
-
-//------------------------------------------------------------------------------
-
-void NeuralNet::printDevices() {
-  int numDevices = 0;
-  hipGetDeviceCount(&numDevices);
-  for(int i = 0; i<numDevices; i++) {
-    hipDeviceProp_t prop;
-    hipGetDeviceProperties(&prop, i);
-    cout << "Found ROCm device " << i << ": " << prop.name << endl;
-  }
-}
-
-
-//------------------------------------------------------------------------------
-
-struct InputBuffers {
-  int maxBatchSize;
-
-  size_t singleInputElts;
-  size_t singleInputBytes;
-  size_t singleInputGlobalElts;
-  size_t singleInputGlobalBytes;
-  size_t singleInputMetaElts;
-  size_t singleInputMetaBytes;
-  size_t singlePolicyPassResultElts;
-  size_t singlePolicyPassResultBytes;
-  size_t singlePolicyResultElts;
-  size_t singlePolicyResultBytes;
-  size_t singleValueResultElts;
-  size_t singleValueResultBytes;
-  size_t singleScoreValueResultElts;
-  size_t singleScoreValueResultBytes;
-  size_t singleOwnershipResultElts;
-  size_t singleOwnershipResultBytes;
-
-  size_t userInputBufferBytes;
-  size_t userInputGlobalBufferBytes;
-  size_t userInputMetaBufferBytes;
-  size_t policyPassResultBufferBytes;
-  size_t policyResultBufferBytes;
-  size_t valueResultBufferBytes;
-  size_t scoreValueResultBufferBytes;
-  size_t ownershipResultBufferBytes;
-
-  float* userInputBuffer; //Host pointer
-  float* userInputGlobalBuffer; //Host pointer
-  float* userInputMetaBuffer; //Host pointer
-
-  float* policyPassResults; //Host pointer
-  float* policyResults; //Host pointer
-  float* valueResults; //Host pointer
-  float* scoreValueResults; //Host pointer
-  float* ownershipResults; //Host pointer
-
-  InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen) {
-    const ModelDesc& m = loadedModel->modelDesc;
-
-    maxBatchSize = maxBatchSz;
-    singleInputElts = (size_t)m.numInputChannels * nnXLen * nnYLen;
-    singleInputBytes = (size_t)m.numInputChannels * nnXLen * nnYLen * sizeof(float);
-    singleInputGlobalElts = (size_t)m.numInputGlobalChannels;
-    singleInputGlobalBytes = (size_t)m.numInputGlobalChannels * sizeof(float);
-    singleInputMetaElts = (size_t)m.numInputMetaChannels;
-    singleInputMetaBytes = (size_t)m.numInputMetaChannels * sizeof(float);
-    singlePolicyPassResultElts = (size_t)(m.numPolicyChannels);
-    singlePolicyPassResultBytes = (size_t)(m.numPolicyChannels) * sizeof(float);
-    singlePolicyResultElts = (size_t)(m.numPolicyChannels * nnXLen * nnYLen);
-    singlePolicyResultBytes = (size_t)(m.numPolicyChannels * nnXLen * nnYLen) * sizeof(float);
-    singleValueResultElts = (size_t)m.numValueChannels;
-    singleValueResultBytes = (size_t)m.numValueChannels * sizeof(float);
-    singleScoreValueResultElts = (size_t)m.numScoreValueChannels;
-    singleScoreValueResultBytes = (size_t)m.numScoreValueChannels * sizeof(float);
-    singleOwnershipResultElts = (size_t)m.numOwnershipChannels * nnXLen * nnYLen;
-    singleOwnershipResultBytes = (size_t)m.numOwnershipChannels * nnXLen * nnYLen * sizeof(float);
-
-    assert(NNModelVersion::getNumSpatialFeatures(m.modelVersion) == m.numInputChannels);
-    assert(NNModelVersion::getNumGlobalFeatures(m.modelVersion) == m.numInputGlobalChannels);
-    if(m.numInputMetaChannels > 0) {
-      assert(SGFMetadata::METADATA_INPUT_NUM_CHANNELS == m.numInputMetaChannels);
-    }
-
-    userInputBufferBytes = (size_t)m.numInputChannels * maxBatchSize * nnXLen * nnYLen * sizeof(float);
-    userInputGlobalBufferBytes = (size_t)m.numInputGlobalChannels * maxBatchSize * sizeof(float);
-    userInputMetaBufferBytes = (size_t)m.numInputMetaChannels * maxBatchSize * sizeof(float);
-    policyPassResultBufferBytes = (size_t)maxBatchSize * m.numPolicyChannels * sizeof(float);
-    policyResultBufferBytes = (size_t)maxBatchSize * m.numPolicyChannels * nnXLen * nnYLen * sizeof(float);
-    valueResultBufferBytes = (size_t)maxBatchSize * m.numValueChannels * sizeof(float);
-    scoreValueResultBufferBytes = (size_t)maxBatchSize * m.numScoreValueChannels * sizeof(float);
-    ownershipResultBufferBytes = (size_t)maxBatchSize * nnXLen * nnYLen * m.numOwnershipChannels * sizeof(float);
-
-    userInputBuffer = new float[(size_t)m.numInputChannels * maxBatchSize * nnXLen * nnYLen];
-    userInputGlobalBuffer = new float[(size_t)m.numInputGlobalChannels * maxBatchSize];
-    if(m.numInputMetaChannels > 0)
-      userInputMetaBuffer = new float[(size_t)m.numInputMetaChannels * maxBatchSize];
-    else
-      userInputMetaBuffer = NULL;
-
-    policyPassResults = new float[(size_t)maxBatchSize * m.numPolicyChannels];
-    policyResults = new float[(size_t)maxBatchSize * m.numPolicyChannels * nnXLen * nnYLen];
-    valueResults = new float[(size_t)maxBatchSize * m.numValueChannels];
-
-    scoreValueResults = new float[(size_t)maxBatchSize * m.numScoreValueChannels];
-    ownershipResults = new float[(size_t)maxBatchSize * nnXLen * nnYLen * m.numOwnershipChannels];
-  }
-
-  ~InputBuffers() {
-    delete[] userInputBuffer;
-    delete[] userInputGlobalBuffer;
-    if(userInputMetaBuffer != NULL)
-      delete[] userInputMetaBuffer;
-    delete[] policyPassResults;
-    delete[] policyResults;
-    delete[] valueResults;
-    delete[] scoreValueResults;
-    delete[] ownershipResults;
-  }
-
-  InputBuffers() = delete;
-  InputBuffers(const InputBuffers&) = delete;
-  InputBuffers& operator=(const InputBuffers&) = delete;
-
-};
-
-InputBuffers* NeuralNet::createInputBuffers(const LoadedModel* loadedModel, int maxBatchSize, int nnXLen, int nnYLen) {
-  return new InputBuffers(loadedModel,maxBatchSize,nnXLen,nnYLen);
-}
-void NeuralNet::freeInputBuffers(InputBuffers* inputBuffers) {
-  delete inputBuffers;
-}
-
-//---------------------------------------------------------------------------------------
-
-
-void NeuralNet::getOutput(
-  ComputeHandle* gpuHandle,
-  InputBuffers* inputBuffers,
-  int numBatchEltsFilled,
-  NNResultBuf** inputBufs,
-  vector<NNOutput*>& outputs
-) {
-  assert(numBatchEltsFilled <= inputBuffers->maxBatchSize);
-  assert(numBatchEltsFilled > 0);
-  const int batchSize = numBatchEltsFilled;
-  const int nnXLen = gpuHandle->nnXLen;
-  const int nnYLen = gpuHandle->nnYLen;
-  const int modelVersion = gpuHandle->model->modelVersion;
-
-  const int numSpatialFeatures = NNModelVersion::getNumSpatialFeatures(modelVersion);
-  const int numGlobalFeatures = NNModelVersion::getNumGlobalFeatures(modelVersion);
-  const int numMetaFeatures = inputBuffers->singleInputMetaElts;
-  assert(numSpatialFeatures == gpuHandle->model->numInputChannels);
-  assert(numSpatialFeatures * nnXLen * nnYLen == inputBuffers->singleInputElts);
-  assert(numGlobalFeatures == inputBuffers->singleInputGlobalElts);
-  const int numPolicyChannels = gpuHandle->model->numPolicyChannels;
-
-  for(int nIdx = 0; nIdx<batchSize; nIdx++) {
-    float* rowSpatialInput = inputBuffers->userInputBuffer + (inputBuffers->singleInputElts * nIdx);
-    float* rowGlobalInput = inputBuffers->userInputGlobalBuffer + (inputBuffers->singleInputGlobalElts * nIdx);
-    float* rowMetaInput = inputBuffers->userInputMetaBuffer + (inputBuffers->singleInputMetaElts * nIdx);
-
-    const float* rowGlobal = inputBufs[nIdx]->rowGlobalBuf.data();
-    const float* rowSpatial = inputBufs[nIdx]->rowSpatialBuf.data();
-    const float* rowMeta = inputBufs[nIdx]->rowMetaBuf.data();
-    bool hasRowMeta = inputBufs[nIdx]->hasRowMeta;
-    std::copy(rowGlobal,rowGlobal+numGlobalFeatures,rowGlobalInput);
-    if(numMetaFeatures > 0) {
-      testAssert(rowMeta != NULL);
-      testAssert(hasRowMeta);
-      std::copy(rowMeta,rowMeta+numMetaFeatures,rowMetaInput);
-    }
-    else {
-      testAssert(!hasRowMeta);
-    }
-    SymmetryHelpers::copyInputsWithSymmetry(rowSpatial, rowSpatialInput, 1, nnYLen, nnXLen, numSpatialFeatures, gpuHandle->inputsUseNHWC, inputBufs[nIdx]->symmetry);
-  }
-
-  Buffers* buffers = gpuHandle->buffers.get();
-  ScratchBuffers* scratch = gpuHandle->scratch.get();
-
-  if(!gpuHandle->usingFP16) {
-    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytes);
-    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytes);
-    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytes);
-    assert(inputBuffers->policyPassResultBufferBytes == buffers->policyPassBufBytes);
-    assert(inputBuffers->policyResultBufferBytes == buffers->policyBufBytes);
-    assert(inputBuffers->valueResultBufferBytes == buffers->valueBufBytes);
-    assert(inputBuffers->singleInputBytes == inputBuffers->singleInputElts*4);
-    assert(inputBuffers->singleInputGlobalBytes == inputBuffers->singleInputGlobalElts*4);
-    assert(inputBuffers->singleInputMetaBytes == inputBuffers->singleInputMetaElts*4);
-    assert(inputBuffers->singlePolicyPassResultElts == numPolicyChannels);
-    assert(inputBuffers->singlePolicyPassResultBytes == numPolicyChannels * sizeof(float));
-    assert(inputBuffers->singlePolicyResultElts == numPolicyChannels*nnXLen*nnYLen);
-    assert(inputBuffers->singlePolicyResultBytes == numPolicyChannels*nnXLen*nnYLen * sizeof(float));
-    assert(inputBuffers->scoreValueResultBufferBytes == buffers->scoreValueBufBytes);
-    assert(inputBuffers->ownershipResultBufferBytes == buffers->ownershipBufBytes);
-    assert(inputBuffers->singleOwnershipResultElts == nnXLen*nnYLen);
-    assert(inputBuffers->singleOwnershipResultBytes == nnXLen*nnYLen * sizeof(float));
-
-    CUDA_ERR("getOutput",hipMemcpy(buffers->inputBuf, inputBuffers->userInputBuffer, inputBuffers->singleInputBytes*batchSize, hipMemcpyHostToDevice));
-    CUDA_ERR("getOutput",hipMemcpy(buffers->inputGlobalBuf, inputBuffers->userInputGlobalBuffer, inputBuffers->singleInputGlobalBytes*batchSize, hipMemcpyHostToDevice));
-    if(numMetaFeatures > 0) {
-      CUDA_ERR("getOutput",hipMemcpy(buffers->inputMetaBuf, inputBuffers->userInputMetaBuffer, inputBuffers->singleInputMetaBytes*batchSize, hipMemcpyHostToDevice));
-    }
-  }
-  else {
-    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytesFloat);
-    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytesFloat);
-    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytesFloat);
-    assert(inputBuffers->policyResultBufferBytes == buffers->policyBufBytes);
-    assert(inputBuffers->valueResultBufferBytes == buffers->valueBufBytes);
-    assert(inputBuffers->userInputBufferBytes == buffers->inputBufBytes*2);
-    assert(inputBuffers->userInputGlobalBufferBytes == buffers->inputGlobalBufBytes*2);
-    assert(inputBuffers->userInputMetaBufferBytes == buffers->inputMetaBufBytes*2);
-    assert(inputBuffers->singleInputBytes == inputBuffers->singleInputElts*4);
-    assert(inputBuffers->singleInputGlobalBytes == inputBuffers->singleInputGlobalElts*4);
-    assert(inputBuffers->singleInputMetaBytes == inputBuffers->singleInputMetaElts*4);
-    assert(inputBuffers->singlePolicyPassResultElts == numPolicyChannels);
-    assert(inputBuffers->singlePolicyPassResultBytes == numPolicyChannels * sizeof(float));
-    assert(inputBuffers->singlePolicyResultElts == numPolicyChannels*nnXLen*nnYLen);
-    assert(inputBuffers->singlePolicyResultBytes == numPolicyChannels*nnXLen*nnYLen * sizeof(float));
-    assert(inputBuffers->scoreValueResultBufferBytes == buffers->scoreValueBufBytes);
-    assert(inputBuffers->ownershipResultBufferBytes == buffers->ownershipBufBytes);
-    assert(inputBuffers->singleOwnershipResultElts == nnXLen*nnYLen);
-    assert(inputBuffers->singleOwnershipResultBytes == nnXLen*nnYLen * sizeof(float));
-
-    CUDA_ERR("getOutput",hipMemcpy(buffers->inputBufFloat, inputBuffers->userInputBuffer, inputBuffers->singleInputBytes*batchSize, hipMemcpyHostToDevice));
-    CUDA_ERR("getOutput",hipMemcpy(buffers->inputGlobalBufFloat, inputBuffers->userInputGlobalBuffer, inputBuffers->singleInputGlobalBytes*batchSize, hipMemcpyHostToDevice));
-    if(numMetaFeatures > 0) {
-      CUDA_ERR("getOutput",hipMemcpy(buffers->inputMetaBufFloat, inputBuffers->userInputMetaBuffer, inputBuffers->singleInputMetaBytes*batchSize, hipMemcpyHostToDevice));
-    }
-
-    customCudaCopyToHalf((const float*)buffers->inputBufFloat,(half*)buffers->inputBuf,inputBuffers->singleInputElts*batchSize);
-    CUDA_ERR("getOutput",hipPeekAtLastError());
-    customCudaCopyToHalf((const float*)buffers->inputGlobalBufFloat,(half*)buffers->inputGlobalBuf,inputBuffers->singleInputGlobalElts*batchSize);
-    CUDA_ERR("getOutput",hipPeekAtLastError());
-    if(numMetaFeatures > 0) {
-      customCudaCopyToHalf((const float*)buffers->inputMetaBufFloat,(half*)buffers->inputMetaBuf,inputBuffers->singleInputMetaElts*batchSize);
-      CUDA_ERR("getOutput",hipPeekAtLastError());
-    }
-  }
-
-  gpuHandle->model->apply(
-    gpuHandle->cudaHandles.get(),
-    scratch,
-    batchSize,
-    gpuHandle->requireExactNNLen,
-
-    buffers->inputBuf,
-    buffers->inputGlobalBuf,
-    buffers->inputMetaBuf,
-
-    buffers->policyPassBuf,
-    buffers->policyBuf,
-
-    buffers->valueBuf,
-    buffers->scoreValueBuf,
-    buffers->ownershipBuf,
-
-    buffers->workspaceBuf,
-    buffers->workspaceBytes
-  );
-
-  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->policyPassResults, buffers->policyPassBuf, inputBuffers->singlePolicyPassResultBytes*batchSize, hipMemcpyDeviceToHost));
-  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->policyResults, buffers->policyBuf, inputBuffers->singlePolicyResultBytes*batchSize, hipMemcpyDeviceToHost));
-  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->valueResults, buffers->valueBuf, inputBuffers->singleValueResultBytes*batchSize, hipMemcpyDeviceToHost));
-  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->scoreValueResults, buffers->scoreValueBuf, inputBuffers->singleScoreValueResultBytes*batchSize, hipMemcpyDeviceToHost));
-  CUDA_ERR("getOutput",hipMemcpy(inputBuffers->ownershipResults, buffers->ownershipBuf, inputBuffers->singleOwnershipResultBytes*batchSize, hipMemcpyDeviceToHost));
-
-  assert(outputs.size() == batchSize);
-
-  float policyProbsTmp[NNPos::MAX_NN_POLICY_SIZE];
-
-  for(int row = 0; row < batchSize; row++) {
-    NNOutput* output = outputs[row];
-    assert(output->nnXLen == nnXLen);
-    assert(output->nnYLen == nnYLen);
-    float policyOptimism = (float)inputBufs[row]->policyOptimism;
-
-    const float* policyPassSrcBuf = inputBuffers->policyPassResults + row * numPolicyChannels;
-    const float* policySrcBuf = inputBuffers->policyResults + row * numPolicyChannels * nnXLen * nnYLen;
-    float* policyProbs = output->policyProbs;
-
-    // These are in logits, the client does the postprocessing to turn them into
-    // policy probabilities and white game outcome probabilities
-    // Also we don't fill in the nnHash here either
-    // Handle version >= 12 policy optimism
-    if(numPolicyChannels == 2 || (numPolicyChannels == 4 && modelVersion >= 16)) {
-       if(gpuHandle->usingNHWC) {
-        for(int i = 0; i<nnXLen*nnYLen; i++) {
-          float p = policySrcBuf[i*numPolicyChannels];
-          float pOpt = policySrcBuf[i*numPolicyChannels+1];
-          policyProbsTmp[i] = p + (pOpt-p) * policyOptimism;
-        }
-        SymmetryHelpers::copyOutputsWithSymmetry(policyProbsTmp, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
-        policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0] + (policyPassSrcBuf[1] - policyPassSrcBuf[0]) * policyOptimism;
-      }
-      else {
-        for(int i = 0; i<nnXLen*nnYLen; i++) {
-          float p = policySrcBuf[i];
-          float pOpt = policySrcBuf[i+nnXLen*nnYLen];
-          policyProbsTmp[i] = p + (pOpt-p) * policyOptimism;
-        }
-        SymmetryHelpers::copyOutputsWithSymmetry(policyProbsTmp, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
-        policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0] + (policyPassSrcBuf[1] - policyPassSrcBuf[0]) * policyOptimism;
-      }
-    }
-    else {
-      assert(numPolicyChannels == 1);
-      SymmetryHelpers::copyOutputsWithSymmetry(policySrcBuf, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
-      policyProbs[nnXLen*nnYLen] = policyPassSrcBuf[0];
-    }
-
-    int numValueChannels = gpuHandle->model->numValueChannels;
-    assert(numValueChannels == 3);
-    output->whiteWinProb = inputBuffers->valueResults[row * numValueChannels];
-    output->whiteLossProb = inputBuffers->valueResults[row * numValueChannels + 1];
-    output->whiteNoResultProb = inputBuffers->valueResults[row * numValueChannels + 2];
-
-    //As above, these are NOT actually from white's perspective, but rather the player to move.
-    //As usual the client does the postprocessing.
-    if(output->whiteOwnerMap != NULL) {
-      const float* ownershipSrcBuf = inputBuffers->ownershipResults + row * nnXLen * nnYLen;
-      assert(gpuHandle->model->numOwnershipChannels == 1);
-      SymmetryHelpers::copyOutputsWithSymmetry(ownershipSrcBuf, output->whiteOwnerMap, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
-    }
-
-    if(modelVersion >= 9) {
-      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
-      assert(numScoreValueChannels == 6);
-      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
-      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
-      output->whiteLead = inputBuffers->scoreValueResults[row * numScoreValueChannels + 2];
-      output->varTimeLeft = inputBuffers->scoreValueResults[row * numScoreValueChannels + 3];
-      output->shorttermWinlossError = inputBuffers->scoreValueResults[row * numScoreValueChannels + 4];
-      output->shorttermScoreError = inputBuffers->scoreValueResults[row * numScoreValueChannels + 5];
-    }
-    else if(modelVersion >= 8) {
-      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
-      assert(numScoreValueChannels == 4);
-      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
-      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
-      output->whiteLead = inputBuffers->scoreValueResults[row * numScoreValueChannels + 2];
-      output->varTimeLeft = inputBuffers->scoreValueResults[row * numScoreValueChannels + 3];
-      output->shorttermWinlossError = 0;
-      output->shorttermScoreError = 0;
-    }
-    else if(modelVersion >= 4) {
-      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
-      assert(numScoreValueChannels == 2);
-      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
-      output->whiteScoreMeanSq = inputBuffers->scoreValueResults[row * numScoreValueChannels + 1];
-      output->whiteLead = output->whiteScoreMean;
-      output->varTimeLeft = 0;
-      output->shorttermWinlossError = 0;
-      output->shorttermScoreError = 0;
-    }
-    else if(modelVersion >= 3) {
-      int numScoreValueChannels = gpuHandle->model->numScoreValueChannels;
-      assert(numScoreValueChannels == 1);
-      output->whiteScoreMean = inputBuffers->scoreValueResults[row * numScoreValueChannels];
-      //Version 3 neural nets don't have any second moment output, implicitly already folding it in, so we just use the mean squared
-      output->whiteScoreMeanSq = output->whiteScoreMean * output->whiteScoreMean;
-      output->whiteLead = output->whiteScoreMean;
-      output->varTimeLeft = 0;
-      output->shorttermWinlossError = 0;
-      output->shorttermScoreError = 0;
-    }
-    else {
-      ASSERT_UNREACHABLE;
-    }
-  }
-
-}
-
-//TESTING ----------------------------------------------------------------------------------
-
-
-bool NeuralNet::testEvaluateConv(
-  const ConvLayerDesc* desc,
-  int desiredBatchSize,
-  int nnXLen,
-  int nnYLen,
-  bool useFP16,
-  bool useNHWC,
-  const vector<float>& inputBuffer,
-  vector<float>& outputBuffer
-) {
-  hipDeviceSynchronize();
-  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
-
-  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->inChannels;
-  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->outChannels;
-  if(numInputFloats != inputBuffer.size())
-    throw StringError("testEvaluateConv: unexpected input buffer size");
-
-  void* deviceInput;
-  void* deviceOutput;
-  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
-  CudaUtils::mallocOnDevice("deviceOutput", numOutputFloats, deviceOutput, useFP16);
-
-  int maxBatchSize = desiredBatchSize;
-
-  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
-  ConvLayer* convLayer = new ConvLayer(cudaHandles,manager,desc,useFP16,useNHWC);
-
-  size_t workspaceBytes =
-    convLayer->requiredWorkspaceBytes(cudaHandles,desiredBatchSize);
-  void* deviceWorkspace;
-  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
-
-
-  bool accumulate = false;
-  convLayer->apply(
-    cudaHandles,
-    desiredBatchSize,
-    accumulate,
-    deviceInput,
-    deviceOutput,
-    deviceWorkspace,
-    workspaceBytes
-  );
-
-  outputBuffer.resize(numOutputFloats);
-  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceOutput, useFP16);
-
-  hipFree(deviceWorkspace);
-
-  delete convLayer;
-  delete manager;
-  hipFree(deviceInput);
-  hipFree(deviceOutput);
-  delete cudaHandles;
-
-  return true;
-}
-
-
-bool NeuralNet::testEvaluateBatchNorm(
-  const BatchNormLayerDesc* desc,
-  int desiredBatchSize,
-  int nnXLen,
-  int nnYLen,
-  bool useFP16,
-  bool useNHWC,
-  const vector<float>& inputBuffer,
-  const vector<float>& maskBuffer,
-  vector<float>& outputBuffer
-) {
-  hipDeviceSynchronize();
-  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
-
-  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->numChannels;
-  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
-  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->numChannels;
-  if(numInputFloats != inputBuffer.size())
-    throw StringError("testEvaluateBatchNorm: unexpected input buffer size");
-  if(numMaskFloats != maskBuffer.size())
-    throw StringError("testEvaluateBatchNorm: unexpected mask buffer size");
-
-  ActivationLayerDesc actDesc;
-  actDesc.activation = ACTIVATION_IDENTITY;
-
-  void* deviceInput;
-  void* deviceMask;
-  void* deviceOutput;
-  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
-  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
-  CudaUtils::mallocOnDevice("deviceOutput", numOutputFloats, deviceOutput, useFP16);
-
-  BatchNormLayer* batchNormLayer = new BatchNormLayer(cudaHandles,desc,&actDesc,nnXLen,nnYLen,useFP16,useNHWC);
-
-  batchNormLayer->apply(
-    cudaHandles,
-    desiredBatchSize,
-    deviceInput,
-    deviceMask,
-    deviceOutput
-  );
-
-  outputBuffer.resize(numOutputFloats);
-  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceOutput, useFP16);
-
-  delete batchNormLayer;
-
-  hipFree(deviceInput);
-  hipFree(deviceMask);
-  hipFree(deviceOutput);
-  delete cudaHandles;
-
-  return true;
-}
-
-
-bool NeuralNet::testEvaluateResidualBlock(
-  const ResidualBlockDesc* desc,
-  int desiredBatchSize,
-  int nnXLen,
-  int nnYLen,
-  bool useFP16,
-  bool useNHWC,
-  const vector<float>& inputBuffer,
-  const vector<float>& maskBuffer,
-  vector<float>& outputBuffer
-) {
-  hipDeviceSynchronize();
-  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
-
-  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->preBN.numChannels;
-  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
-  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->finalConv.outChannels;
-  if(numInputFloats != inputBuffer.size())
-    throw StringError("testEvaluateResidualBlock: unexpected input buffer size");
-  if(numMaskFloats != maskBuffer.size())
-    throw StringError("testEvaluateResidualBlock: unexpected mask buffer size");
-
-  ScratchBuffers* scratch = new ScratchBuffers(desiredBatchSize, nnXLen, nnYLen, useFP16);
-
-  void* deviceInput;
-  void* deviceMask;
-  void* deviceScratch;
-  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
-  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
-  CudaUtils::mallocOnDevice("deviceScratch", numInputFloats, deviceScratch, useFP16);
-
-  int maxBatchSize = desiredBatchSize;
-
-  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
-  ResidualBlock* residualBlock = new ResidualBlock(cudaHandles,manager,desc,nnXLen,nnYLen,useFP16,useNHWC);
-
-  size_t workspaceBytes =
-    residualBlock->requiredWorkspaceBytes(cudaHandles,desiredBatchSize);
-  void* deviceWorkspace;
-  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
-
-  residualBlock->apply(
-    cudaHandles,
-    scratch,
-    desiredBatchSize,
-    deviceInput,
-    deviceScratch,
-    deviceMask,
-    deviceWorkspace,
-    workspaceBytes
-  );
-
-  outputBuffer.resize(numOutputFloats);
-  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceInput, useFP16);
-
-  hipFree(deviceWorkspace);
-
-  delete residualBlock;
-  delete manager;
-  hipFree(deviceInput);
-  hipFree(deviceMask);
-  hipFree(deviceScratch);
-  delete scratch;
-  delete cudaHandles;
-
-  return true;
-}
-
-bool NeuralNet::testEvaluateGlobalPoolingResidualBlock(
-  const GlobalPoolingResidualBlockDesc* desc,
-  int desiredBatchSize,
-  int nnXLen,
-  int nnYLen,
-  bool useFP16,
-  bool useNHWC,
-  const vector<float>& inputBuffer,
-  const vector<float>& maskBuffer,
-  vector<float>& outputBuffer
-) {
-  hipDeviceSynchronize();
-  CudaHandles* cudaHandles = CudaHandles::cudaHandlesTesting();
-
-  size_t numInputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->preBN.numChannels;
-  size_t numMaskFloats = (size_t)desiredBatchSize * nnXLen * nnYLen;
-  size_t numMaskSumFloats = (size_t)desiredBatchSize;
-  size_t numOutputFloats = (size_t)desiredBatchSize * nnXLen * nnYLen * desc->finalConv.outChannels;
-
-  if(numInputFloats != inputBuffer.size())
-    throw StringError("testEvaluateGlobalPoolingResidualBlock: unexpected input buffer size");
-  if(numMaskFloats != maskBuffer.size())
-    throw StringError("testEvaluateGlobalPoolingResidualBlock: unexpected mask buffer size");
-
-  ScratchBuffers* scratch = new ScratchBuffers(desiredBatchSize, nnXLen, nnYLen, useFP16);
-
-  void* deviceInput;
-  void* deviceMask;
-  float* deviceMaskFloatOrig;
-  float* deviceMaskFloat;
-  float* deviceMaskSum;
-  void* deviceScratch;
-
-  CudaUtils::mallocAndCopyToDevice("deviceInput", inputBuffer.data(), numInputFloats, deviceInput, useFP16);
-  CudaUtils::mallocAndCopyToDevice("deviceMask", maskBuffer.data(), numMaskFloats, deviceMask, useFP16);
-  CUDA_ERR("deviceMaskFloat",hipMalloc(reinterpret_cast<void**>(&deviceMaskFloat), numMaskFloats * sizeof(float)));
-  CUDA_ERR("deviceMaskSum",hipMalloc(reinterpret_cast<void**>(&deviceMaskSum), numMaskSumFloats * sizeof(float)));
-  deviceMaskFloatOrig = deviceMaskFloat;
-  CudaUtils::mallocOnDevice("deviceScratch", numInputFloats, deviceScratch, useFP16);
-
-  fillMaskFloatBufAndMaskSumBuf(deviceMask, deviceMaskFloat, deviceMaskSum, useFP16, desiredBatchSize, nnXLen, nnYLen);
-
-  int maxBatchSize = desiredBatchSize;
-
-  CudnnManager* manager = new CudnnManager("manager",maxBatchSize,nnXLen,nnYLen);
-  GlobalPoolingResidualBlock* residualBlock = new GlobalPoolingResidualBlock(
-    cudaHandles,manager,desc,nnXLen,nnYLen,useFP16,useNHWC
-  );
-
-  size_t workspaceBytes =
-    residualBlock->requiredWorkspaceBytes(
-      cudaHandles,desiredBatchSize
-    );
-
-  void* deviceWorkspace;
-  CUDA_ERR("deviceWorkspace",hipMalloc(&deviceWorkspace, workspaceBytes));
-
-  residualBlock->apply(
-    cudaHandles,
-    scratch,
-    desiredBatchSize,
-    deviceInput,
-    deviceScratch,
-    deviceMask,
-    deviceMaskSum,
-    deviceWorkspace,
-    workspaceBytes
-  );
-
-  outputBuffer.resize(numOutputFloats);
-  CudaUtils::expensiveCopyFromDevice("copyResultsToHost", outputBuffer.data(), numOutputFloats, deviceInput, useFP16);
-
-  hipFree(deviceWorkspace);
-
-  delete residualBlock;
-  delete manager;
-
-  hipFree(deviceInput);
-  hipFree(deviceMask);
-  hipFree(deviceMaskFloatOrig);
-  hipFree(deviceMaskSum);
-  hipFree(deviceScratch);
-  delete scratch;
-  delete cudaHandles;
-
-  return true;
-}
-
-
-#endif  // USE_ROCM_BACKEND

From c70d841a92f97d364ba6132b4957eb6014310cbb Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 2 Aug 2025 04:27:09 +0200
Subject: [PATCH 11/24] Update docks

---
 .gitignore                         |  2 +-
 Compiling.md                       |  3 +-
 README.md                          | 55 +++++++++++++++++-------------
 cpp/README.md                      |  2 +-
 cpp/configs/analysis_example.cfg   | 29 ++++++++++++++--
 cpp/configs/contribute_example.cfg | 30 ++++++++++++++--
 cpp/configs/gtp_example.cfg        | 32 +++++++++++++++--
 cpp/configs/match_example.cfg      | 30 ++++++++++++++--
 8 files changed, 144 insertions(+), 39 deletions(-)

diff --git a/.gitignore b/.gitignore
index 2e933d553..83492a14f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,7 +21,7 @@ cpp/main
 cpp/maincuda
 cpp/mainopencl
 cpp/katago
-cpp/configs
+# cpp/configs
 cpp/evalsgf
 cpp/run*.sh
 cpp/tests/scratch
diff --git a/Compiling.md b/Compiling.md
index 648fea548..9d810a503 100644
--- a/Compiling.md
+++ b/Compiling.md
@@ -33,6 +33,7 @@ As also mentioned in the instructions below but repeated here for visibility, if
       * If using the OpenCL backend, a modern GPU that supports OpenCL 1.2 or greater, or else something like [this](https://software.intel.com/en-us/opencl-sdk) for CPU. But if using CPU, Eigen should be better.
       * If using the CUDA backend, CUDA 11 or later and a compatible version of CUDNN based on your CUDA version (https://developer.nvidia.com/cuda-toolkit) (https://developer.nvidia.com/cudnn) and a GPU capable of supporting them.
       * If using the TensorRT backend, in addition to a compatible CUDA Toolkit (https://developer.nvidia.com/cuda-toolkit), you also need TensorRT (https://developer.nvidia.com/tensorrt) that is at least version 8.5.
+      * If using the ROCm backend, ROCm 6.4 or later and a GPU capable of supporting them. More information about installation(https://rocm.docs.amd.com/projects/install-on-linux/en/latest/) and please install all possiable ROCm developer packages, instead of just ROCm runtime packages.
       * If using the Eigen backend, Eigen3. With Debian packages, (i.e. apt or apt-get), this should be `libeigen3-dev`.
       * zlib, libzip. With Debian packages (i.e. apt or apt-get), these should be `zlib1g-dev`, `libzip-dev`.
       * If you want to do self-play training and research, probably Google perftools `libgoogle-perftools-dev` for TCMalloc or some other better malloc implementation. For unknown reasons, the allocation pattern in self-play with large numbers of threads and parallel games causes a lot of memory fragmentation under glibc malloc that will eventually run your machine out of memory, but better mallocs handle it fine.
@@ -41,7 +42,7 @@ As also mentioned in the instructions below but repeated here for visibility, if
       * `git clone https://github.com/lightvector/KataGo.git`
    * Compile using CMake and make in the cpp directory:
       * `cd KataGo/cpp`
-      * `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=CUDA` or `cmake . -DUSE_BACKEND=TENSORRT` or `cmake . -DUSE_BACKEND=EIGEN` depending on which backend you want.
+      * `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=CUDA` or `cmake . -DUSE_BACKEND=TENSORRT` or `cmake . -DUSE_BACKEND=EIGEN` or `cmake . -DUSE_BACKEND=ROCM`depending on which backend you want.
          * Specify also `-DUSE_TCMALLOC=1` if using TCMalloc.
          * Compiling will also call git commands to embed the git hash into the compiled executable, specify also `-DNO_GIT_REVISION=1` to disable it if this is causing issues for you.
          * Specify `-DUSE_AVX2=1` to also compile Eigen with AVX2 and FMA support, which will make it incompatible with old CPUs but much faster. (If you want to go further, you can also add `-DCMAKE_CXX_FLAGS='-march=native'` which will specialize to precisely your machine's CPU, but the exe might not run on other machines at all).
diff --git a/README.md b/README.md
index ce7e87b97..768e40838 100644
--- a/README.md
+++ b/README.md
@@ -1,27 +1,30 @@
 # KataGo
 
-* [Overview](#overview)
-* [Training History and Research](#training-history-and-research)
-* [Where To Download Stuff](#where-to-download-stuff)
-* [Setting Up and Running KataGo](#setting-up-and-running-katago)
-  * [GUIs](#guis)
-  * [Windows and Linux](#windows-and-linux)
-  * [MacOS](#macos)
-  * [OpenCL vs CUDA vs TensorRT vs Eigen](#opencl-vs-cuda-vs-tensorrt-vs-eigen)
-  * [How To Use](#how-to-use)
-  * [Tuning for Performance](#tuning-for-performance)
-  * [Common Questions and Issues](#common-questions-and-issues)
-    * [Issues with specific GPUs or GPU drivers](#issues-with-specific-gpus-or-gpu-drivers)
-    * [Common Problems](#common-problems)
-    * [Other Questions](#other-questions)
-* [Features for Developers](#features-for-developers)
-  * [GTP Extensions](#gtp-extensions)
-  * [Analysis Engine](#analysis-engine)
-* [Compiling KataGo](#compiling-katago)
-* [Source Code Overview](#source-code-overview)
-* [Selfplay Training](#selfplay-training)
-* [Contributors](#contributors)
-* [License](#license)
+- [KataGo](#katago)
+  - [Overview](#overview)
+  - [Training History and Research and Docs](#training-history-and-research-and-docs)
+  - [Where To Download Stuff](#where-to-download-stuff)
+  - [Setting Up and Running KataGo](#setting-up-and-running-katago)
+    - [GUIs](#guis)
+    - [Windows and Linux](#windows-and-linux)
+    - [MacOS](#macos)
+    - [OpenCL vs CUDA vs TensorRT vs ROCm vs Eigen](#opencl-vs-cuda-vs-tensorrt-vs-rocm-vs-eigen)
+    - [How To Use](#how-to-use)
+      - [Human-style Play and Analysis](#human-style-play-and-analysis)
+      - [Other Commands:](#other-commands)
+    - [Tuning for Performance](#tuning-for-performance)
+    - [Common Questions and Issues](#common-questions-and-issues)
+      - [Issues with specific GPUs or GPU drivers](#issues-with-specific-gpus-or-gpu-drivers)
+      - [Common Problems](#common-problems)
+      - [Other Questions](#other-questions)
+  - [Features for Developers](#features-for-developers)
+      - [GTP Extensions:](#gtp-extensions)
+      - [Analysis Engine:](#analysis-engine)
+  - [Compiling KataGo](#compiling-katago)
+  - [Source Code Overview:](#source-code-overview)
+  - [Selfplay Training:](#selfplay-training)
+  - [Contributors](#contributors)
+  - [License](#license)
 
 ## Overview
 
@@ -84,8 +87,8 @@ The community also provides KataGo packages for [Homebrew](https://brew.sh) on M
 
 Use `brew install katago`. The latest config files and networks are installed in KataGo's `share` directory. Find them via `brew list --verbose katago`. A basic way to run katago will be `katago gtp -config $(brew list --verbose katago | grep 'gtp.*\.cfg') -model $(brew list --verbose katago | grep .gz | head -1)`. You should choose the Network according to the release notes here and customize the provided example config as with every other way of installing KataGo.
 
-### OpenCL vs CUDA vs TensorRT vs Eigen
-KataGo has four backends, OpenCL (GPU), CUDA (GPU), TensorRT (GPU), and Eigen (CPU).
+### OpenCL vs CUDA vs TensorRT vs ROCm vs Eigen
+KataGo has five backends, OpenCL (GPU), CUDA (GPU), TensorRT (GPU), ROCm (GPU) and Eigen (CPU).
 
 The quick summary is:
   * **To easily get something working, try OpenCL if you have any good or decent GPU.**
@@ -93,11 +96,13 @@ The quick summary is:
   * Use Eigen with AVX2 if you don't have a GPU or if your GPU is too old/weak to work with OpenCL, and you just want a plain CPU KataGo.
   * Use Eigen without AVX2 if your CPU is old or on a low-end device that doesn't support AVX2.
   * The CUDA backend can work for NVIDIA GPUs with CUDA+CUDNN installed but is likely worse than TensorRT.
+  * The ROCm backend can work for AMD GPUs with ROCm+MIOpen installed.
 
 More in detail:
   * OpenCL is a general GPU backend should be able to run with any GPUs or accelerators that support [OpenCL](https://en.wikipedia.org/wiki/OpenCL), including NVIDIA GPUs, AMD GPUs, as well CPU-based OpenCL implementations or things like Intel Integrated Graphics. This is the most general GPU version of KataGo and doesn't require a complicated install like CUDA does, so is most likely to work out of the box as long as you have a fairly modern GPU. **However, it also need to take some time when run for the very first time to tune itself.** For many systems, this will take 5-30 seconds, but on a few older/slower systems, may take many minutes or longer. Also, the quality of OpenCL implementations is sometimes inconsistent, particularly for Intel Integrated Graphics and for AMD GPUs that are older than several years, so it might not work for very old machines, as well as specific buggy newer AMD GPUs, see also [Issues with specific GPUs or GPU drivers](#issues-with-specific-gpus-or-gpu-drivers).
   * CUDA is a GPU backend specific to NVIDIA GPUs (it will not work with AMD or Intel or any other GPUs) and requires installing [CUDA](https://developer.nvidia.com/cuda-zone) and [CUDNN](https://developer.nvidia.com/cudnn) and a modern NVIDIA GPU. On most GPUs, the OpenCL implementation will actually beat NVIDIA's own CUDA/CUDNN at performance. The exception is for top-end NVIDIA GPUs that support FP16 and tensor cores, in which case sometimes one is better and sometimes the other is better.
   * TensorRT is similar to CUDA, but only uses NVIDIA's TensorRT framework to run the neural network with more optimized kernels. For modern NVIDIA GPUs, it should work whenever CUDA does and will usually be faster than CUDA or any other backend.
+  * ROCm is a GPU backend specific to AMD GPUs (it will not work with NVIDIA or Intel or any other GPUs) and requires installing [ROCm](https://rocm.docs.amd.com) and [MIOpen](https://rocm.docs.amd.com/projects/MIOpen) and a modern AMD GPU. On most GPUs, the OpenCL implementation will actually beat AMD's own ROCm/MIOpen at performance. The exception is for top-end AMD GPUs that support FP16 and stream processors, in which case sometimes one is better and sometimes the other is better.
   * Eigen is a *CPU* backend that should work widely *without* needing a GPU or fancy drivers. Use this if you don't have a good GPU or really any GPU at all. It will be quite significantly slower than OpenCL or CUDA, but on a good CPU can still often get 10 to 20 playouts per second if using the smaller (15 or 20) block neural nets. Eigen can also be compiled with AVX2 and FMA support, which can provide a big performance boost for Intel and AMD CPUs from the last few years. However, it will not run at all on older CPUs (and possibly even some recent but low-power modern CPUs) that don't support these fancy vector instructions.
 
 For **any** implementation, it's recommended that you also tune the number of threads used if you care about optimal performance, as it can make a factor of 2-3 difference in the speed. See "Tuning for Performance" below. However, if you mostly just want to get it working, then the default untuned settings should also be still reasonable.
@@ -175,6 +180,8 @@ This section summarizes a number of common questions and issues when running Kat
 #### Issues with specific GPUs or GPU drivers
 If you are observing any crashes in KataGo while attempting to run the benchmark or the program itself, and you have one of the below GPUs, then this is likely the reason.
 
+* **AMD GPUs** - If you choose to use ROCm backend, uou need a GPU supported with official [System requirements lists](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html) (at least AMD Radeon RX 7700 XT). And ROCm backend only supports Linux now, because MIOpen and CMake HIP Language doesn't support Windows at this moment. We suggest installing the lastest version of ROCm developer stack.
+
 * **AMD Radeon RX 5700** - AMD's drivers for OpenCL for this GPU have been buggy ever since this GPU was released, and as of May 2020 AMD has still never released a fix. If you are using this GPU, you will just not be able to run KataGo (Leela Zero and other Go engines will probably fail too) and will probably also obtain incorrect calculations or crash if doing anything else scientific or mathematical that uses OpenCL. See for example these reddit threads: [[1]](https://www.reddit.com/r/Amd/comments/ebso1x/its_not_just_setihome_any_mathematic_or/) or [[2]](https://www.reddit.com/r/BOINC/comments/ebiz18/psa_please_remove_your_amd_rx5700xt_from_setihome/) or this [L19 thread](https://lifein19x19.com/viewtopic.php?f=18&t=17093).
 * **OpenCL Mesa** - These drivers for OpenCL are buggy. Particularly if on startup before crashing you see KataGo printing something like
 `Found OpenCL Platform 0: ... (Mesa) (OpenCL 1.1 Mesa ...) ...`
diff --git a/cpp/README.md b/cpp/README.md
index 1f5d8d21f..7376c6b7d 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -15,7 +15,7 @@ Summary of source folders, in approximate dependency order, from lowest level to
   * `nninputs.{cpp,h}` - Implements the input features for the neural net.
   * `sgfmetadata.{cpp,h}` - Implements the input features for the [HumanSL neural net](https://github.com/lightvector/KataGo/blob/master/docs/Analysis_Engine.md#human-sl-analysis-guide), for conditioning on various SGF metadata about human players from training data.
   * `nninterface.h` - Common interface that is implemented by every low-level neural net backend.
-  * `{cuda,opencl,eigen,trt,dummy}backend.cpp` - Various backends.
+  * `{cuda,opencl,eigen,trt,rocm,metal,dummy}backend.cpp` - Various backends.
   * `nneval.{cpp,h}` - Top-level handle to the neural net used by the rest of the engine, implements thread-safe batching of queries.
 * `search` - The main search engine.
   * `timecontrols.cpp` - Basic handling of a few possible time controls.
diff --git a/cpp/configs/analysis_example.cfg b/cpp/configs/analysis_example.cfg
index 090bdd242..d5b7e3990 100644
--- a/cpp/configs/analysis_example.cfg
+++ b/cpp/configs/analysis_example.cfg
@@ -207,9 +207,7 @@ nnRandomize = true
 # cudaUseNHWC = auto
 
 
-# ------------------------------
-# Metal GPU settings
-# ------------------------------
+# Metal GPU settings--------------------------------------
 # These only apply when using the METAL version of KataGo.
 
 # For one Metal instance: KataGo will automatically use the default device.
@@ -223,6 +221,31 @@ nnRandomize = true
 # The pattern continues for additional Metal instances.
 
 
+# ROCm GPU settings--------------------------------------
+# These only apply when using the ROCm version of KataGo.
+
+# IF USING ONE GPU: optionally uncomment and change this if the GPU you want to use turns out to be not device 0
+# rocmDeviceToUse = 0
+
+# IF USING TWO GPUS: Uncomment these two lines (AND set numNNServerThreadsPerModel above):
+# rocmDeviceToUseThread0 = 0  # change this if the first GPU you want to use turns out to be not device 0
+# rocmDeviceToUseThread1 = 1  # change this if the second GPU you want to use turns out to be not device 1
+
+# IF USING THREE GPUS: Uncomment these three lines (AND set numNNServerThreadsPerModel above):
+# rocmDeviceToUseThread0 = 0  # change this if the first GPU you want to use turns out to be not device 0
+# rocmDeviceToUseThread1 = 1  # change this if the second GPU you want to use turns out to be not device 1
+# rocmDeviceToUseThread2 = 2  # change this if the third GPU you want to use turns out to be not device 2
+
+# You can probably guess the pattern if you have four, five, etc. GPUs.
+
+# KataGo will automatically use FP16 or not based on the compute capability of your AMD GPU. If you
+# want to try to force a particular behavior though you can uncomment these lines and change them
+# to "true" or "false". E.g. it's using FP16 but on your card that's giving an error, or it's not using
+# FP16 but you think it should.
+# rocmUseFP16 = auto
+# ROCm does not support NHWC, so this is always false.
+
+
 # OpenCL-specific GPU settings--------------------------------------
 # These only apply when using the OpenCL version of KataGo.
 
diff --git a/cpp/configs/contribute_example.cfg b/cpp/configs/contribute_example.cfg
index 6ca039f11..fb6f0d81d 100644
--- a/cpp/configs/contribute_example.cfg
+++ b/cpp/configs/contribute_example.cfg
@@ -83,9 +83,8 @@ watchOngoingGameInFileName = watchgame.txt
 # cudaUseNHWC = auto
 
 
-# ------------------------------
-# Metal GPU settings
-# ------------------------------
+# Metal GPU settings--------------------------------------
+
 # These only apply when using the METAL version of KataGo.
 
 # For one Metal instance: KataGo will automatically use the default device.
@@ -99,6 +98,31 @@ watchOngoingGameInFileName = watchgame.txt
 # The pattern continues for additional Metal instances.
 
 
+# ROCm GPU settings--------------------------------------
+# These only apply when using the ROCm version of KataGo.
+
+# IF USING ONE GPU: optionally uncomment and change this if the GPU you want to use turns out to be not device 0
+# rocmDeviceToUse = 0
+
+# IF USING TWO GPUS: Uncomment these two lines (AND set numNNServerThreadsPerModel above):
+# rocmDeviceToUseThread0 = 0  # change this if the first GPU you want to use turns out to be not device 0
+# rocmDeviceToUseThread1 = 1  # change this if the second GPU you want to use turns out to be not device 1
+
+# IF USING THREE GPUS: Uncomment these three lines (AND set numNNServerThreadsPerModel above):
+# rocmDeviceToUseThread0 = 0  # change this if the first GPU you want to use turns out to be not device 0
+# rocmDeviceToUseThread1 = 1  # change this if the second GPU you want to use turns out to be not device 1
+# rocmDeviceToUseThread2 = 2  # change this if the third GPU you want to use turns out to be not device 2
+
+# You can probably guess the pattern if you have four, five, etc. GPUs.
+
+# KataGo will automatically use FP16 or not based on the compute capability of your AMD GPU. If you
+# want to try to force a particular behavior though you can uncomment these lines and change them
+# to "true" or "false". E.g. it's using FP16 but on your card that's giving an error, or it's not using
+# FP16 but you think it should.
+# rocmUseFP16 = auto
+# ROCm does not support NHWC, so this is always false.
+
+
 # OpenCL GPU settings--------------------------------------
 # These only apply when using the OpenCL version of KataGo.
 
diff --git a/cpp/configs/gtp_example.cfg b/cpp/configs/gtp_example.cfg
index 58098db42..f8289140d 100644
--- a/cpp/configs/gtp_example.cfg
+++ b/cpp/configs/gtp_example.cfg
@@ -443,9 +443,9 @@ searchFactorWhenWinningThreshold = 0.95
 # cudaUseFP16 = auto
 # cudaUseNHWC = auto
 
-# ------------------------------
-# Metal GPU settings
-# ------------------------------
+
+# Metal GPU settings--------------------------------------
+
 # These only apply when using the METAL version of KataGo.
 
 # For one Metal instance: KataGo will automatically use the default device.
@@ -458,6 +458,32 @@ searchFactorWhenWinningThreshold = 0.95
 
 # The pattern continues for additional Metal instances.
 
+
+# ROCm GPU settings--------------------------------------
+# These only apply when using the ROCm version of KataGo.
+
+# IF USING ONE GPU: optionally uncomment and change this if the GPU you want to use turns out to be not device 0
+# rocmDeviceToUse = 0
+
+# IF USING TWO GPUS: Uncomment these two lines (AND set numNNServerThreadsPerModel above):
+# rocmDeviceToUseThread0 = 0  # change this if the first GPU you want to use turns out to be not device 0
+# rocmDeviceToUseThread1 = 1  # change this if the second GPU you want to use turns out to be not device 1
+
+# IF USING THREE GPUS: Uncomment these three lines (AND set numNNServerThreadsPerModel above):
+# rocmDeviceToUseThread0 = 0  # change this if the first GPU you want to use turns out to be not device 0
+# rocmDeviceToUseThread1 = 1  # change this if the second GPU you want to use turns out to be not device 1
+# rocmDeviceToUseThread2 = 2  # change this if the third GPU you want to use turns out to be not device 2
+
+# You can probably guess the pattern if you have four, five, etc. GPUs.
+
+# KataGo will automatically use FP16 or not based on the compute capability of your AMD GPU. If you
+# want to try to force a particular behavior though you can uncomment these lines and change them
+# to "true" or "false". E.g. it's using FP16 but on your card that's giving an error, or it's not using
+# FP16 but you think it should.
+# rocmUseFP16 = auto
+# ROCm does not support NHWC, so this is always false.
+
+
 # ------------------------------
 # OpenCL GPU settings
 # ------------------------------
diff --git a/cpp/configs/match_example.cfg b/cpp/configs/match_example.cfg
index 7e5b4fc09..08859f557 100644
--- a/cpp/configs/match_example.cfg
+++ b/cpp/configs/match_example.cfg
@@ -156,9 +156,8 @@ numNNServerThreadsPerModel = 1
 # cudaUseNHWC = auto
 
 
-# ------------------------------
-# Metal GPU settings
-# ------------------------------
+# Metal GPU settings--------------------------------------
+
 # These only apply when using the METAL version of KataGo.
 
 # For one Metal instance: KataGo will automatically use the default device.
@@ -172,6 +171,31 @@ numNNServerThreadsPerModel = 1
 # The pattern continues for additional Metal instances.
 
 
+# ROCm GPU settings--------------------------------------
+# These only apply when using the ROCm version of KataGo.
+
+# IF USING ONE GPU: optionally uncomment and change this if the GPU you want to use turns out to be not device 0
+# rocmDeviceToUse = 0
+
+# IF USING TWO GPUS: Uncomment these two lines (AND set numNNServerThreadsPerModel above):
+# rocmDeviceToUseThread0 = 0  # change this if the first GPU you want to use turns out to be not device 0
+# rocmDeviceToUseThread1 = 1  # change this if the second GPU you want to use turns out to be not device 1
+
+# IF USING THREE GPUS: Uncomment these three lines (AND set numNNServerThreadsPerModel above):
+# rocmDeviceToUseThread0 = 0  # change this if the first GPU you want to use turns out to be not device 0
+# rocmDeviceToUseThread1 = 1  # change this if the second GPU you want to use turns out to be not device 1
+# rocmDeviceToUseThread2 = 2  # change this if the third GPU you want to use turns out to be not device 2
+
+# You can probably guess the pattern if you have four, five, etc. GPUs.
+
+# KataGo will automatically use FP16 or not based on the compute capability of your AMD GPU. If you
+# want to try to force a particular behavior though you can uncomment these lines and change them
+# to "true" or "false". E.g. it's using FP16 but on your card that's giving an error, or it's not using
+# FP16 but you think it should.
+# rocmUseFP16 = auto
+# ROCm does not support NHWC, so this is always false.
+
+
 # OpenCL GPU settings--------------------------------------
 # These only apply when using OpenCL as the backend for inference.
 # (For GTP, we only ever have one model, when playing matches, we might have more than one, see match_example.cfg)

From 1d05ca8d640a1aa55a7f7a835b0f84db7e85d36d Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 2 Aug 2025 04:28:24 +0200
Subject: [PATCH 12/24] Update gitignore

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 83492a14f..2e933d553 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,7 +21,7 @@ cpp/main
 cpp/maincuda
 cpp/mainopencl
 cpp/katago
-# cpp/configs
+cpp/configs
 cpp/evalsgf
 cpp/run*.sh
 cpp/tests/scratch

From 9d4662b7d8cacaec0f9b566aa8741813fa2bc870 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 2 Aug 2025 14:01:55 +0200
Subject: [PATCH 13/24] Update new method

---
 cpp/neuralnet/rocmbackend.cpp | 289 ++++++----------------------------
 1 file changed, 50 insertions(+), 239 deletions(-)

diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index 0fd5aa03f..9e7f4cf0b 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -255,11 +255,8 @@ struct ConvLayer {
   ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
   miopenTensorDescriptor_t filterDescriptor;
   miopenConvolutionDescriptor_t convolutionDescriptor;
-  ByBatchSize<miopenConvAlgoPerf_t>* convolutionAlgorithms; //array of one for each batch size
+  ByBatchSize<miopenConvSolution_t>* convolutionAlgorithms; //array of one for each batch size
   void* filterBuf;
-  void* inputTmp;
-  void* outputTmp;
-  void* workspaceTmp;
 
   ConvLayer() = delete;
   ConvLayer(const ConvLayer&) = delete;
@@ -299,8 +296,6 @@ struct ConvLayer {
     inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
     outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
     int maxBatchSize = manager->maxBatchSize;
-    int xLen = manager->nnXLen;
-    int yLen = manager->nnYLen;
 
     bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
 
@@ -334,54 +329,68 @@ struct ConvLayer {
       CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
     }
 
-    convolutionAlgorithms = new ByBatchSize<miopenConvAlgoPerf_t>(maxBatchSize);
-
-    size_t inBytes  = maxBatchSize * inChannels  * xLen * yLen + 3324928;
-    size_t outBytes = maxBatchSize * outChannels * xLen * yLen + 3324928;
-    size_t workspaceBytes = requiredWorkspaceBytes(cudaHandles, maxBatchSize) + 3324928;
-    
-    CudaUtils::mallocOnDevice(name, inBytes, inputTmp, useFP16);
-    CudaUtils::mallocOnDevice(name, outBytes, outputTmp, useFP16);
-    CudaUtils::mallocOnDevice(name, workspaceBytes, workspaceTmp, useFP16);
-    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+    convolutionAlgorithms = new ByBatchSize<miopenConvSolution_t>(maxBatchSize);
 
     for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
       const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
       const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
-      const int requestedAlgoCount = 8;
-      int returnedAlgoCount = -1;
-      miopenConvAlgoPerf_t results[2 * requestedAlgoCount];
-      CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
+      size_t requestedAlgoCount = 8;
+      size_t returnedAlgoCount = -1;
+      miopenConvSolution_t solutions[2 * requestedAlgoCount];
+      CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
+        cudaHandles->cudnn,
+        filterDescriptor,
+        inputDescriptor,
+        convolutionDescriptor,
+        outputDescriptor,
+        &requestedAlgoCount
+      ));
+      CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolution(
           cudaHandles->cudnn,
-          inputDescriptor,
-          inputTmp,
           filterDescriptor,
-          filterBuf,
+          inputDescriptor,
           convolutionDescriptor,
           outputDescriptor,
-          outputTmp,
           requestedAlgoCount,
           &returnedAlgoCount,
-          results,
-          workspaceTmp,
-          workspaceBytes,
-          false
+          solutions
         ));
       if(returnedAlgoCount <= 0)
-        throw StringError("miopenFindConvolutionForwardAlgorithm returned no algorithms?");
-      (*convolutionAlgorithms)[batchSize] = results[0];
+        throw StringError("miopenConvolutionForwardGetSolution returned no algorithms?");
+      (*convolutionAlgorithms)[batchSize] = solutions[0];
+      CUDNN_ERR(name.c_str(),miopenConvolutionForwardCompileSolution(
+        cudaHandles->cudnn,
+        filterDescriptor,
+        inputDescriptor,
+        convolutionDescriptor,
+        outputDescriptor,
+        (*convolutionAlgorithms)[batchSize].solution_id
+      ));
     }
 
     assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
 
-    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+    if(filterNHWC) {
+      vector<float> weightsTransposed(desc->weights.size());
+      for(int y = 0; y < convYSize; y++) {
+        for(int x = 0; x < convXSize; x++) {
+          for(int ic = 0; ic < inChannels; ic++) {
+            for(int oc = 0; oc < outChannels; oc++) {
+              weightsTransposed[((oc*convYSize + y)*convXSize + x)*inChannels + ic] =
+                desc->weights[((oc*inChannels + ic)*convYSize + y)*convXSize + x];
+            }
+          }
+        }
+      }
+      CudaUtils::mallocAndCopyToDevice(name,weightsTransposed,filterBuf,useFP16);
+      hipDeviceSynchronize();
+    }
+    else
+      CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
   }
 
   ~ConvLayer() {
     hipFree(filterBuf);
-    hipFree(inputTmp);
-    hipFree(outputTmp);
-    hipFree(workspaceTmp);
     miopenDestroyTensorDescriptor(filterDescriptor);
     miopenDestroyConvolutionDescriptor(convolutionDescriptor);
     delete convolutionAlgorithms;
@@ -392,12 +401,13 @@ struct ConvLayer {
     int batchSize
   ) const {
     size_t workspaceBytes = 0;
-    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetWorkSpaceSize(
+    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionWorkspaceSize(
       cudaHandles->cudnn,
       filterDescriptor,
       inputDescriptors[batchSize],
       convolutionDescriptor,
       outputDescriptors[batchSize],
+      (*convolutionAlgorithms)[batchSize].solution_id,
       &workspaceBytes
     ));
     return workspaceBytes;
@@ -412,224 +422,25 @@ struct ConvLayer {
     void* workspaceBuf,
     size_t workspaceBytes
   ) const {
-    accumulate = false;
     const float alpha = 1.0f;
     const float beta = accumulate ? 1.0f : 0.0f;
-    CUDNN_ERR(name.c_str(), miopenConvolutionForward(
+    CUDNN_ERR(name.c_str(), miopenConvolutionForwardImmediate(
       cudaHandles->cudnn,
-      &alpha,
-      inputDescriptors[batchSize],
-      inputBuf,
       filterDescriptor,
       filterBuf,
+      inputDescriptors[batchSize],
+      inputBuf,
       convolutionDescriptor,
-      (*convolutionAlgorithms)[batchSize].fwd_algo,
-      &beta,
       outputDescriptors[batchSize],
       outputBuf,
       workspaceBuf,
-      workspaceBytes
+      workspaceBytes,
+      (*convolutionAlgorithms)[batchSize].solution_id
     ));
   }
 
 };
 
-// New ConvLayer structure with MIOpen API
-
-// struct ConvLayer {
-//   const string name;
-//   const int inChannels;
-//   const int outChannels;
-//   ByBatchSizeView<miopenTensorDescriptor_t> inputDescriptors;
-//   ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
-//   miopenTensorDescriptor_t filterDescriptor;
-//   miopenConvolutionDescriptor_t convolutionDescriptor;
-//   ByBatchSize<miopenConvSolution_t>* convolutionAlgorithms; //array of one for each batch size
-//   void* filterBuf;
-
-//   ConvLayer() = delete;
-//   ConvLayer(const ConvLayer&) = delete;
-//   ConvLayer& operator=(const ConvLayer&) = delete;
-
-//   ConvLayer(
-//     CudaHandles* cudaHandles,
-//     CudnnManager* manager,
-//     const ConvLayerDesc* desc,
-//     bool useFP16,
-//     bool useNHWC
-//   ) : ConvLayer(cudaHandles, manager, desc, useFP16, useNHWC, useNHWC)
-//   {}
-
-//   ConvLayer(
-//     CudaHandles* cudaHandles,
-//     CudnnManager* manager,
-//     const ConvLayerDesc* desc,
-//     bool useFP16,
-//     bool useNHWCIn,
-//     bool useNHWCOut
-//   ) :
-//     name(desc->name),
-//     inChannels(desc->inChannels),
-//     outChannels(desc->outChannels)
-//   {
-//     int convYSize = desc->convYSize;
-//     int convXSize = desc->convXSize;
-//     int dilationY = desc->dilationY;
-//     int dilationX = desc->dilationX;
-//     int paddingX = (convXSize / 2) * dilationX;
-//     int paddingY = (convYSize / 2) * dilationY;
-
-//     assert(convXSize % 2 == 1);
-//     assert(convYSize % 2 == 1);
-
-//     inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
-//     outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
-//     int maxBatchSize = manager->maxBatchSize;
-
-//     bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
-
-//     CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&filterDescriptor));
-//     CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
-//       filterDescriptor,
-//       (useFP16 ? miopenHalf : miopenFloat),
-//       outChannels,
-//       inChannels,
-//       convYSize,
-//       convXSize
-//     ));
-
-//     int yStride = 1;
-//     int xStride = 1;
-
-
-//     CUDNN_ERR(name.c_str(),miopenCreateConvolutionDescriptor(&convolutionDescriptor));
-//     CUDNN_ERR(name.c_str(),miopenInitConvolutionDescriptor(
-//       convolutionDescriptor,
-//       miopenConvolution,
-//       paddingY,
-//       paddingX,
-//       yStride,
-//       xStride,
-//       dilationY,
-//       dilationX
-//     ));
-//     if(useFP16) {
-//       int alt = 1; // non‑zero enables alt‑impl on MI2xx+ GPUs
-//       CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
-//     }
-
-//     convolutionAlgorithms = new ByBatchSize<miopenConvSolution_t>(maxBatchSize);
-
-//     for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
-//       const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
-//       const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
-//       size_t requestedAlgoCount = 8;
-//       size_t returnedAlgoCount = -1;
-//       miopenConvSolution_t solutions[2 * requestedAlgoCount];
-//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
-//         cudaHandles->cudnn,
-//         filterDescriptor,
-//         inputDescriptor,
-//         convolutionDescriptor,
-//         outputDescriptor,
-//         &requestedAlgoCount
-//       ));
-//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolution(
-//           cudaHandles->cudnn,
-//           filterDescriptor,
-//           inputDescriptor,
-//           convolutionDescriptor,
-//           outputDescriptor,
-//           requestedAlgoCount,
-//           &returnedAlgoCount,
-//           solutions
-//         ));
-//       if(returnedAlgoCount <= 0)
-//         throw StringError("miopenConvolutionForwardGetSolution returned no algorithms?");
-//       (*convolutionAlgorithms)[batchSize] = solutions[0];
-//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardCompileSolution(
-//         cudaHandles->cudnn,
-//         filterDescriptor,
-//         inputDescriptor,
-//         convolutionDescriptor,
-//         outputDescriptor,
-//         (*convolutionAlgorithms)[batchSize].solution_id
-//       ));
-//     }
-
-//     assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
-
-//     if(filterNHWC) {
-//       vector<float> weightsTransposed(desc->weights.size());
-//       for(int y = 0; y < convYSize; y++) {
-//         for(int x = 0; x < convXSize; x++) {
-//           for(int ic = 0; ic < inChannels; ic++) {
-//             for(int oc = 0; oc < outChannels; oc++) {
-//               weightsTransposed[((oc*convYSize + y)*convXSize + x)*inChannels + ic] =
-//                 desc->weights[((oc*inChannels + ic)*convYSize + y)*convXSize + x];
-//             }
-//           }
-//         }
-//       }
-//       CudaUtils::mallocAndCopyToDevice(name,weightsTransposed,filterBuf,useFP16);
-//       hipDeviceSynchronize();
-//     }
-//     else
-//       CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
-//   }
-
-//   ~ConvLayer() {
-//     hipFree(filterBuf);
-//     miopenDestroyTensorDescriptor(filterDescriptor);
-//     miopenDestroyConvolutionDescriptor(convolutionDescriptor);
-//     delete convolutionAlgorithms;
-//   }
-
-//   size_t requiredWorkspaceBytes(
-//     CudaHandles* cudaHandles,
-//     int batchSize
-//   ) const {
-//     size_t workspaceBytes = 0;
-//     CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionWorkspaceSize(
-//       cudaHandles->cudnn,
-//       filterDescriptor,
-//       inputDescriptors[batchSize],
-//       convolutionDescriptor,
-//       outputDescriptors[batchSize],
-//       (*convolutionAlgorithms)[batchSize].solution_id,
-//       &workspaceBytes
-//     ));
-//     return workspaceBytes;
-//   }
-
-//   void apply(
-//     CudaHandles* cudaHandles,
-//     int batchSize,
-//     bool accumulate,
-//     void* inputBuf,
-//     void* outputBuf,
-//     void* workspaceBuf,
-//     size_t workspaceBytes
-//   ) const {
-//     const float alpha = 1.0f;
-//     const float beta = accumulate ? 1.0f : 0.0f;
-//     CUDNN_ERR(name.c_str(), miopenConvolutionForwardImmediate(
-//       cudaHandles->cudnn,
-//       filterDescriptor,
-//       filterBuf,
-//       inputDescriptors[batchSize],
-//       inputBuf,
-//       convolutionDescriptor,
-//       outputDescriptors[batchSize],
-//       outputBuf,
-//       workspaceBuf,
-//       workspaceBytes,
-//       (*convolutionAlgorithms)[batchSize].solution_id
-//     ));
-//   }
-
-// };
-
 //---------------------------------------------------------------------------------
 
 struct BatchNormLayer {

From d40bd509355f882a1efcd9bdc9ae1ef2713f90cd Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 2 Aug 2025 14:20:05 +0200
Subject: [PATCH 14/24] Optimize performance

---
 cpp/neuralnet/rocmbackend.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index 539f0b91a..0fd5aa03f 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -336,9 +336,9 @@ struct ConvLayer {
 
     convolutionAlgorithms = new ByBatchSize<miopenConvAlgoPerf_t>(maxBatchSize);
 
-    size_t inBytes  = maxBatchSize * inChannels  * xLen * yLen;
-    size_t outBytes = maxBatchSize * outChannels * xLen * yLen;
-    size_t workspaceBytes = requiredWorkspaceBytes(cudaHandles, maxBatchSize) + 10305856; //1661440; 
+    size_t inBytes  = maxBatchSize * inChannels  * xLen * yLen + 3324928;
+    size_t outBytes = maxBatchSize * outChannels * xLen * yLen + 3324928;
+    size_t workspaceBytes = requiredWorkspaceBytes(cudaHandles, maxBatchSize) + 3324928;
     
     CudaUtils::mallocOnDevice(name, inBytes, inputTmp, useFP16);
     CudaUtils::mallocOnDevice(name, outBytes, outputTmp, useFP16);

From 158d24dff21eedc12e58d202d52a4a766ac1f2fc Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Wed, 13 Aug 2025 04:05:50 +0200
Subject: [PATCH 15/24] Update new Convlayer method

---
 cpp/neuralnet/rocmbackend.cpp | 289 ++++++----------------------------
 1 file changed, 50 insertions(+), 239 deletions(-)

diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index 0fd5aa03f..9e7f4cf0b 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -255,11 +255,8 @@ struct ConvLayer {
   ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
   miopenTensorDescriptor_t filterDescriptor;
   miopenConvolutionDescriptor_t convolutionDescriptor;
-  ByBatchSize<miopenConvAlgoPerf_t>* convolutionAlgorithms; //array of one for each batch size
+  ByBatchSize<miopenConvSolution_t>* convolutionAlgorithms; //array of one for each batch size
   void* filterBuf;
-  void* inputTmp;
-  void* outputTmp;
-  void* workspaceTmp;
 
   ConvLayer() = delete;
   ConvLayer(const ConvLayer&) = delete;
@@ -299,8 +296,6 @@ struct ConvLayer {
     inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
     outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
     int maxBatchSize = manager->maxBatchSize;
-    int xLen = manager->nnXLen;
-    int yLen = manager->nnYLen;
 
     bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
 
@@ -334,54 +329,68 @@ struct ConvLayer {
       CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
     }
 
-    convolutionAlgorithms = new ByBatchSize<miopenConvAlgoPerf_t>(maxBatchSize);
-
-    size_t inBytes  = maxBatchSize * inChannels  * xLen * yLen + 3324928;
-    size_t outBytes = maxBatchSize * outChannels * xLen * yLen + 3324928;
-    size_t workspaceBytes = requiredWorkspaceBytes(cudaHandles, maxBatchSize) + 3324928;
-    
-    CudaUtils::mallocOnDevice(name, inBytes, inputTmp, useFP16);
-    CudaUtils::mallocOnDevice(name, outBytes, outputTmp, useFP16);
-    CudaUtils::mallocOnDevice(name, workspaceBytes, workspaceTmp, useFP16);
-    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+    convolutionAlgorithms = new ByBatchSize<miopenConvSolution_t>(maxBatchSize);
 
     for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
       const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
       const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
-      const int requestedAlgoCount = 8;
-      int returnedAlgoCount = -1;
-      miopenConvAlgoPerf_t results[2 * requestedAlgoCount];
-      CUDNN_ERR(name.c_str(),miopenFindConvolutionForwardAlgorithm(
+      size_t requestedAlgoCount = 8;
+      size_t returnedAlgoCount = -1;
+      miopenConvSolution_t solutions[2 * requestedAlgoCount];
+      CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
+        cudaHandles->cudnn,
+        filterDescriptor,
+        inputDescriptor,
+        convolutionDescriptor,
+        outputDescriptor,
+        &requestedAlgoCount
+      ));
+      CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolution(
           cudaHandles->cudnn,
-          inputDescriptor,
-          inputTmp,
           filterDescriptor,
-          filterBuf,
+          inputDescriptor,
           convolutionDescriptor,
           outputDescriptor,
-          outputTmp,
           requestedAlgoCount,
           &returnedAlgoCount,
-          results,
-          workspaceTmp,
-          workspaceBytes,
-          false
+          solutions
         ));
       if(returnedAlgoCount <= 0)
-        throw StringError("miopenFindConvolutionForwardAlgorithm returned no algorithms?");
-      (*convolutionAlgorithms)[batchSize] = results[0];
+        throw StringError("miopenConvolutionForwardGetSolution returned no algorithms?");
+      (*convolutionAlgorithms)[batchSize] = solutions[0];
+      CUDNN_ERR(name.c_str(),miopenConvolutionForwardCompileSolution(
+        cudaHandles->cudnn,
+        filterDescriptor,
+        inputDescriptor,
+        convolutionDescriptor,
+        outputDescriptor,
+        (*convolutionAlgorithms)[batchSize].solution_id
+      ));
     }
 
     assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
 
-    CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
+    if(filterNHWC) {
+      vector<float> weightsTransposed(desc->weights.size());
+      for(int y = 0; y < convYSize; y++) {
+        for(int x = 0; x < convXSize; x++) {
+          for(int ic = 0; ic < inChannels; ic++) {
+            for(int oc = 0; oc < outChannels; oc++) {
+              weightsTransposed[((oc*convYSize + y)*convXSize + x)*inChannels + ic] =
+                desc->weights[((oc*inChannels + ic)*convYSize + y)*convXSize + x];
+            }
+          }
+        }
+      }
+      CudaUtils::mallocAndCopyToDevice(name,weightsTransposed,filterBuf,useFP16);
+      hipDeviceSynchronize();
+    }
+    else
+      CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
   }
 
   ~ConvLayer() {
     hipFree(filterBuf);
-    hipFree(inputTmp);
-    hipFree(outputTmp);
-    hipFree(workspaceTmp);
     miopenDestroyTensorDescriptor(filterDescriptor);
     miopenDestroyConvolutionDescriptor(convolutionDescriptor);
     delete convolutionAlgorithms;
@@ -392,12 +401,13 @@ struct ConvLayer {
     int batchSize
   ) const {
     size_t workspaceBytes = 0;
-    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetWorkSpaceSize(
+    CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionWorkspaceSize(
       cudaHandles->cudnn,
       filterDescriptor,
       inputDescriptors[batchSize],
       convolutionDescriptor,
       outputDescriptors[batchSize],
+      (*convolutionAlgorithms)[batchSize].solution_id,
       &workspaceBytes
     ));
     return workspaceBytes;
@@ -412,224 +422,25 @@ struct ConvLayer {
     void* workspaceBuf,
     size_t workspaceBytes
   ) const {
-    accumulate = false;
     const float alpha = 1.0f;
     const float beta = accumulate ? 1.0f : 0.0f;
-    CUDNN_ERR(name.c_str(), miopenConvolutionForward(
+    CUDNN_ERR(name.c_str(), miopenConvolutionForwardImmediate(
       cudaHandles->cudnn,
-      &alpha,
-      inputDescriptors[batchSize],
-      inputBuf,
       filterDescriptor,
       filterBuf,
+      inputDescriptors[batchSize],
+      inputBuf,
       convolutionDescriptor,
-      (*convolutionAlgorithms)[batchSize].fwd_algo,
-      &beta,
       outputDescriptors[batchSize],
       outputBuf,
       workspaceBuf,
-      workspaceBytes
+      workspaceBytes,
+      (*convolutionAlgorithms)[batchSize].solution_id
     ));
   }
 
 };
 
-// New ConvLayer structure with MIOpen API
-
-// struct ConvLayer {
-//   const string name;
-//   const int inChannels;
-//   const int outChannels;
-//   ByBatchSizeView<miopenTensorDescriptor_t> inputDescriptors;
-//   ByBatchSizeView<miopenTensorDescriptor_t> outputDescriptors;
-//   miopenTensorDescriptor_t filterDescriptor;
-//   miopenConvolutionDescriptor_t convolutionDescriptor;
-//   ByBatchSize<miopenConvSolution_t>* convolutionAlgorithms; //array of one for each batch size
-//   void* filterBuf;
-
-//   ConvLayer() = delete;
-//   ConvLayer(const ConvLayer&) = delete;
-//   ConvLayer& operator=(const ConvLayer&) = delete;
-
-//   ConvLayer(
-//     CudaHandles* cudaHandles,
-//     CudnnManager* manager,
-//     const ConvLayerDesc* desc,
-//     bool useFP16,
-//     bool useNHWC
-//   ) : ConvLayer(cudaHandles, manager, desc, useFP16, useNHWC, useNHWC)
-//   {}
-
-//   ConvLayer(
-//     CudaHandles* cudaHandles,
-//     CudnnManager* manager,
-//     const ConvLayerDesc* desc,
-//     bool useFP16,
-//     bool useNHWCIn,
-//     bool useNHWCOut
-//   ) :
-//     name(desc->name),
-//     inChannels(desc->inChannels),
-//     outChannels(desc->outChannels)
-//   {
-//     int convYSize = desc->convYSize;
-//     int convXSize = desc->convXSize;
-//     int dilationY = desc->dilationY;
-//     int dilationX = desc->dilationX;
-//     int paddingX = (convXSize / 2) * dilationX;
-//     int paddingY = (convYSize / 2) * dilationY;
-
-//     assert(convXSize % 2 == 1);
-//     assert(convYSize % 2 == 1);
-
-//     inputDescriptors = manager->getTensorDesc4DByBatchSize(inChannels,useFP16,useNHWCIn);
-//     outputDescriptors = manager->getTensorDesc4DByBatchSize(outChannels,useFP16,useNHWCOut);
-//     int maxBatchSize = manager->maxBatchSize;
-
-//     bool filterNHWC = useNHWCOut && dilationY == 1 && dilationX == 1;
-
-//     CUDNN_ERR(name.c_str(),miopenCreateTensorDescriptor(&filterDescriptor));
-//     CUDNN_ERR(name.c_str(),miopenSet4dTensorDescriptor(
-//       filterDescriptor,
-//       (useFP16 ? miopenHalf : miopenFloat),
-//       outChannels,
-//       inChannels,
-//       convYSize,
-//       convXSize
-//     ));
-
-//     int yStride = 1;
-//     int xStride = 1;
-
-
-//     CUDNN_ERR(name.c_str(),miopenCreateConvolutionDescriptor(&convolutionDescriptor));
-//     CUDNN_ERR(name.c_str(),miopenInitConvolutionDescriptor(
-//       convolutionDescriptor,
-//       miopenConvolution,
-//       paddingY,
-//       paddingX,
-//       yStride,
-//       xStride,
-//       dilationY,
-//       dilationX
-//     ));
-//     if(useFP16) {
-//       int alt = 1; // non‑zero enables alt‑impl on MI2xx+ GPUs
-//       CUDNN_ERR(name.c_str(),miopenSetConvolutionAttribute(convolutionDescriptor,MIOPEN_CONVOLUTION_ATTRIB_FP16_ALT_IMPL,alt));
-//     }
-
-//     convolutionAlgorithms = new ByBatchSize<miopenConvSolution_t>(maxBatchSize);
-
-//     for(int batchSize = 1; batchSize <= maxBatchSize; batchSize++) {
-//       const miopenTensorDescriptor_t& inputDescriptor = inputDescriptors[batchSize];
-//       const miopenTensorDescriptor_t& outputDescriptor = outputDescriptors[batchSize];
-//       size_t requestedAlgoCount = 8;
-//       size_t returnedAlgoCount = -1;
-//       miopenConvSolution_t solutions[2 * requestedAlgoCount];
-//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionCount(
-//         cudaHandles->cudnn,
-//         filterDescriptor,
-//         inputDescriptor,
-//         convolutionDescriptor,
-//         outputDescriptor,
-//         &requestedAlgoCount
-//       ));
-//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolution(
-//           cudaHandles->cudnn,
-//           filterDescriptor,
-//           inputDescriptor,
-//           convolutionDescriptor,
-//           outputDescriptor,
-//           requestedAlgoCount,
-//           &returnedAlgoCount,
-//           solutions
-//         ));
-//       if(returnedAlgoCount <= 0)
-//         throw StringError("miopenConvolutionForwardGetSolution returned no algorithms?");
-//       (*convolutionAlgorithms)[batchSize] = solutions[0];
-//       CUDNN_ERR(name.c_str(),miopenConvolutionForwardCompileSolution(
-//         cudaHandles->cudnn,
-//         filterDescriptor,
-//         inputDescriptor,
-//         convolutionDescriptor,
-//         outputDescriptor,
-//         (*convolutionAlgorithms)[batchSize].solution_id
-//       ));
-//     }
-
-//     assert(desc->weights.size() == convYSize * convXSize * inChannels * outChannels);
-
-//     if(filterNHWC) {
-//       vector<float> weightsTransposed(desc->weights.size());
-//       for(int y = 0; y < convYSize; y++) {
-//         for(int x = 0; x < convXSize; x++) {
-//           for(int ic = 0; ic < inChannels; ic++) {
-//             for(int oc = 0; oc < outChannels; oc++) {
-//               weightsTransposed[((oc*convYSize + y)*convXSize + x)*inChannels + ic] =
-//                 desc->weights[((oc*inChannels + ic)*convYSize + y)*convXSize + x];
-//             }
-//           }
-//         }
-//       }
-//       CudaUtils::mallocAndCopyToDevice(name,weightsTransposed,filterBuf,useFP16);
-//       hipDeviceSynchronize();
-//     }
-//     else
-//       CudaUtils::mallocAndCopyToDevice(name,desc->weights,filterBuf,useFP16);
-//   }
-
-//   ~ConvLayer() {
-//     hipFree(filterBuf);
-//     miopenDestroyTensorDescriptor(filterDescriptor);
-//     miopenDestroyConvolutionDescriptor(convolutionDescriptor);
-//     delete convolutionAlgorithms;
-//   }
-
-//   size_t requiredWorkspaceBytes(
-//     CudaHandles* cudaHandles,
-//     int batchSize
-//   ) const {
-//     size_t workspaceBytes = 0;
-//     CUDNN_ERR(name.c_str(),miopenConvolutionForwardGetSolutionWorkspaceSize(
-//       cudaHandles->cudnn,
-//       filterDescriptor,
-//       inputDescriptors[batchSize],
-//       convolutionDescriptor,
-//       outputDescriptors[batchSize],
-//       (*convolutionAlgorithms)[batchSize].solution_id,
-//       &workspaceBytes
-//     ));
-//     return workspaceBytes;
-//   }
-
-//   void apply(
-//     CudaHandles* cudaHandles,
-//     int batchSize,
-//     bool accumulate,
-//     void* inputBuf,
-//     void* outputBuf,
-//     void* workspaceBuf,
-//     size_t workspaceBytes
-//   ) const {
-//     const float alpha = 1.0f;
-//     const float beta = accumulate ? 1.0f : 0.0f;
-//     CUDNN_ERR(name.c_str(), miopenConvolutionForwardImmediate(
-//       cudaHandles->cudnn,
-//       filterDescriptor,
-//       filterBuf,
-//       inputDescriptors[batchSize],
-//       inputBuf,
-//       convolutionDescriptor,
-//       outputDescriptors[batchSize],
-//       outputBuf,
-//       workspaceBuf,
-//       workspaceBytes,
-//       (*convolutionAlgorithms)[batchSize].solution_id
-//     ));
-//   }
-
-// };
-
 //---------------------------------------------------------------------------------
 
 struct BatchNormLayer {

From 0bfe0a144279a54e50e5a0555d82633d19360f07 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 4 Oct 2025 15:43:24 +0200
Subject: [PATCH 16/24] Add new compile target

---
 cpp/CMakeLists.txt | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 471a67a5f..69ecf22b4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -165,7 +165,8 @@ elseif(USE_BACKEND STREQUAL "ROCM")
   # Users can -DCMAKE_HIP_ARCHITECTURES=gfx90a;gfx942 manually specify GFX architectures
   if(NOT DEFINED CMAKE_HIP_ARCHITECTURES)
     # Default compile MI200 / RDNA3 cards, can be simplified as needed
-    set(CMAKE_HIP_ARCHITECTURES 90a 942 908 1100 1101 1200 1201 CACHE STRING "AMD GPU targets")
+    # set(CMAKE_HIP_ARCHITECTURES gfx950 gfx942 gfx90a gfx908 gfx1100 gfx1101 gfx1151 gfx1201 gfx1030 CACHE STRING "AMD GPU targets")
+    add_compile_definitions(-DGPU_TARGETS=gfx950,gfx942,gfx90a,gfx908,gfx1100,gfx1101,gfx1151,gfx1201,gfx1030)
   endif()
 
   # 2) Specify backend source code. rocmhelpers.hip contains GPU kernels, don't forget it
@@ -464,7 +465,7 @@ elseif(USE_BACKEND STREQUAL "ROCM")
   target_compile_definitions(katago PRIVATE HIP_TARGET_VERSION=${CMAKE_HIP_COMPILER_VERSION})
 
   string(TOLOWER "${CMAKE_HIP_ARCHITECTURES}" _gfxlist)  # e.g. "90a;942"
-  if(_gfxlist MATCHES "803|900|90a|94[0-9]|110[0-9]|120[0-9]")
+  if(_gfxlist MATCHES "803|900|90a|94[0-9]|110[0-9]|120[0-9]|115[0-9]|1030")
     target_compile_definitions(katago PRIVATE HIP_SUPPORTS_FP16)
     message(STATUS "Detected FP16‑capable GFX arch (${CMAKE_HIP_ARCHITECTURES}); defining HIP_SUPPORTS_FP16")
   endif()

From 26d8c5bd257a8508bae7673ade8b60a5d5506188 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 8 Nov 2025 13:04:03 +0100
Subject: [PATCH 17/24] Add ROCm for Windows support

---
 cpp/CMakeLists.txt | 119 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 93 insertions(+), 26 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index de93a0540..830b529f8 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -2,8 +2,20 @@ cmake_minimum_required(VERSION 3.18.2)
 if(USE_BACKEND STREQUAL "METAL")
   project(katago LANGUAGES CXX Swift)
 elseif(USE_BACKEND STREQUAL "ROCM")
-  set(CMAKE_C_COMPILER  /opt/rocm/bin/hipcc CACHE FILEPATH "" FORCE)
-  set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc CACHE FILEPATH "" FORCE)
+  if(WIN32)
+    # Windows: Use clang++ from HIP SDK (hipcc doesn't work well on Windows)
+    # User can override with -DCMAKE_CXX_COMPILER if needed
+    if(NOT DEFINED CMAKE_CXX_COMPILER)
+      if(DEFINED ENV{HIP_PATH})
+        set(CMAKE_CXX_COMPILER "$ENV{HIP_PATH}/bin/clang++.exe" CACHE FILEPATH "" FORCE)
+        set(CMAKE_C_COMPILER "$ENV{HIP_PATH}/bin/clang.exe" CACHE FILEPATH "" FORCE)
+      endif()
+    endif()
+  else()
+    # Linux: Use hipcc
+    set(CMAKE_C_COMPILER  /opt/rocm/bin/hipcc CACHE FILEPATH "" FORCE)
+    set(CMAKE_CXX_COMPILER /opt/rocm/bin/hipcc CACHE FILEPATH "" FORCE)
+  endif()
   project(katago LANGUAGES C CXX HIP)
 else()
   project(katago)
@@ -151,14 +163,23 @@ elseif(USE_BACKEND STREQUAL "ROCM")
   set(CMAKE_HIP_STANDARD 17)
 
   if(CMAKE_PREFIX_PATH STREQUAL "" OR NOT DEFINED CMAKE_PREFIX_PATH)
-    if(DEFINED ENV{HIP_PATH})
-      # Windows HIP‑SDK
-      list(APPEND CMAKE_PREFIX_PATH $ENV{HIP_PATH})
-      message(STATUS "Auto‑detected HIP_PATH=$ENV{HIP_PATH} → CMAKE_PREFIX_PATH")
-    elseif(EXISTS "/opt/rocm")
-      # Linux
-      list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
-      message(STATUS "CMAKE_PREFIX_PATH not given; defaulting to /opt/rocm")
+    if(WIN32)
+      # Windows: HIP SDK installed via installer or manually
+      if(DEFINED ENV{HIP_PATH})
+        list(APPEND CMAKE_PREFIX_PATH "$ENV{HIP_PATH}")
+        message(STATUS "Auto-detected HIP_PATH=$ENV{HIP_PATH} → CMAKE_PREFIX_PATH")
+      elseif(DEFINED ENV{ROCM_PATH})
+        list(APPEND CMAKE_PREFIX_PATH "$ENV{ROCM_PATH}")
+        message(STATUS "Auto-detected ROCM_PATH=$ENV{ROCM_PATH} → CMAKE_PREFIX_PATH")
+      else()
+        message(WARNING "HIP_PATH or ROCM_PATH environment variable not set. Please install HIP SDK for Windows.")
+      endif()
+    else()
+      # Linux: Standard ROCm installation path
+      if(EXISTS "/opt/rocm")
+        list(APPEND CMAKE_PREFIX_PATH "/opt/rocm")
+        message(STATUS "CMAKE_PREFIX_PATH not given; defaulting to /opt/rocm")
+      endif()
     endif()
   endif()
 
@@ -473,20 +494,38 @@ elseif(USE_BACKEND STREQUAL "ROCM")
     message(STATUS "Detected FP16‑capable GFX arch (${CMAKE_HIP_ARCHITECTURES}); defining HIP_SUPPORTS_FP16")
   endif()
 
-  # 3) Find ROCm runtime & libraries. Since ROCm 6.x, CMake config-mode packages are included. If not found, add -DCMAKE_PREFIX_PATH=/opt/rocm
+  # 3) Find ROCm runtime & libraries. Since ROCm 6.x, CMake config-mode packages are included. If not found, add -DCMAKE_PREFIX_PATH=/opt/rocm (Linux) or HIP SDK path (Windows)
   find_package(hip        QUIET CONFIG)   # Export hip::device / hip::host
   find_package(hipblas    QUIET CONFIG)   # Export roc::hipblas
-  find_package(miopen     QUIET CONFIG)   # Export roc::miopen
+  find_package(miopen     QUIET CONFIG)   # Export roc::miopen or MIOpen
+  
   # ---------- fallback：HIP Runtime ----------
   if(NOT hip_FOUND)
-    find_path(HIP_INCLUDE_DIR hip/hip_runtime.h
-              HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
-              PATH_SUFFIXES include)
-    find_library(HIP_RUNTIME_LIB amdhip64
-                 HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
-                 PATH_SUFFIXES lib lib64)
+    if(WIN32)
+      # Windows: Search in HIP SDK installation
+      find_path(HIP_INCLUDE_DIR hip/hip_runtime.h
+                HINTS ${CMAKE_PREFIX_PATH} ENV HIP_PATH ENV ROCM_PATH
+                PATH_SUFFIXES include)
+      find_library(HIP_RUNTIME_LIB 
+                   NAMES amdhip64 amdhip64_6
+                   HINTS ${CMAKE_PREFIX_PATH} ENV HIP_PATH ENV ROCM_PATH
+                   PATH_SUFFIXES lib bin)
+    else()
+      # Linux: Search in /opt/rocm
+      find_path(HIP_INCLUDE_DIR hip/hip_runtime.h
+                HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
+                PATH_SUFFIXES include)
+      find_library(HIP_RUNTIME_LIB amdhip64
+                   HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
+                   PATH_SUFFIXES lib lib64)
+    endif()
+    
     if(NOT HIP_INCLUDE_DIR OR NOT HIP_RUNTIME_LIB)
-      message(FATAL_ERROR "HIP headers or runtime NOT found; install ROCm or set CMAKE_PREFIX_PATH.")
+      if(WIN32)
+        message(FATAL_ERROR "HIP headers or runtime NOT found; install HIP SDK for Windows or set CMAKE_PREFIX_PATH to HIP SDK installation path.")
+      else()
+        message(FATAL_ERROR "HIP headers or runtime NOT found; install ROCm or set CMAKE_PREFIX_PATH.")
+      endif()
     endif()
     add_library(hip::device UNKNOWN IMPORTED)
     set_target_properties(hip::device PROPERTIES
@@ -498,26 +537,54 @@ elseif(USE_BACKEND STREQUAL "ROCM")
   # ---------- fallback：hipBLAS / MIOpen ----------
   foreach(_pkg hipblas miopen)
     if(NOT ${_pkg}_FOUND)
-      find_library(${_pkg}_LIB ${_pkg}
-                   HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
-                   PATH_SUFFIXES lib lib64)
+      if(WIN32)
+        # Windows naming conventions
+        if(_pkg STREQUAL "hipblas")
+          set(_lib_names hipblas)
+        else()
+          set(_lib_names MIOpen)
+        endif()
+        find_library(${_pkg}_LIB 
+                     NAMES ${_lib_names}
+                     HINTS ${CMAKE_PREFIX_PATH} ENV HIP_PATH ENV ROCM_PATH
+                     PATH_SUFFIXES lib bin)
+      else()
+        # Linux naming
+        find_library(${_pkg}_LIB ${_pkg}
+                     HINTS ${CMAKE_PREFIX_PATH} /opt/rocm
+                     PATH_SUFFIXES lib lib64)
+      endif()
+      
       if(${_pkg}_LIB)
         add_library(roc::${_pkg} UNKNOWN IMPORTED)
         set_target_properties(roc::${_pkg} PROPERTIES
           IMPORTED_LOCATION "${${_pkg}_LIB}")
         target_include_directories(katago SYSTEM PRIVATE ${HIP_INCLUDE_DIR})
+        message(STATUS "Found ${_pkg} at ${${_pkg}_LIB}")
       else()
-        message(FATAL_ERROR "Required ROCm component ${_pkg} not found – install it or set CMAKE_PREFIX_PATH.")
+        if(WIN32)
+          message(FATAL_ERROR "Required ROCm component ${_pkg} not found – install HIP SDK for Windows or set CMAKE_PREFIX_PATH.")
+        else()
+          message(FATAL_ERROR "Required ROCm component ${_pkg} not found – install it or set CMAKE_PREFIX_PATH.")
+        endif()
       endif()
     endif()
   endforeach()
 
-  # 4) Header file paths are resolved by config-mode targets, no need to hard-code
+  # 4) Link libraries
+  # Note: On Windows, MIOpen might need to be linked as "MIOpen" directly if the target doesn't exist
+  if(TARGET MIOpen)
+    set(_miopen_target MIOpen)
+  elseif(TARGET roc::miopen)
+    set(_miopen_target roc::miopen)
+  else()
+    set(_miopen_target roc::miopen)
+  endif()
+  
   target_link_libraries(katago
     hip::device          # HIP runtime & kernel offload
     roc::hipblas         # BLAS
-    MIOpen
-    # roc::miopen          # DNN primitives
+    ${_miopen_target}    # DNN primitives
   )
 elseif(USE_BACKEND STREQUAL "EIGEN")
   target_compile_definitions(katago PRIVATE USE_EIGEN_BACKEND)

From ed396b72aa93e30e9ffcb728e879093151b92033 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Sat, 28 Feb 2026 05:28:48 +0800
Subject: [PATCH 18/24] Fix bugs

---
 cpp/CMakeLists.txt            | 11 +++++++++--
 cpp/neuralnet/rocmbackend.cpp |  9 +++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 577dfd2c3..ddf9f11b1 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -495,10 +495,17 @@ elseif(USE_BACKEND STREQUAL "ROCM")
   target_compile_definitions(katago PRIVATE USE_ROCM_BACKEND)
   target_compile_definitions(katago PRIVATE HIP_TARGET_VERSION=${CMAKE_HIP_COMPILER_VERSION})
 
+  # Option to disable FP16 support
+  set(ROCM_DISABLE_FP16 OFF CACHE BOOL "Disable FP16 support for ROCm backend")
+  
   string(TOLOWER "${CMAKE_HIP_ARCHITECTURES}" _gfxlist)  # e.g. "90a;942"
   if(_gfxlist MATCHES "803|900|90a|94[0-9]|110[0-9]|120[0-9]|115[0-9]|1030")
-    target_compile_definitions(katago PRIVATE HIP_SUPPORTS_FP16)
-    message(STATUS "Detected FP16‑capable GFX arch (${CMAKE_HIP_ARCHITECTURES}); defining HIP_SUPPORTS_FP16")
+    if(ROCM_DISABLE_FP16)
+      message(STATUS "Detected FP16‑capable GFX arch (${CMAKE_HIP_ARCHITECTURES}), but ROCM_DISABLE_FP16 is set, NOT defining HIP_SUPPORTS_FP16")
+    else()
+      target_compile_definitions(katago PRIVATE HIP_SUPPORTS_FP16)
+      message(STATUS "Detected FP16‑capable GFX arch (${CMAKE_HIP_ARCHITECTURES}); defining HIP_SUPPORTS_FP16")
+    endif()
   endif()
 
   # 3) Find ROCm runtime & libraries. Since ROCm 6.x, CMake config-mode packages are included. If not found, add -DCMAKE_PREFIX_PATH=/opt/rocm (Linux) or HIP SDK path (Windows)
diff --git a/cpp/neuralnet/rocmbackend.cpp b/cpp/neuralnet/rocmbackend.cpp
index 9e7f4cf0b..53e2c8d5a 100644
--- a/cpp/neuralnet/rocmbackend.cpp
+++ b/cpp/neuralnet/rocmbackend.cpp
@@ -2330,6 +2330,15 @@ ComputeHandle* NeuralNet::createComputeHandle(
   if(context->useFP16Mode == enabled_t::True || context->useFP16Mode == enabled_t::Auto)
     useFP16 = true;
 
+  // ROCm backend currently only supports NCHW format
+  if(inputsUseNHWC) {
+    if(logger != NULL) {
+      logger->write(
+        "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": WARNING - NHWC format requested but not supported by ROCm backend, falling back to NCHW"
+      );
+    }
+  }
+
   if(logger != NULL) {
     logger->write(
       "ROCm backend thread " + Global::intToString(serverThreadIdx) + ": Found GPU " + string(prop.name)

From ce2c9fc53a115a4aa7ae8d0240c56aacd4da0c20 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Mon, 16 Mar 2026 18:38:00 +0800
Subject: [PATCH 19/24] Add Intel NPU support

---
 .gitignore                         |   2 +-
 Compiling.md                       |  68 ++-
 README.md                          |  32 +-
 cpp/CMakeLists.txt                 | 199 ++++++-
 cpp/README.md                      |   5 +-
 cpp/command/benchmark.cpp          |  97 +++-
 cpp/command/misc.cpp               |  57 ++
 cpp/configs/analysis_example.cfg   |  28 +
 cpp/configs/contribute_example.cfg |  28 +
 cpp/configs/gtp_example.cfg        |  37 ++
 cpp/configs/match_example.cfg      |  30 +
 cpp/dataio/loadmodel.cpp           |  19 +-
 cpp/main.cpp                       |   7 +
 cpp/main.h                         |   1 +
 cpp/neuralnet/onnxbackend.cpp      | 867 +++++++++++++++++++++++++++++
 cpp/neuralnet/onnxmodelbuilder.cpp | 774 +++++++++++++++++++++++++
 cpp/neuralnet/onnxmodelbuilder.h   |  14 +
 cpp/program/gtpconfig.cpp          |  31 +-
 cpp/program/gtpconfig.h            |   3 +-
 cpp/program/setup.cpp              |  55 +-
 cpp/runonnxtests.sh                |  43 ++
 21 files changed, 2350 insertions(+), 47 deletions(-)
 create mode 100644 cpp/neuralnet/onnxbackend.cpp
 create mode 100644 cpp/neuralnet/onnxmodelbuilder.cpp
 create mode 100644 cpp/neuralnet/onnxmodelbuilder.h
 create mode 100644 cpp/runonnxtests.sh

diff --git a/.gitignore b/.gitignore
index 2e933d553..94076bdce 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,7 +23,7 @@ cpp/mainopencl
 cpp/katago
 cpp/configs
 cpp/evalsgf
-cpp/run*.sh
+# cpp/run*.sh
 cpp/tests/scratch
 cpp/program/gitinfo.h
 
diff --git a/Compiling.md b/Compiling.md
index 60a0b8276..8fcea46f8 100644
--- a/Compiling.md
+++ b/Compiling.md
@@ -35,6 +35,7 @@ As also mentioned in the instructions below but repeated here for visibility, if
       * If using the TensorRT backend, in addition to a compatible CUDA Toolkit (https://developer.nvidia.com/cuda-toolkit), you also need TensorRT (https://developer.nvidia.com/tensorrt) that is at least version 8.5.
       * If using the ROCm backend, ROCm 6.4 or later and a GPU capable of supporting them. More information about installation(https://rocm.docs.amd.com/projects/install-on-linux/en/latest/) and please install all possiable ROCm developer packages, instead of just ROCm runtime packages.
       * If using the Eigen backend, Eigen3. With Debian packages, (i.e. apt or apt-get), this should be `libeigen3-dev`.
+      * If using the ONNX backend, ONNX Runtime headers/libs and ONNX protobuf dependencies (`onnx/onnx-ml.pb.h`, `onnx_proto`, `protobuf-lite`) for `.bin.gz` model conversion support.
       * zlib, libzip. With Debian packages (i.e. apt or apt-get), these should be `zlib1g-dev`, `libzip-dev`.
       * If you want to do self-play training and research, probably Google perftools `libgoogle-perftools-dev` for TCMalloc or some other better malloc implementation. For unknown reasons, the allocation pattern in self-play with large numbers of threads and parallel games causes a lot of memory fragmentation under glibc malloc that will eventually run your machine out of memory, but better mallocs handle it fine.
       * If compiling to contribute to public distributed training runs, OpenSSL is required (`libssl-dev`).
@@ -42,7 +43,7 @@ As also mentioned in the instructions below but repeated here for visibility, if
       * `git clone https://github.com/lightvector/KataGo.git`
    * Compile using CMake and make in the cpp directory:
       * `cd KataGo/cpp`
-      * `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=CUDA` or `cmake . -DUSE_BACKEND=TENSORRT` or `cmake . -DUSE_BACKEND=EIGEN` or `cmake . -DUSE_BACKEND=ROCM`depending on which backend you want.
+      * `cmake . -DUSE_BACKEND=OPENCL` or `cmake . -DUSE_BACKEND=CUDA` or `cmake . -DUSE_BACKEND=TENSORRT` or `cmake . -DUSE_BACKEND=EIGEN` or `cmake . -DUSE_BACKEND=ROCM` or `cmake . -DUSE_BACKEND=ONNX` depending on which backend you want.
          * Specify also `-DUSE_TCMALLOC=1` if using TCMalloc.
          * Compiling will also call git commands to embed the git hash into the compiled executable, specify also `-DNO_GIT_REVISION=1` to disable it if this is causing issues for you.
          * Specify `-DUSE_AVX2=1` to also compile Eigen with AVX2 and FMA support, which will make it incompatible with old CPUs but much faster. (If you want to go further, you can also add `-DCMAKE_CXX_FLAGS='-march=native'` which will specialize to precisely your machine's CPU, but the exe might not run on other machines at all).
@@ -55,6 +56,60 @@ As also mentioned in the instructions below but repeated here for visibility, if
    * You will probably want to edit `configs/gtp_example.cfg` (see "Tuning for Performance" above).
    * If using OpenCL, you will want to verify that KataGo is picking up the correct device when you run it (e.g. some systems may have both an Intel CPU OpenCL and GPU OpenCL, if KataGo appears to pick the wrong one, you can correct this by specifying `openclGpuToUse` in `configs/gtp_example.cfg`).
 
+## ONNX Runtime Backend
+The ONNX backend uses ONNX Runtime for inference, and supports both:
+* `.onnx` models loaded directly.
+* `.bin.gz` KataGo models via internal conversion to ONNX graph (requires ONNX protobuf dependencies in CMake).
+
+### Windows Intel NPU (OpenVINO EP) Setup
+1. Install Visual Studio Community or Visual Studio 2026 Build Tools:
+   * https://visualstudio.microsoft.com/zh-hans/downloads/
+   * In installer workloads, select **Desktop development with C++**.
+2. Install Intel NPU driver:
+   * https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html
+3. Install OpenVINO 2026 archive package on Windows:
+   * https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html
+   * Typical install root looks like: `C:\Program Files (x86)\Intel\openvino_2026.0`
+4. Build ONNX Runtime with OpenVINO EP for NPU (follow official docs):
+   * https://onnxruntime.ai/docs/build/eps.html#openvino
+   * Set OpenVINO EP build option so `use_openvino` is `NPU` (for example `--use_openvino NPU` in ORT build.py).
+
+### Prepare `ONNXRUNTIME_ROOT` in KataGo
+Create:
+* `cpp/external/onnxruntime-win-x64-openvino/include`
+* `cpp/external/onnxruntime-win-x64-openvino/lib`
+
+Copy from your ONNX Runtime build/package output (`<ORT_PACKAGE_ROOT>`) to KataGo:
+* `<ORT_PACKAGE_ROOT>/include/*` -> `cpp/external/onnxruntime-win-x64-openvino/include/`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.lib`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.dll`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_shared.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_shared.dll`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_openvino.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_openvino.dll`
+
+Optional if present in your ORT output:
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_shared.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_openvino.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/`
+
+### Minimal KataGo Build Commands (Windows, ONNX backend)
+On Windows, `KATAGO_AUTO_FETCH_DEPS=ON` by default, so missing `zlib`, `onnx`, and `protobuf` dependencies are auto-fetched via vcpkg into `cpp/build/deps/vcpkg`.
+
+```
+cmake -S cpp -B cpp/build -G "Visual Studio 18 2026" -A x64 -DUSE_BACKEND=ONNX -DONNXRUNTIME_ROOT=cpp/external/onnxruntime-win-x64-openvino
+cmake --build cpp/build --config Release
+```
+
+If you want to disable auto-fetch and provide dependencies manually:
+* `-DKATAGO_AUTO_FETCH_DEPS=OFF`
+* plus `-DONNX_INCLUDE_DIR=... -DONNX_PROTO_LIB=... -DPROTOBUF_INCLUDE_DIR=... -DPROTOBUF_LIB=... -DZLIB_INCLUDE_DIR=... -DZLIB_LIBRARY=...`
+
+Typical run config for Intel NPU:
+* `onnxProvider = openvino`
+* `onnxOpenVINODeviceType = NPU`
+* `onnxOpenVINOEnableNPUFastCompile = true` (optional; may be ignored on ORT builds that do not support this key)
+
+Multi-device assignment is mainly for `onnxProvider=cuda/tensorrt/migraphx` (`onnxDeviceToUseThread*`).
+For `onnxProvider=openvino` on Intel NPU, a single device is typically used.
+
 ## Windows
    * TLDR:
       * Building from source on Windows is actually a bit tricky, depending on what version you're building, there's not necessarily a super-fast way.
@@ -65,13 +120,8 @@ As also mentioned in the instructions below but repeated here for visibility, if
       * If using the CUDA backend, CUDA 11 or later and a compatible version of CUDNN based on your CUDA version (https://developer.nvidia.com/cuda-toolkit) (https://developer.nvidia.com/cudnn) and a GPU capable of supporting them. I'm unsure how version compatibility works with CUDA, there's a good chance that later versions than these work just as well, but they have not been tested.
       * If using the TensorRT backend, in addition to a compatible CUDA Toolkit (https://developer.nvidia.com/cuda-toolkit), you also need TensorRT (https://developer.nvidia.com/tensorrt) that is at least version 8.5.
       * If using the Eigen backend, Eigen3, version 3.3.x. (http://eigen.tuxfamily.org/index.php?title=Main_Page#Download).
-      * zlib. Easy way to build zlib on Windows is to use vcpkg. Run in Powershell:
-         * git clone https://github.com/microsoft/vcpkg.git
-         * cd .\vcpkg\
-         * .\bootstrap-vcpkg.bat
-         * .\vcpkg.exe install zlib:x64-windows
-         * Set CMake ZLIB_LIBRARY to vcpkg\installed\x64-windows\lib\zlib.lib and ZLIB_INCLUDE_DIRECTORY to vcpkg\installed\x64-windows\include.
-         * Copy zlib1.dll from vcpkg\installed\x64-windows\bin to Katago folder after you've built Katago executable.
+      * If using the ONNX backend, ONNX Runtime package (headers + import libs + runtime DLLs).
+      * On Windows, missing `zlib` and ONNX model-conversion dependencies (`onnx`, `protobuf`) can be auto-fetched by CMake into `cpp/build/deps/vcpkg` (default `KATAGO_AUTO_FETCH_DEPS=ON`).
       * libzip (optional, needed only for self-play training) - for example https://github.com/kiyolee/libzip-win-build
       * For MinGW it's recommended to use [MSYS2](https://www.msys2.org/) building platform to get necessary zlib and libzip dependencies:
         * Install MSYS2 according to the instruction on the official site
@@ -98,7 +148,7 @@ As also mentioned in the instructions below but repeated here for visibility, if
           -DLIBZIP_INCLUDE_DIR_ZIPCONF:PATH="C:/msys64/mingw64/include"
           -DLIBZIP_LIBRARY:FILEPATH="C:/msys64/mingw64/lib/libzip.dll.a"
           ```
-      * Also set `USE_BACKEND` to `OPENCL`, or `CUDA`, or `TENSORRT`, or `EIGEN` depending on what backend you want to use.
+      * Also set `USE_BACKEND` to `OPENCL`, or `CUDA`, or `TENSORRT`, or `EIGEN`, or `ROCM`, or `ONNX` depending on what backend you want to use.
       * Set any other options you want and re-run "Configure" again as needed after setting them. Such as:
          * `NO_GIT_REVISION` if you don't have Git or if cmake is not finding it.
          * `NO_LIBZIP` if you don't care about running self-play training and you don't have libzip.
diff --git a/README.md b/README.md
index 768e40838..04560bb7a 100644
--- a/README.md
+++ b/README.md
@@ -8,8 +8,9 @@
     - [GUIs](#guis)
     - [Windows and Linux](#windows-and-linux)
     - [MacOS](#macos)
-    - [OpenCL vs CUDA vs TensorRT vs ROCm vs Eigen](#opencl-vs-cuda-vs-tensorrt-vs-rocm-vs-eigen)
+    - [OpenCL vs CUDA vs TensorRT vs ROCm vs Eigen vs ONNX](#opencl-vs-cuda-vs-tensorrt-vs-rocm-vs-eigen-vs-onnx)
     - [How To Use](#how-to-use)
+      - [ONNX/OpenVINO Intel NPU Quick Start (Windows)](#onnxopenvino-intel-npu-quick-start-windows)
       - [Human-style Play and Analysis](#human-style-play-and-analysis)
       - [Other Commands:](#other-commands)
     - [Tuning for Performance](#tuning-for-performance)
@@ -87,8 +88,8 @@ The community also provides KataGo packages for [Homebrew](https://brew.sh) on M
 
 Use `brew install katago`. The latest config files and networks are installed in KataGo's `share` directory. Find them via `brew list --verbose katago`. A basic way to run katago will be `katago gtp -config $(brew list --verbose katago | grep 'gtp.*\.cfg') -model $(brew list --verbose katago | grep .gz | head -1)`. You should choose the Network according to the release notes here and customize the provided example config as with every other way of installing KataGo.
 
-### OpenCL vs CUDA vs TensorRT vs ROCm vs Eigen
-KataGo has five backends, OpenCL (GPU), CUDA (GPU), TensorRT (GPU), ROCm (GPU) and Eigen (CPU).
+### OpenCL vs CUDA vs TensorRT vs ROCm vs Eigen vs ONNX
+KataGo has six backends, OpenCL (GPU), CUDA (GPU), TensorRT (GPU), ROCm (GPU), Eigen (CPU), and ONNX (CPU/GPU/NPU via providers).
 
 The quick summary is:
   * **To easily get something working, try OpenCL if you have any good or decent GPU.**
@@ -97,6 +98,7 @@ The quick summary is:
   * Use Eigen without AVX2 if your CPU is old or on a low-end device that doesn't support AVX2.
   * The CUDA backend can work for NVIDIA GPUs with CUDA+CUDNN installed but is likely worse than TensorRT.
   * The ROCm backend can work for AMD GPUs with ROCm+MIOpen installed.
+  * ONNX backend uses ONNX Runtime execution providers (CPU/OpenVINO/CUDA/TensorRT/MIGraphX/CoreML). It is useful for Intel NPU (OpenVINO) and raw `.onnx` models.
 
 More in detail:
   * OpenCL is a general GPU backend should be able to run with any GPUs or accelerators that support [OpenCL](https://en.wikipedia.org/wiki/OpenCL), including NVIDIA GPUs, AMD GPUs, as well CPU-based OpenCL implementations or things like Intel Integrated Graphics. This is the most general GPU version of KataGo and doesn't require a complicated install like CUDA does, so is most likely to work out of the box as long as you have a fairly modern GPU. **However, it also need to take some time when run for the very first time to tune itself.** For many systems, this will take 5-30 seconds, but on a few older/slower systems, may take many minutes or longer. Also, the quality of OpenCL implementations is sometimes inconsistent, particularly for Intel Integrated Graphics and for AMD GPUs that are older than several years, so it might not work for very old machines, as well as specific buggy newer AMD GPUs, see also [Issues with specific GPUs or GPU drivers](#issues-with-specific-gpus-or-gpu-drivers).
@@ -104,6 +106,7 @@ More in detail:
   * TensorRT is similar to CUDA, but only uses NVIDIA's TensorRT framework to run the neural network with more optimized kernels. For modern NVIDIA GPUs, it should work whenever CUDA does and will usually be faster than CUDA or any other backend.
   * ROCm is a GPU backend specific to AMD GPUs (it will not work with NVIDIA or Intel or any other GPUs) and requires installing [ROCm](https://rocm.docs.amd.com) and [MIOpen](https://rocm.docs.amd.com/projects/MIOpen) and a modern AMD GPU. On most GPUs, the OpenCL implementation will actually beat AMD's own ROCm/MIOpen at performance. The exception is for top-end AMD GPUs that support FP16 and stream processors, in which case sometimes one is better and sometimes the other is better.
   * Eigen is a *CPU* backend that should work widely *without* needing a GPU or fancy drivers. Use this if you don't have a good GPU or really any GPU at all. It will be quite significantly slower than OpenCL or CUDA, but on a good CPU can still often get 10 to 20 playouts per second if using the smaller (15 or 20) block neural nets. Eigen can also be compiled with AVX2 and FMA support, which can provide a big performance boost for Intel and AMD CPUs from the last few years. However, it will not run at all on older CPUs (and possibly even some recent but low-power modern CPUs) that don't support these fancy vector instructions.
+  * ONNX backend uses [ONNX Runtime](https://onnxruntime.ai/). It can use CPU by default, OpenVINO for Intel hardware (including NPU on supported systems), CUDA/TensorRT for NVIDIA GPUs, MIGraphX for AMD GPUs, and CoreML on macOS. Multi-device assignment via `onnxDeviceToUseThread*` is mainly for CUDA/TensorRT/MIGraphX providers, while OpenVINO NPU setups are typically single-device.
 
 For **any** implementation, it's recommended that you also tune the number of threads used if you care about optimal performance, as it can make a factor of 2-3 difference in the speed. See "Tuning for Performance" below. However, if you mostly just want to get it working, then the default untuned settings should also be still reasonable.
 
@@ -137,6 +140,29 @@ path/to/katago.exe gtp -model path/to/<NEURALNET>.bin.gz
 path/to/katago.exe gtp -model path/to/<NEURALNET>.bin.gz -config path/to/gtp_custom.cfg
 ```
 
+#### ONNX/OpenVINO Intel NPU Quick Start (Windows)
+
+If you want to use ONNX Runtime + OpenVINO on Intel NPU:
+* Install Intel NPU driver: https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html
+* Install OpenVINO archive package (Windows): https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html
+* Build ONNX Runtime with OpenVINO EP for NPU (`use_openvino=NPU`): https://onnxruntime.ai/docs/build/eps.html#openvino
+* See [Compiling.md](Compiling.md) for the full build/copy steps for `cpp/external/onnxruntime-win-x64-openvino`.
+
+Minimal commands:
+```
+# 1) Export .bin/.bin.gz to ONNX (default export size is 19x19)
+./katago.exe exportonnx -model <NEURALNET>.bin.gz -output <NEURALNET>.19x19.onnx
+
+# 2) Benchmark on Intel NPU (OpenVINO provider)
+./katago.exe benchmark -config cpp/configs/gtp_example.cfg -model <NEURALNET>.19x19.onnx -visits 32 -threads 1 -n 2 -override-config onnxProvider=openvino,onnxOpenVINODeviceType=NPU,numSearchThreads=1,numNNServerThreadsPerModel=1
+
+# 3) Run GTP for GUI tools (Sabaki/Lizzie/q5Go/etc)
+./katago.exe gtp -config cpp/configs/gtp_example.cfg -model <NEURALNET>.19x19.onnx
+
+If you don't prepare config file, then use -override-config args, like:
+./katago.exe gtp -config cpp/configs/gtp_example.cfg -model <NEURALNET>.19x19.onnx -override-config onnxProvider=openvino,onnxOpenVINODeviceType=NPU
+```
+
 #### Human-style Play and Analysis
 
 You can also have KataGo imitate human play if you download the human SL model b18c384nbt-humanv0.bin.gz from https://github.com/lightvector/KataGo/releases/tag/v1.15.0, and run a command like the following, providing both the normal model and the human SL model:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ddf9f11b1..b30bb964a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -48,7 +48,7 @@ endif()
 set(BUILD_DISTRIBUTED 0 CACHE BOOL "Build with http support for contributing to distributed training")
 set(USE_BACKEND CACHE STRING "Neural net backend")
 string(TOUPPER "${USE_BACKEND}" USE_BACKEND)
-set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN ROCM)
+set_property(CACHE USE_BACKEND PROPERTY STRINGS "" CUDA TENSORRT OPENCL EIGEN ROCM ONNX)
 
 set(USE_TCMALLOC 0 CACHE BOOL "Use TCMalloc")
 set(NO_GIT_REVISION 0 CACHE BOOL "Disable embedding the git revision into the compiled exe")
@@ -58,6 +58,77 @@ set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 50. Compil
 set(USE_CACHE_TENSORRT_PLAN 0 CACHE BOOL "Use TENSORRT plan cache. May use a lot of disk space. Only applies when USE_BACKEND is TENSORRT.")
 mark_as_advanced(USE_CACHE_TENSORRT_PLAN)
 
+if(WIN32)
+  set(_katago_auto_fetch_default ON)
+else()
+  set(_katago_auto_fetch_default OFF)
+endif()
+option(KATAGO_AUTO_FETCH_DEPS "Automatically fetch missing dependencies into build/deps (Windows uses vcpkg)." ${_katago_auto_fetch_default})
+set(KATAGO_DEPS_DIR "${CMAKE_SOURCE_DIR}/build/deps" CACHE PATH "Directory for auto-fetched third-party dependencies")
+set(KATAGO_VCPKG_TRIPLET "x64-windows" CACHE STRING "vcpkg triplet used by KATAGO_AUTO_FETCH_DEPS on Windows")
+set(KATAGO_VCPKG_ROOT "${KATAGO_DEPS_DIR}/vcpkg" CACHE PATH "Path to local vcpkg clone used by KATAGO_AUTO_FETCH_DEPS")
+mark_as_advanced(KATAGO_VCPKG_TRIPLET KATAGO_VCPKG_ROOT)
+
+function(katago_vcpkg_bootstrap_if_needed)
+  if(NOT WIN32)
+    message(FATAL_ERROR "katago_vcpkg_bootstrap_if_needed is only supported on Windows")
+  endif()
+
+  if(NOT KATAGO_AUTO_FETCH_DEPS)
+    message(FATAL_ERROR "KATAGO_AUTO_FETCH_DEPS is OFF, cannot auto-fetch missing dependency")
+  endif()
+
+  file(MAKE_DIRECTORY "${KATAGO_DEPS_DIR}")
+
+  if(NOT EXISTS "${KATAGO_VCPKG_ROOT}/vcpkg.exe")
+    if(NOT EXISTS "${KATAGO_VCPKG_ROOT}/.git")
+      find_package(Git QUIET)
+      if(NOT GIT_FOUND)
+        message(FATAL_ERROR "KATAGO_AUTO_FETCH_DEPS requires git to clone vcpkg")
+      endif()
+      message(STATUS "Auto-fetch deps: cloning vcpkg into ${KATAGO_VCPKG_ROOT}")
+      execute_process(
+        COMMAND "${GIT_EXECUTABLE}" clone --depth=1 https://github.com/microsoft/vcpkg.git "${KATAGO_VCPKG_ROOT}"
+        RESULT_VARIABLE _clone_result
+        OUTPUT_VARIABLE _clone_out
+        ERROR_VARIABLE _clone_err
+      )
+      if(NOT _clone_result EQUAL 0)
+        message(FATAL_ERROR "Failed to clone vcpkg.\n${_clone_out}\n${_clone_err}")
+      endif()
+    endif()
+
+    message(STATUS "Auto-fetch deps: bootstrapping vcpkg")
+    execute_process(
+      COMMAND "${KATAGO_VCPKG_ROOT}/bootstrap-vcpkg.bat" -disableMetrics
+      WORKING_DIRECTORY "${KATAGO_VCPKG_ROOT}"
+      RESULT_VARIABLE _bootstrap_result
+    )
+    if(NOT _bootstrap_result EQUAL 0)
+      message(FATAL_ERROR "Failed to bootstrap vcpkg")
+    endif()
+  endif()
+endfunction()
+
+function(katago_vcpkg_install_if_needed package_name)
+  if(NOT WIN32)
+    message(FATAL_ERROR "katago_vcpkg_install_if_needed is only supported on Windows")
+  endif()
+
+  katago_vcpkg_bootstrap_if_needed()
+
+  set(_spec "${package_name}:${KATAGO_VCPKG_TRIPLET}")
+  message(STATUS "Auto-fetch deps: ensuring ${_spec} via vcpkg")
+  execute_process(
+    COMMAND "${KATAGO_VCPKG_ROOT}/vcpkg.exe" install "${_spec}" --disable-metrics
+    WORKING_DIRECTORY "${KATAGO_VCPKG_ROOT}"
+    RESULT_VARIABLE _install_result
+  )
+  if(NOT _install_result EQUAL 0)
+    message(FATAL_ERROR "Failed to install ${_spec} via vcpkg")
+  endif()
+endfunction()
+
 #--------------------------- NEURAL NET BACKEND ------------------------------------------------------------------------
 
 message(STATUS "Building 'katago' executable for GTP engine and other tools.")
@@ -161,6 +232,12 @@ elseif(USE_BACKEND STREQUAL "EIGEN")
   set(NEURALNET_BACKEND_SOURCES
     neuralnet/eigenbackend.cpp
     )
+elseif(USE_BACKEND STREQUAL "ONNX")
+  message(STATUS "-DUSE_BACKEND=ONNX, using ONNX Runtime backend.")
+  set(NEURALNET_BACKEND_SOURCES
+    neuralnet/onnxbackend.cpp
+    neuralnet/onnxmodelbuilder.cpp
+    )
 # --------------------------- ROCM backend（AMD GPU / HIP  MIOpen） ---------------------------
 elseif(USE_BACKEND STREQUAL "ROCM")
   message(STATUS "-DUSE_BACKEND=ROCM, using AMD ROCm backend.")
@@ -207,7 +284,7 @@ elseif(USE_BACKEND STREQUAL "ROCM")
   # add_compile_definitions(HIP_SUPPORTS_FP16)
 
 elseif(USE_BACKEND STREQUAL "")
-  message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN to compile with the respective backend.${ColorReset}")
+  message(WARNING "${ColorBoldRed}WARNING: Using dummy neural net backend, intended for non-neural-net testing only, will fail on any code path requiring a neural net. To use neural net, specify -DUSE_BACKEND=CUDA or -DUSE_BACKEND=TENSORRT or -DUSE_BACKEND=OPENCL or -DUSE_BACKEND=EIGEN or -DUSE_BACKEND=ONNX to compile with the respective backend.${ColorReset}")
   set(NEURALNET_BACKEND_SOURCES neuralnet/dummybackend.cpp)
 else()
   message(FATAL_ERROR "Unrecognized backend: " ${USE_BACKEND})
@@ -600,6 +677,107 @@ elseif(USE_BACKEND STREQUAL "ROCM")
     roc::hipblas         # BLAS
     ${_miopen_target}    # DNN primitives
   )
+elseif(USE_BACKEND STREQUAL "ONNX")
+  target_compile_definitions(katago PRIVATE USE_ONNX_BACKEND)
+
+  if(WIN32)
+    set(_onnx_default_root "${CMAKE_CURRENT_SOURCE_DIR}/external/onnxruntime-win-x64-openvino")
+  else()
+    set(_onnx_default_root "")
+  endif()
+  set(ONNXRUNTIME_ROOT "${_onnx_default_root}" CACHE PATH "Path to ONNX Runtime package root")
+
+  if(NOT IS_DIRECTORY "${ONNXRUNTIME_ROOT}")
+    message(FATAL_ERROR "ONNXRUNTIME_ROOT does not exist: ${ONNXRUNTIME_ROOT}")
+  endif()
+
+  set(ONNXRUNTIME_INCLUDE_DIR "${ONNXRUNTIME_ROOT}/include")
+  if(NOT IS_DIRECTORY "${ONNXRUNTIME_INCLUDE_DIR}")
+    message(FATAL_ERROR "ONNX Runtime include directory not found: ${ONNXRUNTIME_INCLUDE_DIR}")
+  endif()
+  target_include_directories(katago SYSTEM PRIVATE "${ONNXRUNTIME_INCLUDE_DIR}")
+
+  if(WIN32)
+    set(ONNXRUNTIME_LIB "${ONNXRUNTIME_ROOT}/lib/onnxruntime.lib")
+    file(GLOB ONNXRUNTIME_DLLS "${ONNXRUNTIME_ROOT}/lib/*.dll")
+  else()
+    find_library(ONNXRUNTIME_LIB onnxruntime HINTS "${ONNXRUNTIME_ROOT}/lib" "${ONNXRUNTIME_ROOT}")
+  endif()
+  if(NOT EXISTS "${ONNXRUNTIME_LIB}" AND NOT ONNXRUNTIME_LIB)
+    message(FATAL_ERROR "Could not find onnxruntime library under ${ONNXRUNTIME_ROOT}")
+  endif()
+  target_link_libraries(katago ${ONNXRUNTIME_LIB})
+
+  # Required by onnxmodelbuilder.cpp for building ONNX graphs from .bin.gz.
+  # These are intentionally configurable because package layouts vary.
+  set(ONNX_INCLUDE_DIR "" CACHE PATH "Directory containing onnx/onnx-ml.pb.h (required for .bin.gz -> ONNX conversion)")
+  set(ONNX_PROTO_LIB "" CACHE FILEPATH "Path to onnx_proto library (required for .bin.gz -> ONNX conversion)")
+  set(PROTOBUF_INCLUDE_DIR "" CACHE PATH "Directory containing google/protobuf/message.h (required for .bin.gz -> ONNX conversion)")
+  set(PROTOBUF_LIB "" CACHE FILEPATH "Path to protobuf library (protobuf-lite or libprotobuf, required for .bin.gz -> ONNX conversion)")
+  mark_as_advanced(CLEAR ONNX_INCLUDE_DIR ONNX_PROTO_LIB PROTOBUF_INCLUDE_DIR PROTOBUF_LIB)
+
+  # Backward compatibility with older cache variable name.
+  if(NOT PROTOBUF_LIB AND PROTOBUF_LITE_LIB)
+    set(PROTOBUF_LIB "${PROTOBUF_LITE_LIB}")
+  endif()
+
+  if(WIN32 AND KATAGO_AUTO_FETCH_DEPS)
+    set(_need_onnx_proto_deps FALSE)
+    if(NOT ONNX_INCLUDE_DIR OR NOT ONNX_PROTO_LIB OR NOT PROTOBUF_INCLUDE_DIR OR NOT PROTOBUF_LIB)
+      set(_need_onnx_proto_deps TRUE)
+    endif()
+    if(_need_onnx_proto_deps)
+      katago_vcpkg_install_if_needed("onnx")
+      katago_vcpkg_install_if_needed("protobuf")
+      set(_katago_vcpkg_installed_root "${KATAGO_VCPKG_ROOT}/installed/${KATAGO_VCPKG_TRIPLET}")
+      if(EXISTS "${_katago_vcpkg_installed_root}/include/onnx/onnx-ml.pb.h")
+        set(ONNX_INCLUDE_DIR "${_katago_vcpkg_installed_root}/include" CACHE PATH "Directory containing onnx/onnx-ml.pb.h (required for .bin.gz -> ONNX conversion)" FORCE)
+      endif()
+      if(EXISTS "${_katago_vcpkg_installed_root}/lib/onnx_proto.lib")
+        set(ONNX_PROTO_LIB "${_katago_vcpkg_installed_root}/lib/onnx_proto.lib" CACHE FILEPATH "Path to onnx_proto library (required for .bin.gz -> ONNX conversion)" FORCE)
+      endif()
+      if(EXISTS "${_katago_vcpkg_installed_root}/include/google/protobuf/message.h")
+        set(PROTOBUF_INCLUDE_DIR "${_katago_vcpkg_installed_root}/include" CACHE PATH "Directory containing google/protobuf/message.h (required for .bin.gz -> ONNX conversion)" FORCE)
+      endif()
+      if(EXISTS "${_katago_vcpkg_installed_root}/lib/libprotobuf-lite.lib")
+        set(PROTOBUF_LIB "${_katago_vcpkg_installed_root}/lib/libprotobuf-lite.lib" CACHE FILEPATH "Path to protobuf library (protobuf-lite or libprotobuf, required for .bin.gz -> ONNX conversion)" FORCE)
+      elseif(EXISTS "${_katago_vcpkg_installed_root}/lib/libprotobuf.lib")
+        set(PROTOBUF_LIB "${_katago_vcpkg_installed_root}/lib/libprotobuf.lib" CACHE FILEPATH "Path to protobuf library (protobuf-lite or libprotobuf, required for .bin.gz -> ONNX conversion)" FORCE)
+      endif()
+    endif()
+  endif()
+
+  if(NOT ONNX_INCLUDE_DIR)
+    find_path(ONNX_INCLUDE_DIR onnx/onnx-ml.pb.h)
+  endif()
+  if(NOT ONNX_PROTO_LIB)
+    find_library(ONNX_PROTO_LIB onnx_proto)
+  endif()
+  if(NOT PROTOBUF_INCLUDE_DIR)
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h)
+  endif()
+  if(NOT PROTOBUF_LIB)
+    find_library(PROTOBUF_LIB protobuf-lite protobuf-lite32 libprotobuf protobuf)
+  endif()
+
+  if(NOT ONNX_INCLUDE_DIR OR NOT ONNX_PROTO_LIB OR NOT PROTOBUF_INCLUDE_DIR OR NOT PROTOBUF_LIB)
+    message(FATAL_ERROR
+      "ONNX backend requires ONNX protobuf dependencies for .bin.gz model conversion. "
+      "Set ONNX_INCLUDE_DIR (contains onnx/onnx-ml.pb.h), ONNX_PROTO_LIB, PROTOBUF_INCLUDE_DIR, and PROTOBUF_LIB.")
+  endif()
+  target_include_directories(katago SYSTEM PRIVATE "${ONNX_INCLUDE_DIR}")
+  target_include_directories(katago SYSTEM PRIVATE "${PROTOBUF_INCLUDE_DIR}")
+  target_link_libraries(katago ${ONNX_PROTO_LIB} ${PROTOBUF_LIB})
+
+  if(WIN32 AND ONNXRUNTIME_DLLS)
+    foreach(_onnxruntime_dll IN LISTS ONNXRUNTIME_DLLS)
+      add_custom_command(TARGET katago POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different
+          "${_onnxruntime_dll}"
+          $<TARGET_FILE_DIR:katago>
+      )
+    endforeach()
+  endif()
 elseif(USE_BACKEND STREQUAL "EIGEN")
   target_compile_definitions(katago PRIVATE USE_EIGEN_BACKEND)
   if(NOT (MSVC))
@@ -631,6 +809,23 @@ if(NO_GIT_REVISION AND (NOT BUILD_DISTRIBUTED))
   target_compile_definitions(katago PRIVATE NO_GIT_REVISION)
 endif()
 
+if(WIN32 AND KATAGO_AUTO_FETCH_DEPS)
+  set(_need_zlib_deps FALSE)
+  if(NOT ZLIB_INCLUDE_DIR OR NOT ZLIB_LIBRARY)
+    set(_need_zlib_deps TRUE)
+  endif()
+  if(_need_zlib_deps)
+    katago_vcpkg_install_if_needed("zlib")
+    set(_katago_vcpkg_installed_root "${KATAGO_VCPKG_ROOT}/installed/${KATAGO_VCPKG_TRIPLET}")
+    if(EXISTS "${_katago_vcpkg_installed_root}/include/zlib.h")
+      set(ZLIB_INCLUDE_DIR "${_katago_vcpkg_installed_root}/include" CACHE PATH "Path to directory with zlib.h and other header files" FORCE)
+    endif()
+    if(EXISTS "${_katago_vcpkg_installed_root}/lib/zlib.lib")
+      set(ZLIB_LIBRARY "${_katago_vcpkg_installed_root}/lib/zlib.lib" CACHE FILEPATH "Path to 'libz.so' on Linux or 'libz.lib' on Windows" FORCE)
+    endif()
+  endif()
+endif()
+
 find_package(ZLIB)
 if(ZLIB_FOUND)
   include_directories(${ZLIB_INCLUDE_DIRS})
diff --git a/cpp/README.md b/cpp/README.md
index 7376c6b7d..6f5f95ad7 100644
--- a/cpp/README.md
+++ b/cpp/README.md
@@ -9,13 +9,14 @@ Summary of source folders, in approximate dependency order, from lowest level to
   * `board.{cpp,h}` - Raw board implementation, without move history. Helper functions for Benson's algorithm and ladder search.
   * `boardhistory.{cpp,h}` - Datastructure that does include move history - handles superko, passing, game end, final scoring, komi, handicap detection, etc.
   * `graphhash.{cpp,h}` - History-sensitive hash used for [monte-carlo graph search](https://github.com/lightvector/KataGo/blob/master/docs/GraphSearch.md).
-* `neuralnet` - Neural net GPU implementation and interface. Contains OpenCL, CUDA, Eigen, TensorRT backends along with common interfaces and model data structures.
+* `neuralnet` - Neural net GPU implementation and interface. Contains OpenCL, CUDA, TensorRT, ROCm, Metal, Eigen, and ONNX backends along with common interfaces and model data structures.
   * `desc.{cpp,h}` - Data structure holding neural net structure and weights.
   * `modelversion.{cpp,h}` - Enumerates the various versions of neural net features and models.
   * `nninputs.{cpp,h}` - Implements the input features for the neural net.
   * `sgfmetadata.{cpp,h}` - Implements the input features for the [HumanSL neural net](https://github.com/lightvector/KataGo/blob/master/docs/Analysis_Engine.md#human-sl-analysis-guide), for conditioning on various SGF metadata about human players from training data.
   * `nninterface.h` - Common interface that is implemented by every low-level neural net backend.
-  * `{cuda,opencl,eigen,trt,rocm,metal,dummy}backend.cpp` - Various backends.
+  * `{cuda,opencl,eigen,trt,rocm,metal,onnx,dummy}backend.cpp` - Various backends.
+  * `onnxmodelbuilder.{cpp,h}` - Builds ONNX graphs from KataGo model weights for ONNX Runtime.
   * `nneval.{cpp,h}` - Top-level handle to the neural net used by the rest of the engine, implements thread-safe batching of queries.
 * `search` - The main search engine.
   * `timecontrols.cpp` - Basic handling of a few possible time controls.
diff --git a/cpp/command/benchmark.cpp b/cpp/command/benchmark.cpp
index 92f44aae9..63ad26e57 100644
--- a/cpp/command/benchmark.cpp
+++ b/cpp/command/benchmark.cpp
@@ -272,6 +272,21 @@ int MainCmds::benchmark(const vector<string>& args) {
 #endif
 #ifdef USE_EIGEN_BACKEND
   cout << "You are currently using the Eigen (CPU) version of KataGo. Due to having no GPU, it may be slow." << endl;
+#endif
+#ifdef USE_ONNX_BACKEND
+  string onnxProvider = cfg.contains("onnxProvider") ? cfg.getString("onnxProvider") : "cpu";
+  string onnxProviderLower = Global::toLower(onnxProvider);
+  cout << "You are currently using the ONNX Runtime version of KataGo." << endl;
+  cout << "Your GTP config is currently set to onnxProvider = " << onnxProvider << endl;
+  if(onnxProviderLower == "openvino") {
+    string deviceType = cfg.contains("onnxOpenVINODeviceType") ? cfg.getString("onnxOpenVINODeviceType") : "CPU";
+    cout << "OpenVINO device type = " << deviceType << endl;
+    cout << "For Intel NPU, typically set onnxOpenVINODeviceType = NPU." << endl;
+    cout << "OpenVINO/NPU usually uses a single device; onnxDeviceToUseThread* is typically for cuda/trt/migraphx providers." << endl;
+  }
+  else if(onnxProviderLower == "cuda" || onnxProviderLower == "tensorrt" || onnxProviderLower == "migraphx") {
+    cout << "For ONNX Runtime multi-GPU, use numNNServerThreadsPerModel + onnxDeviceToUseThreadX." << endl;
+  }
 #endif
   cout << endl;
   cout << "Your GTP config is currently set to use numSearchThreads = " << params.numThreads << endl;
@@ -638,6 +653,9 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
   int configNNCacheSizePowerOfTwo = 20;
   int configNNMutexPoolSizePowerOfTwo = 16;
   int configNumSearchThreads = 6;
+#ifdef USE_ONNX_BACKEND
+  string configOnnxProvider = "openvino";
+#endif
 
   cout << endl;
   cout << "=========================================================================" << endl;
@@ -763,30 +781,72 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
       });
   }
 
+#ifdef USE_ONNX_BACKEND
+  {
+    cout << endl;
+    string prompt =
+      "Select ONNX Runtime execution provider in the generated config\n"
+      "(cpu, openvino, cuda, tensorrt, migraphx, coreml), default openvino:\n";
+    promptAndParseInput(prompt, [&](const string& line) {
+        string provider = Global::toLower(Global::trim(line));
+        if(provider == "")
+          provider = "openvino";
+        if(
+          provider != "cpu" &&
+          provider != "openvino" &&
+          provider != "cuda" &&
+          provider != "tensorrt" &&
+          provider != "migraphx" &&
+          provider != "coreml"
+        )
+          throw StringError("Must be one of: cpu, openvino, cuda, tensorrt, migraphx, coreml");
+        configOnnxProvider = provider;
+      });
+  }
+#endif
+
   cout << endl;
   cout << "=========================================================================" << endl;
   cout << "GPUS AND RAM" << endl;
 
 #ifndef USE_EIGEN_BACKEND
   {
-    cout << endl;
-    cout << "Finding available GPU-like devices..." << endl;
-    NeuralNet::printDevices();
-    cout << endl;
+    bool askForDeviceIdxs = true;
+#ifdef USE_ONNX_BACKEND
+    bool onnxProviderSupportsThreadDeviceMap =
+      configOnnxProvider == "cuda" ||
+      configOnnxProvider == "tensorrt" ||
+      configOnnxProvider == "migraphx";
+    askForDeviceIdxs = onnxProviderSupportsThreadDeviceMap;
+#endif
+    if(askForDeviceIdxs) {
+      cout << endl;
+      cout << "Finding available GPU-like devices..." << endl;
+      NeuralNet::printDevices();
+      cout << endl;
 
-    string prompt =
-      "Specify devices/GPUs to use (for example \"0,1,2\" to use devices 0, 1, and 2). Leave blank for a default SINGLE-GPU config:\n";
-    promptAndParseInput(prompt, [&](const string& line) {
-        vector<string> pieces = Global::split(line,',');
-        configDeviceIdxs.clear();
-        for(size_t i = 0; i<pieces.size(); i++) {
-          string piece = Global::trim(pieces[i]);
-          int idx = Global::stringToInt(piece);
-          if(idx < 0 || idx > 10000)
-            throw StringError("Invalid device idx: " + Global::intToString(idx));
-          configDeviceIdxs.push_back(idx);
-        }
-      });
+      string prompt =
+        "Specify devices/GPUs to use (for example \"0,1,2\" to use devices 0, 1, and 2). Leave blank for a default SINGLE-GPU config:\n";
+      promptAndParseInput(prompt, [&](const string& line) {
+          vector<string> pieces = Global::split(line,',');
+          configDeviceIdxs.clear();
+          for(size_t i = 0; i<pieces.size(); i++) {
+            string piece = Global::trim(pieces[i]);
+            int idx = Global::stringToInt(piece);
+            if(idx < 0 || idx > 10000)
+              throw StringError("Invalid device idx: " + Global::intToString(idx));
+            configDeviceIdxs.push_back(idx);
+          }
+        });
+    }
+#ifdef USE_ONNX_BACKEND
+    else {
+      cout << endl;
+      cout << "onnxProvider = " << configOnnxProvider << " selected." << endl;
+      cout << "Skipping per-thread multi-device mapping (mainly used by cuda/tensorrt/migraphx providers)." << endl;
+      configDeviceIdxs.clear();
+    }
+#endif
   }
 #endif
 
@@ -863,6 +923,9 @@ int MainCmds::genconfig(const vector<string>& args, const string& firstCommand)
       configNNCacheSizePowerOfTwo,
       configNNMutexPoolSizePowerOfTwo,
       configNumSearchThreads
+#ifdef USE_ONNX_BACKEND
+      ,configOnnxProvider
+#endif
     );
   };
   updateConfigContents();
diff --git a/cpp/command/misc.cpp b/cpp/command/misc.cpp
index 001bc5b3b..31ca899fc 100644
--- a/cpp/command/misc.cpp
+++ b/cpp/command/misc.cpp
@@ -13,6 +13,10 @@
 #include "../program/setup.h"
 #include "../program/playutils.h"
 #include "../program/play.h"
+#include "../neuralnet/nninterface.h"
+#ifdef USE_ONNX_BACKEND
+#include "../neuralnet/onnxmodelbuilder.h"
+#endif
 #include "../command/commandline.h"
 #include "../tests/tests.h"
 #include "../main.h"
@@ -20,6 +24,7 @@
 #include <chrono>
 #include <csignal>
 #include <cmath>
+#include <memory>
 
 using namespace std;
 
@@ -35,6 +40,58 @@ int MainCmds::printclockinfo(const vector<string>& args) {
   return 0;
 }
 
+int MainCmds::exportonnx(const vector<string>& args) {
+#ifndef USE_ONNX_BACKEND
+  (void)args;
+  cerr << "exportonnx is only available in ONNX backend builds (USE_BACKEND=ONNX)." << endl;
+  return 1;
+#else
+  string modelFile;
+  string outputFile;
+  int nnXLen;
+  int nnYLen;
+  try {
+    KataGoCommandLine cmd("Export KataGo .bin/.bin.gz model to ONNX file.");
+    cmd.addModelFileArg();
+    TCLAP::ValueArg<string> outputArg("o","output","Output ONNX file path",true,string(),"FILE");
+    TCLAP::ValueArg<int> xLenArg("x","xlen","Board x size baked into exported model",false,19,"N");
+    TCLAP::ValueArg<int> yLenArg("y","ylen","Board y size baked into exported model",false,19,"N");
+    cmd.add(outputArg);
+    cmd.add(xLenArg);
+    cmd.add(yLenArg);
+    cmd.parseArgs(args);
+
+    modelFile = cmd.getModelFile();
+    outputFile = outputArg.getValue();
+    nnXLen = xLenArg.getValue();
+    nnYLen = yLenArg.getValue();
+  }
+  catch(TCLAP::ArgException& e) {
+    cerr << "Error: " << e.error() << " for argument " << e.argId() << endl;
+    return 1;
+  }
+
+  if(nnXLen < 2 || nnXLen > NNPos::MAX_BOARD_LEN || nnYLen < 2 || nnYLen > NNPos::MAX_BOARD_LEN)
+    throw StringError("Invalid board size for exportonnx");
+
+  const string expectedSha256 = "";
+  std::unique_ptr<LoadedModel, void(*)(LoadedModel*)> loadedModel(
+    NeuralNet::loadModelFile(modelFile, expectedSha256),
+    NeuralNet::freeLoadedModel
+  );
+  const ModelDesc& modelDesc = NeuralNet::getModelDesc(loadedModel.get());
+  string onnxBytes = OnnxModelBuilder::buildOnnxModel(modelDesc, nnXLen, nnYLen);
+
+  ofstream out;
+  FileUtils::open(out, outputFile, std::ios::binary | std::ios::out);
+  out.write(onnxBytes.data(), onnxBytes.size());
+  out.close();
+
+  cout << "Exported ONNX model to " << outputFile << " (" << onnxBytes.size() << " bytes)" << endl;
+  return 0;
+#endif
+}
+
 int MainCmds::sampleinitializations(const vector<string>& args) {
   Board::initHash();
   ScoreValue::initTables();
diff --git a/cpp/configs/analysis_example.cfg b/cpp/configs/analysis_example.cfg
index c6ba9825a..a87713482 100644
--- a/cpp/configs/analysis_example.cfg
+++ b/cpp/configs/analysis_example.cfg
@@ -292,6 +292,34 @@ nnRandomize = true
 # openclUseFP16 = auto
 
 
+# ONNX Runtime settings--------------------------------------
+# These only apply when using the ONNX version of KataGo.
+
+# Execution provider: cpu (default), openvino, cuda, tensorrt, migraphx, coreml(macOS only)
+# onnxProvider = openvino
+
+# Multi-device assignment is mainly for onnxProvider = cuda / tensorrt / migraphx:
+# onnxDeviceToUse = 0
+# onnxDeviceToUseThread0 = 0
+# onnxDeviceToUseThread1 = 1
+
+# OpenVINO EP options for Intel NPU (typically single device):
+# onnxOpenVINODeviceType = NPU
+# onnxOpenVINODeviceId = 0
+# onnxOpenVINOEnableNPUFastCompile = true  # may be ignored if unsupported by your ORT/OpenVINO build
+# onnxOpenVINOCacheDir = C:\\temp\\katago_ov_cache
+
+# Optional overrides for raw .onnx I/O tensor names and model version:
+# onnxInputSpatial = input_spatial
+# onnxInputGlobal = input_global
+# onnxInputMeta = input_meta
+# onnxOutputPolicy = out_policy
+# onnxOutputValue = out_value
+# onnxOutputMiscvalue = out_miscvalue
+# onnxOutputOwnership = out_ownership
+# onnxModelVersion = 15
+
+
 # Eigen-specific settings--------------------------------------
 # These only apply when using the Eigen (pure CPU) version of KataGo.
 
diff --git a/cpp/configs/contribute_example.cfg b/cpp/configs/contribute_example.cfg
index fb6f0d81d..ecaac3057 100644
--- a/cpp/configs/contribute_example.cfg
+++ b/cpp/configs/contribute_example.cfg
@@ -157,6 +157,34 @@ watchOngoingGameInFileName = watchgame.txt
 # openclUseFP16 = auto
 
 
+# ONNX Runtime settings--------------------------------------
+# These only apply when using the ONNX version of KataGo.
+
+# Execution provider: cpu (default), openvino, cuda, tensorrt, migraphx, coreml(macOS only)
+# onnxProvider = openvino
+
+# Multi-device assignment (for onnxProvider = cuda / tensorrt / migraphx):
+# onnxDeviceToUse = 0
+# onnxDeviceToUseThread0 = 0
+# onnxDeviceToUseThread1 = 1
+
+# OpenVINO EP options for Intel NPU (typically single device):
+# onnxOpenVINODeviceType = NPU
+# onnxOpenVINODeviceId = 0
+# onnxOpenVINOEnableNPUFastCompile = true  # may be ignored if unsupported by your ORT/OpenVINO build
+# onnxOpenVINOCacheDir = C:\\temp\\katago_ov_cache
+
+# Optional overrides for raw .onnx I/O tensor names and model version:
+# onnxInputSpatial = input_spatial
+# onnxInputGlobal = input_global
+# onnxInputMeta = input_meta
+# onnxOutputPolicy = out_policy
+# onnxOutputValue = out_value
+# onnxOutputMiscvalue = out_miscvalue
+# onnxOutputOwnership = out_ownership
+# onnxModelVersion = 15
+
+
 # Eigen-specific settings--------------------------------------
 # These only apply when using the Eigen (pure CPU) version of KataGo.
 
diff --git a/cpp/configs/gtp_example.cfg b/cpp/configs/gtp_example.cfg
index a860d6dfc..9787e11e8 100644
--- a/cpp/configs/gtp_example.cfg
+++ b/cpp/configs/gtp_example.cfg
@@ -543,6 +543,43 @@ searchFactorWhenWinningThreshold = 0.95
 # Default: numSearchThreads
 # numEigenThreadsPerModel = X
 
+# ------------------------------
+# ONNX backend settings
+# ------------------------------
+# These only apply when using the ONNX version of KataGo.
+
+# Execution provider:
+# cpu (default), openvino, cuda, tensorrt, migraphx, coreml(macOS only).
+# onnxProvider = cpu
+
+# Provider-specific device selection for multi-server-thread setups.
+# Primarily for onnxProvider = cuda / tensorrt / migraphx.
+# onnxDeviceToUse = 0
+# onnxDeviceToUseThread0 = 0
+# onnxDeviceToUseThread1 = 1
+
+# OpenVINO EP options (useful for Intel NPU on Windows):
+# NPU, CPU, GPU, AUTO:NPU,CPU, MULTI:NPU.0,NPU.1, etc.
+# onnxOpenVINODeviceType = NPU
+# Optional explicit OpenVINO device id (usually unnecessary for single NPU setups)
+# onnxOpenVINODeviceId = 0
+# Optional fast compile mode for NPU
+# onnxOpenVINOEnableNPUFastCompile = true  # may be ignored if unsupported by your ORT/OpenVINO build
+# Optional cache directory for compiled OpenVINO blobs
+# onnxOpenVINOCacheDir = C:\\temp\\katago_ov_cache
+
+# Override input/output tensor names for raw .onnx models:
+# onnxInputSpatial = input_spatial
+# onnxInputGlobal = input_global
+# onnxInputMeta = input_meta
+# onnxOutputPolicy = out_policy
+# onnxOutputValue = out_value
+# onnxOutputMiscvalue = out_miscvalue
+# onnxOutputOwnership = out_ownership
+
+# Override auto-detected model version for raw .onnx model files.
+# onnxModelVersion = 15
+
 # ===========================================================================
 # Root move selection and biases
 # ===========================================================================
diff --git a/cpp/configs/match_example.cfg b/cpp/configs/match_example.cfg
index 08859f557..11b271ef0 100644
--- a/cpp/configs/match_example.cfg
+++ b/cpp/configs/match_example.cfg
@@ -214,6 +214,36 @@ numNNServerThreadsPerModel = 1
 # openclUseFP16 = auto
 
 
+# ONNX Runtime settings--------------------------------------
+# These only apply when using the ONNX version of KataGo.
+
+# Execution provider: cpu (default), openvino, cuda, tensorrt, migraphx, coreml(macOS only)
+# onnxProvider = openvino
+
+# Multi-device assignment is mainly for onnxProvider = cuda / tensorrt / migraphx:
+# onnxDeviceToUse = 0
+# onnxDeviceToUseModel0 = 0
+# onnxDeviceToUseModel1 = 1
+# onnxDeviceToUseModel0Thread0 = 0
+# onnxDeviceToUseModel0Thread1 = 1
+
+# OpenVINO EP options for Intel NPU (typically single device):
+# onnxOpenVINODeviceType = NPU
+# onnxOpenVINODeviceId = 0
+# onnxOpenVINOEnableNPUFastCompile = true  # may be ignored if unsupported by your ORT/OpenVINO build
+# onnxOpenVINOCacheDir = C:\\temp\\katago_ov_cache
+
+# Optional overrides for raw .onnx I/O tensor names and model version:
+# onnxInputSpatial = input_spatial
+# onnxInputGlobal = input_global
+# onnxInputMeta = input_meta
+# onnxOutputPolicy = out_policy
+# onnxOutputValue = out_value
+# onnxOutputMiscvalue = out_miscvalue
+# onnxOutputOwnership = out_ownership
+# onnxModelVersion = 15
+
+
 # Eigen-specific settings--------------------------------------
 # These only apply when using the Eigen (pure CPU) version of KataGo.
 
diff --git a/cpp/dataio/loadmodel.cpp b/cpp/dataio/loadmodel.cpp
index 81483b170..71d3addf3 100644
--- a/cpp/dataio/loadmodel.cpp
+++ b/cpp/dataio/loadmodel.cpp
@@ -20,30 +20,34 @@ std::time_t to_time_t(TP tp)
 static const vector<string> ACCEPTABLE_MODEL_SUFFIXES {
   ".bin.gz",
   ".bin",
+  ".onnx",
   "model.txt.gz",
   "model.txt"
 };
 static const vector<string> GENERIC_MODEL_NAMES {
   "model.bin.gz",
   "model.bin",
+  "model.onnx",
   "model.txt.gz",
-  "model.txt"
+  "model.txt",
   "Model.bin.gz",
   "Model.bin",
+  "Model.onnx",
   "Model.txt.gz",
-  "Model.txt"
+  "Model.txt",
   "MODEL.bin.gz",
   "MODEL.bin",
+  "MODEL.onnx",
   "MODEL.txt.gz",
-  "MODEL.txt"
+  "MODEL.txt",
   "model.ckpt",
-  "Model.ckpt"
+  "Model.ckpt",
   "MODEL.ckpt",
   "model.checkpoint",
-  "Model.checkpoint"
+  "Model.checkpoint",
   "MODEL.checkpoint",
   "model",
-  "Model"
+  "Model",
   "MODEL",
 };
 
@@ -115,7 +119,8 @@ void LoadModel::deleteModelsOlderThan(const string& modelsDir, Logger& logger, c
     if(Global::isSuffix(filePathStr,".bin.gz") ||
        Global::isSuffix(filePathStr,".txt.gz") ||
        Global::isSuffix(filePathStr,".bin") ||
-       Global::isSuffix(filePathStr,".txt")) {
+       Global::isSuffix(filePathStr,".txt") ||
+       Global::isSuffix(filePathStr,".onnx")) {
       time_t thisTime = to_time_t(gfs::last_write_time(filePath));
       if(thisTime < time) {
         pathsToRemove.push_back(filePath);
diff --git a/cpp/main.cpp b/cpp/main.cpp
index 734b0f848..0d756bfc2 100644
--- a/cpp/main.cpp
+++ b/cpp/main.cpp
@@ -29,6 +29,7 @@ static void printHelp(const vector<string>& args) {
 gtp : Runs GTP engine that can be plugged into any standard Go GUI for play/analysis.
 benchmark : Test speed with different numbers of search threads.
 genconfig : User-friendly interface to generate a config with rules and automatic performance tuning.
+exportonnx : Export KataGo .bin/.bin.gz model to a fixed-size .onnx model.
 
 contribute : Connect to online distributed KataGo training and run perpetually contributing selfplay games.
 
@@ -169,6 +170,8 @@ static int handleSubcommand(const string& subcommand, const vector<string>& args
     return MainCmds::runsleeptest(subArgs);
   else if(subcommand == "printclockinfo")
     return MainCmds::printclockinfo(subArgs);
+  else if(subcommand == "exportonnx")
+    return MainCmds::exportonnx(subArgs);
   else if(subcommand == "sandbox")
     return MainCmds::sandbox();
   else if(subcommand == "version") {
@@ -255,6 +258,8 @@ string Version::getKataGoVersionFullInfo() {
 #endif
 #elif defined(USE_EIGEN_BACKEND)
   out << "Using Eigen(CPU) backend" << endl;
+#elif defined(USE_ONNX_BACKEND)
+  out << "Using ONNX backend" << endl;
 #else
   out << "Using dummy backend" << endl;
 #endif
@@ -293,6 +298,8 @@ string Version::getGitRevisionWithBackend() {
   s += "-opencl";
 #elif defined(USE_EIGEN_BACKEND)
   s += "-eigen";
+#elif defined(USE_ONNX_BACKEND)
+  s += "-onnx";
 #else
   s += "-dummy";
 #endif
diff --git a/cpp/main.h b/cpp/main.h
index 3f8ad78d4..4f03f418e 100644
--- a/cpp/main.h
+++ b/cpp/main.h
@@ -53,6 +53,7 @@ namespace MainCmds {
 
   int demoplay(const std::vector<std::string>& args);
   int printclockinfo(const std::vector<std::string>& args);
+  int exportonnx(const std::vector<std::string>& args);
   int sampleinitializations(const std::vector<std::string>& args);
   int evalrandominits(const std::vector<std::string>& args);
   int searchentropyanalysis(const std::vector<std::string>& args);
diff --git a/cpp/neuralnet/onnxbackend.cpp b/cpp/neuralnet/onnxbackend.cpp
new file mode 100644
index 000000000..551fc5d4b
--- /dev/null
+++ b/cpp/neuralnet/onnxbackend.cpp
@@ -0,0 +1,867 @@
+// ONNX Runtime backend for KataGo.
+// Loads standard .bin.gz model files (builds ONNX graph from ModelDesc) or
+// raw .onnx model files directly, and runs inference via ONNX Runtime with a
+// configurable execution provider (CPU, OpenVINO, CUDA, TensorRT, MIGraphX, CoreML)
+// selected at
+// runtime via the onnxProvider config key.
+
+#include "../neuralnet/nninterface.h"
+#include "../neuralnet/nneval.h"
+#include "../neuralnet/nninputs.h"
+#include "../neuralnet/modelversion.h"
+#include "../neuralnet/onnxmodelbuilder.h"
+
+#include <onnxruntime_cxx_api.h>
+#ifdef __APPLE__
+#include <coreml_provider_factory.h>
+#endif
+
+#include <fstream>
+#include <unordered_map>
+
+using namespace std;
+
+//--------------------------------------------------------------
+
+// Auto-detect modelVersion from introspected channel counts.
+//
+// Detection is based on channel-count heuristics for raw .onnx files where the
+// model version is not encoded in the file.  The mapping assumes V7 inputs
+// (22 spatial + 19 global channels) and distinguishes versions by the number of
+// score-value and policy output channels:
+//   - 4 score-value channels                    -> version 8
+//   - 6 score-value channels, 1 policy channel  -> version 10
+//   - 6 score-value channels, 2 policy channels -> version 15
+//
+// If the heuristic picks the wrong version, set the `onnxModelVersion` config
+// key to the correct value (>= 0) to override auto-detection.
+static int detectModelVersion(
+  int numInputChannels, int numInputGlobalChannels,
+  int numPolicyChannels, int numScoreValueChannels,
+  int configModelVersion
+) {
+  if(configModelVersion >= 0)
+    return configModelVersion;
+
+  // inputsVersion 7 -> models 8-16: 22 spatial + 19 global
+  if(numInputChannels == NNInputs::NUM_FEATURES_SPATIAL_V7 &&
+     numInputGlobalChannels == NNInputs::NUM_FEATURES_GLOBAL_V7) {
+    if(numScoreValueChannels == 6 && numPolicyChannels == 2)
+      return 15;
+    if(numScoreValueChannels == 6 && numPolicyChannels == 1)
+      return 10;
+    if(numScoreValueChannels == 4)
+      return 8;
+    // Default for V7 inputs
+    return 15;
+  }
+  // Older input versions -- fall back to a reasonable default
+  return NNModelVersion::defaultModelVersion;
+}
+
+struct LoadedModel {
+  ModelDesc modelDesc;
+  bool isRawOnnx;
+  string rawOnnxBytes;
+
+  // Constructor for .bin.gz files
+  LoadedModel(const string& fileName, const string& expectedSha256, bool rawOnnx)
+    : isRawOnnx(rawOnnx)
+  {
+    if(!rawOnnx) {
+      ModelDesc::loadFromFileMaybeGZipped(fileName, modelDesc, expectedSha256);
+      return;
+    }
+
+    // Read raw .onnx file bytes
+    {
+      std::ifstream in(fileName, std::ios::binary | std::ios::ate);
+      if(!in.good())
+        throw StringError("ONNX backend: could not open raw ONNX file: " + fileName);
+      std::streamsize size = in.tellg();
+      if(size < 0)
+        throw StringError("ONNX backend: could not determine size of ONNX file: " + fileName);
+      in.seekg(0, std::ios::beg);
+      rawOnnxBytes.resize(size);
+      if(!in.read(rawOnnxBytes.data(), size))
+        throw StringError("ONNX backend: failed to read raw ONNX file: " + fileName);
+    }
+
+    // Create a temporary CPU session to introspect shapes
+    Ort::Env tmpEnv(ORT_LOGGING_LEVEL_WARNING, "KataGoOnnxIntrospect");
+    Ort::SessionOptions tmpOpts;
+    tmpOpts.SetIntraOpNumThreads(1);
+    Ort::Session tmpSession(tmpEnv, rawOnnxBytes.data(), rawOnnxBytes.size(), tmpOpts);
+
+    Ort::AllocatorWithDefaultOptions allocator;
+
+    // Introspect inputs by name first, falling back to shape-based heuristic
+    int numInputChannels = 0;
+    int numInputGlobalChannels = 0;
+    int numInputMetaChannels = 0;
+    size_t numInputs = tmpSession.GetInputCount();
+    for(size_t i = 0; i < numInputs; i++) {
+      Ort::AllocatedStringPtr namePtr = tmpSession.GetInputNameAllocated(i, allocator);
+      string name = namePtr.get();
+      auto typeInfo = tmpSession.GetInputTypeInfo(i);
+      auto tensorInfo = typeInfo.GetTensorTypeAndShapeInfo();
+      auto shape = tensorInfo.GetShape();
+      if(name.find("spatial") != string::npos) {
+        if(shape.size() >= 2)
+          numInputChannels = (int)shape[1];
+      } else if(name.find("global") != string::npos) {
+        if(shape.size() >= 2)
+          numInputGlobalChannels = (int)shape[1];
+      } else if(name.find("meta") != string::npos) {
+        if(shape.size() >= 2)
+          numInputMetaChannels = (int)shape[1];
+      } else if(shape.size() == 4) {
+        // Shape-based fallback: [N, C, H, W] -- spatial input
+        numInputChannels = (int)shape[1];
+      } else if(shape.size() == 2) {
+        // Shape-based fallback: [N, C] -- first 2D is global, second is meta
+        if(numInputGlobalChannels == 0)
+          numInputGlobalChannels = (int)shape[1];
+        else
+          numInputMetaChannels = (int)shape[1];
+      } else {
+        cerr << "ONNX backend warning: unrecognized input tensor '" << name
+             << "' with " << shape.size() << "D shape, ignoring" << "\n";
+      }
+    }
+
+    // Introspect outputs
+    int numPolicyChannels = 0;
+    int numValueChannels = 0;
+    int numScoreValueChannels = 0;
+    int numOwnershipChannels = 0;
+    size_t numOutputs = tmpSession.GetOutputCount();
+    for(size_t i = 0; i < numOutputs; i++) {
+      Ort::AllocatedStringPtr namePtr = tmpSession.GetOutputNameAllocated(i, allocator);
+      string name = namePtr.get();
+      auto typeInfo = tmpSession.GetOutputTypeInfo(i);
+      auto tensorInfo = typeInfo.GetTensorTypeAndShapeInfo();
+      auto shape = tensorInfo.GetShape();
+
+      if(name.find("policy") != string::npos) {
+        // Policy: [N, C, H*W+1] -> dim 1 is policy channels
+        if(shape.size() >= 2)
+          numPolicyChannels = (int)shape[1];
+      } else if(name.find("miscvalue") != string::npos) {
+        // MiscValue: [N, numScoreValueChannels] -- check before "value" since "miscvalue" contains "value"
+        if(shape.size() >= 2)
+          numScoreValueChannels = (int)shape[1];
+      } else if(name.find("value") != string::npos) {
+        // Value: [N, 3]
+        if(shape.size() >= 2)
+          numValueChannels = (int)shape[1];
+      } else if(name.find("ownership") != string::npos) {
+        // Ownership: [N, 1, H, W]
+        if(shape.size() >= 2)
+          numOwnershipChannels = (int)shape[1];
+      }
+    }
+
+    // Populate ModelDesc metadata (weights are in the ONNX graph, not in modelDesc)
+    modelDesc.numInputChannels = numInputChannels;
+    modelDesc.numInputGlobalChannels = numInputGlobalChannels;
+    modelDesc.numInputMetaChannels = numInputMetaChannels;
+    modelDesc.numPolicyChannels = numPolicyChannels;
+    modelDesc.numValueChannels = numValueChannels;
+    modelDesc.numScoreValueChannels = numScoreValueChannels;
+    modelDesc.numOwnershipChannels = numOwnershipChannels;
+
+    // Extract filename stem as model name
+    {
+      size_t lastSlash = fileName.find_last_of("/\\");
+      string basename = (lastSlash != string::npos) ? fileName.substr(lastSlash + 1) : fileName;
+      size_t dotPos = basename.find('.');
+      modelDesc.name = (dotPos != string::npos) ? basename.substr(0, dotPos) : basename;
+    }
+
+    // Model version: auto-detect with possible config override (applied later)
+    modelDesc.modelVersion = detectModelVersion(
+      numInputChannels, numInputGlobalChannels,
+      numPolicyChannels, numScoreValueChannels,
+      -1  // No config override at load time; applied in createComputeHandle if needed
+    );
+
+    // postProcessParams gets default values from its constructor (already set)
+  }
+
+  LoadedModel() = delete;
+  LoadedModel(const LoadedModel&) = delete;
+  LoadedModel& operator=(const LoadedModel&) = delete;
+};
+
+LoadedModel* NeuralNet::loadModelFile(const string& file, const string& expectedSha256) {
+  bool isRawOnnx = Global::isSuffix(file, ".onnx");
+  return new LoadedModel(file, expectedSha256, isRawOnnx);
+}
+
+void NeuralNet::freeLoadedModel(LoadedModel* loadedModel) {
+  delete loadedModel;
+}
+
+const ModelDesc& NeuralNet::getModelDesc(const LoadedModel* loadedModel) {
+  return loadedModel->modelDesc;
+}
+
+//--------------------------------------------------------------
+
+struct ComputeContext {
+  Ort::Env env;
+  int nnXLen;
+  int nnYLen;
+  string providerName;
+  string openvinoDeviceType;
+  string openvinoDeviceId;
+  bool openvinoEnableNPUFastCompile;
+  string openvinoCacheDir;
+
+  // Configurable input/output node names
+  string inputSpatialName;
+  string inputGlobalName;
+  string inputMetaName;
+  string outputPolicyName;
+  string outputValueName;
+  string outputMiscvalueName;
+  string outputOwnershipName;
+
+  // Config override for model version (-1 means auto-detect)
+  int configModelVersion;
+
+  ComputeContext(int xLen, int yLen, const string& provider)
+    : env(ORT_LOGGING_LEVEL_WARNING, "KataGoOnnx"),
+      nnXLen(xLen),
+      nnYLen(yLen),
+      providerName(provider),
+      openvinoDeviceType("CPU"),
+      openvinoDeviceId(""),
+      openvinoEnableNPUFastCompile(false),
+      openvinoCacheDir(""),
+      inputSpatialName("input_spatial"),
+      inputGlobalName("input_global"),
+      inputMetaName("input_meta"),
+      outputPolicyName("out_policy"),
+      outputValueName("out_value"),
+      outputMiscvalueName("out_miscvalue"),
+      outputOwnershipName("out_ownership"),
+      configModelVersion(-1)
+  {}
+};
+
+//--------------------------------------------------------------
+
+struct ComputeHandle {
+  ComputeContext* context;
+  std::unique_ptr<Ort::Session> session;
+  int modelVersion;
+  int numInputChannels;
+  int numInputGlobalChannels;
+  int numPolicyChannels;
+  int numValueChannels;
+  int numScoreValueChannels;
+  int numOwnershipChannels;
+  int numInputMetaChannels;
+  int policyResultLen; // H*W+1
+
+  // Input/output names (stored for session->Run)
+  vector<string> inputNames;
+  vector<string> outputNames;
+  vector<const char*> inputNamePtrs;
+  vector<const char*> outputNamePtrs;
+
+  ComputeHandle(ComputeContext* ctx, const LoadedModel& loadedModel, Logger* logger, int deviceIdxForThread)
+    : context(ctx),
+      modelVersion(loadedModel.modelDesc.modelVersion),
+      numInputChannels(loadedModel.modelDesc.numInputChannels),
+      numInputGlobalChannels(loadedModel.modelDesc.numInputGlobalChannels),
+      numPolicyChannels(loadedModel.modelDesc.numPolicyChannels),
+      numValueChannels(loadedModel.modelDesc.numValueChannels),
+      numScoreValueChannels(loadedModel.modelDesc.numScoreValueChannels),
+      numOwnershipChannels(loadedModel.modelDesc.numOwnershipChannels),
+      numInputMetaChannels(loadedModel.modelDesc.numInputMetaChannels),
+      policyResultLen(ctx->nnXLen * ctx->nnYLen + 1)
+  {
+    // Apply config model version override if set
+    if(ctx->configModelVersion >= 0)
+      modelVersion = ctx->configModelVersion;
+
+    const char* onnxData;
+    size_t onnxSize;
+    string builtOnnxBytes;
+    if(loadedModel.isRawOnnx) {
+      if(logger != NULL)
+        logger->write("ONNX backend: using raw ONNX model (" +
+                       Global::uint64ToString(loadedModel.rawOnnxBytes.size()) + " bytes)");
+      onnxData = loadedModel.rawOnnxBytes.data();
+      onnxSize = loadedModel.rawOnnxBytes.size();
+    } else {
+      if(logger != NULL)
+        logger->write("ONNX backend: building ONNX graph from model weights...");
+      builtOnnxBytes = OnnxModelBuilder::buildOnnxModel(loadedModel.modelDesc, ctx->nnXLen, ctx->nnYLen);
+      if(logger != NULL)
+        logger->write("ONNX backend: ONNX graph built (" + Global::uint64ToString(builtOnnxBytes.size()) + " bytes)");
+      onnxData = builtOnnxBytes.data();
+      onnxSize = builtOnnxBytes.size();
+    }
+
+    if(logger != NULL)
+      logger->write("ONNX backend: creating session...");
+
+    Ort::SessionOptions sessionOpts;
+    sessionOpts.SetIntraOpNumThreads(1);
+
+    // Select execution provider based on providerName
+    const string& provider = ctx->providerName;
+    if(provider == "coreml") {
+#ifdef __APPLE__
+      uint32_t coremlFlags = COREML_FLAG_CREATE_MLPROGRAM;
+      Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CoreML(sessionOpts, coremlFlags));
+      if(logger != NULL)
+        logger->write("ONNX backend: CoreML execution provider enabled (MLProgram mode)");
+#else
+      throw StringError("ONNX backend: CoreML is only available on Apple platforms");
+#endif
+    } else if(provider == "cuda") {
+      OrtCUDAProviderOptions cudaOpts{};
+      cudaOpts.device_id = deviceIdxForThread >= 0 ? deviceIdxForThread : 0;
+      sessionOpts.AppendExecutionProvider_CUDA(cudaOpts);
+      if(logger != NULL)
+        logger->write("ONNX backend: CUDA execution provider enabled, device_id=" + Global::intToString(cudaOpts.device_id));
+    } else if(provider == "tensorrt") {
+      OrtTensorRTProviderOptions trtOpts{};
+      trtOpts.device_id = deviceIdxForThread >= 0 ? deviceIdxForThread : 0;
+      sessionOpts.AppendExecutionProvider_TensorRT(trtOpts);
+      if(logger != NULL)
+        logger->write("ONNX backend: TensorRT execution provider enabled, device_id=" + Global::intToString(trtOpts.device_id));
+    } else if(provider == "migraphx") {
+      OrtMIGraphXProviderOptions migraphxOpts{};
+      migraphxOpts.device_id = deviceIdxForThread >= 0 ? deviceIdxForThread : 0;
+      sessionOpts.AppendExecutionProvider_MIGraphX(migraphxOpts);
+      if(logger != NULL)
+        logger->write("ONNX backend: MIGraphX execution provider enabled, device_id=" + Global::intToString(migraphxOpts.device_id));
+    } else if(provider == "openvino") {
+      std::unordered_map<std::string, std::string> openvinoOpts;
+      openvinoOpts["device_type"] = ctx->openvinoDeviceType;
+      if(!ctx->openvinoDeviceId.empty())
+        openvinoOpts["device_id"] = ctx->openvinoDeviceId;
+      if(!ctx->openvinoCacheDir.empty())
+        openvinoOpts["cache_dir"] = ctx->openvinoCacheDir;
+
+      if(ctx->openvinoEnableNPUFastCompile && logger != NULL) {
+        logger->write(
+          "ONNX backend: onnxOpenVINOEnableNPUFastCompile requested, but this ORT build may not "
+          "accept 'enable_npu_fast_compile'; currently ignoring this option for compatibility."
+        );
+      }
+
+      // Some ORT OpenVINO builds may not accept optional keys like cache_dir.
+      // Retry with only core device keys if optional keys are rejected.
+      try {
+        sessionOpts.AppendExecutionProvider_OpenVINO_V2(openvinoOpts);
+      }
+      catch(const Ort::Exception& e) {
+        bool hadOptionalKeys = openvinoOpts.count("cache_dir") > 0;
+        if(!hadOptionalKeys)
+          throw;
+
+        if(logger != NULL) {
+          logger->write(
+            string("ONNX backend: OpenVINO optional provider options rejected, retrying without optional keys. Error: ") +
+            e.what()
+          );
+        }
+        openvinoOpts.erase("cache_dir");
+        sessionOpts.AppendExecutionProvider_OpenVINO_V2(openvinoOpts);
+      }
+
+      if(logger != NULL) {
+        string deviceId = openvinoOpts.count("device_id") > 0 ? openvinoOpts["device_id"] : "";
+        logger->write(
+          "ONNX backend: OpenVINO execution provider enabled, device_type=" + ctx->openvinoDeviceType +
+          (deviceId.empty() ? "" : (", device_id=" + deviceId))
+        );
+      }
+    } else if(provider == "cpu" || provider.empty()) {
+      if(logger != NULL)
+        logger->write("ONNX backend: using CPU execution provider");
+    } else {
+      throw StringError("ONNX backend: unknown onnxProvider '" + provider + "', expected 'cpu', 'coreml', 'cuda', 'tensorrt', 'migraphx', or 'openvino'");
+    }
+
+    // Create session from in-memory bytes
+    session = std::make_unique<Ort::Session>(ctx->env, onnxData, onnxSize, sessionOpts);
+
+    // Query and store input names
+    Ort::AllocatorWithDefaultOptions allocator;
+    size_t numInputs = session->GetInputCount();
+    for(size_t i = 0; i < numInputs; i++) {
+      Ort::AllocatedStringPtr name = session->GetInputNameAllocated(i, allocator);
+      inputNames.push_back(name.get());
+    }
+    for(auto& n : inputNames)
+      inputNamePtrs.push_back(n.c_str());
+
+    // Query and store output names
+    size_t numOutputs = session->GetOutputCount();
+    for(size_t i = 0; i < numOutputs; i++) {
+      Ort::AllocatedStringPtr name = session->GetOutputNameAllocated(i, allocator);
+      outputNames.push_back(name.get());
+    }
+    for(auto& n : outputNames)
+      outputNamePtrs.push_back(n.c_str());
+
+    if(logger != NULL)
+      logger->write("ONNX backend: session created, inputs=" + Global::uint64ToString(numInputs) +
+                     " outputs=" + Global::uint64ToString(numOutputs));
+  }
+
+  ComputeHandle() = delete;
+  ComputeHandle(const ComputeHandle&) = delete;
+  ComputeHandle& operator=(const ComputeHandle&) = delete;
+};
+
+//--------------------------------------------------------------
+
+struct InputBuffers {
+  int maxBatchSize;
+
+  size_t singleInputElts;
+  size_t singleInputGlobalElts;
+  size_t singleInputMetaElts;
+
+  vector<float> spatialInput;
+  vector<float> globalInput;
+  vector<float> metaInput;
+
+  InputBuffers(const LoadedModel* loadedModel, int maxBatchSz, int nnXLen, int nnYLen) {
+    const ModelDesc& m = loadedModel->modelDesc;
+    maxBatchSize = maxBatchSz;
+    singleInputElts = (size_t)m.numInputChannels * nnXLen * nnYLen;
+    singleInputGlobalElts = (size_t)m.numInputGlobalChannels;
+    singleInputMetaElts = (size_t)m.numInputMetaChannels;
+    spatialInput.resize(singleInputElts * maxBatchSize, 0.0f);
+    globalInput.resize(singleInputGlobalElts * maxBatchSize, 0.0f);
+    if(m.numInputMetaChannels > 0)
+      metaInput.resize(singleInputMetaElts * maxBatchSize, 0.0f);
+  }
+
+  ~InputBuffers() {}
+
+  InputBuffers() = delete;
+  InputBuffers(const InputBuffers&) = delete;
+  InputBuffers& operator=(const InputBuffers&) = delete;
+};
+
+InputBuffers* NeuralNet::createInputBuffers(const LoadedModel* loadedModel, int maxBatchSize, int nnXLen, int nnYLen) {
+  return new InputBuffers(loadedModel, maxBatchSize, nnXLen, nnYLen);
+}
+void NeuralNet::freeInputBuffers(InputBuffers* inputBuffers) {
+  delete inputBuffers;
+}
+
+//--------------------------------------------------------------
+
+void NeuralNet::globalInitialize() {
+}
+
+void NeuralNet::globalCleanup() {
+}
+
+//--------------------------------------------------------------
+
+ComputeContext* NeuralNet::createComputeContext(
+  const std::vector<int>& gpuIdxs,
+  Logger* logger,
+  int nnXLen,
+  int nnYLen,
+  const string& backendExtraParam,
+  const string& homeDataDirOverride,
+  bool openCLReTunePerBoardSize,
+  enabled_t useFP16Mode,
+  enabled_t useNHWCMode,
+  const LoadedModel* loadedModel
+) {
+  (void)gpuIdxs;
+  (void)homeDataDirOverride;
+  (void)openCLReTunePerBoardSize;
+  (void)useFP16Mode;
+  (void)useNHWCMode;
+  (void)loadedModel;
+
+  // Parse backendExtraParam as "key=value;key=value;..."
+  string providerName = "cpu";
+  map<string, string> params;
+  if(!backendExtraParam.empty()) {
+    vector<string> parts = Global::split(backendExtraParam, ';');
+    for(const string& part : parts) {
+      size_t eq = part.find('=');
+      if(eq != string::npos) {
+        string key = Global::trim(part.substr(0, eq));
+        string val = Global::trim(part.substr(eq + 1));
+        params[key] = val;
+      } else {
+        // Legacy: bare string is provider name
+        string trimmed = Global::trim(part);
+        if(!trimmed.empty())
+          providerName = trimmed;
+      }
+    }
+    if(params.count("provider"))
+      providerName = params["provider"];
+  }
+  providerName = Global::toLower(providerName);
+
+  if(logger != NULL)
+    logger->write("ONNX backend: creating compute context for " +
+                   Global::intToString(nnXLen) + "x" + Global::intToString(nnYLen) +
+                   " with provider '" + providerName + "'");
+
+  ComputeContext* ctx = new ComputeContext(nnXLen, nnYLen, providerName);
+
+  // Apply configured node names
+  if(params.count("inputSpatial")) ctx->inputSpatialName = params["inputSpatial"];
+  if(params.count("inputGlobal")) ctx->inputGlobalName = params["inputGlobal"];
+  if(params.count("inputMeta")) ctx->inputMetaName = params["inputMeta"];
+  if(params.count("outputPolicy")) ctx->outputPolicyName = params["outputPolicy"];
+  if(params.count("outputValue")) ctx->outputValueName = params["outputValue"];
+  if(params.count("outputMiscvalue")) ctx->outputMiscvalueName = params["outputMiscvalue"];
+  if(params.count("outputOwnership")) ctx->outputOwnershipName = params["outputOwnership"];
+  if(params.count("openvinoDeviceType")) ctx->openvinoDeviceType = params["openvinoDeviceType"];
+  if(params.count("openvinoDeviceId")) ctx->openvinoDeviceId = params["openvinoDeviceId"];
+  if(params.count("openvinoEnableNPUFastCompile")) {
+    string v = Global::toLower(params["openvinoEnableNPUFastCompile"]);
+    ctx->openvinoEnableNPUFastCompile = (v == "1" || v == "true" || v == "yes" || v == "on");
+  }
+  if(params.count("openvinoCacheDir")) ctx->openvinoCacheDir = params["openvinoCacheDir"];
+  if(params.count("modelVersion")) {
+    int v = Global::stringToInt(params["modelVersion"]);
+    if(v >= 0)
+      ctx->configModelVersion = v;
+  }
+
+  return ctx;
+}
+
+void NeuralNet::freeComputeContext(ComputeContext* computeContext) {
+  delete computeContext;
+}
+
+//--------------------------------------------------------------
+
+ComputeHandle* NeuralNet::createComputeHandle(
+  ComputeContext* context,
+  const LoadedModel* loadedModel,
+  Logger* logger,
+  int maxBatchSize,
+  bool requireExactNNLen,
+  bool inputsUseNHWC,
+  int gpuIdxForThisThread,
+  int serverThreadIdx
+) {
+  (void)maxBatchSize;
+  (void)requireExactNNLen;
+  if(inputsUseNHWC)
+    throw StringError("ONNX backend: inputsUseNHWC = true not supported, must use NCHW");
+
+  if(logger != NULL) {
+    logger->write("ONNX backend thread " + Global::intToString(serverThreadIdx) +
+                  ": Model version " + Global::intToString(loadedModel->modelDesc.modelVersion));
+    logger->write("ONNX backend thread " + Global::intToString(serverThreadIdx) +
+                  ": Model name: " + loadedModel->modelDesc.name);
+    string deviceInfo =
+      context->providerName == "openvino"
+      ? "n/a (use onnxOpenVINODeviceType/onnxOpenVINODeviceId)"
+      : Global::intToString(gpuIdxForThisThread);
+    logger->write("ONNX backend thread " + Global::intToString(serverThreadIdx) +
+                  ": provider=" + context->providerName +
+                  " deviceIdx=" + deviceInfo);
+  }
+
+  return new ComputeHandle(context, *loadedModel, logger, gpuIdxForThisThread);
+}
+
+void NeuralNet::freeComputeHandle(ComputeHandle* computeHandle) {
+  delete computeHandle;
+}
+
+bool NeuralNet::isUsingFP16(const ComputeHandle* handle) {
+  (void)handle;
+  return false;
+}
+
+//--------------------------------------------------------------
+
+// Helper to find the index of a name in a vector, checking multiple alternatives.
+static int findNameIndex(const vector<string>& names, const vector<string>& targets) {
+  for(size_t i = 0; i < names.size(); i++) {
+    for(const auto& t : targets) {
+      if(names[i] == t)
+        return (int)i;
+    }
+  }
+  return -1;
+}
+
+void NeuralNet::getOutput(
+  ComputeHandle* computeHandle,
+  InputBuffers* inputBuffers,
+  int numBatchEltsFilled,
+  NNResultBuf** inputBufs,
+  vector<NNOutput*>& outputs
+) {
+  assert(numBatchEltsFilled <= inputBuffers->maxBatchSize);
+  assert(numBatchEltsFilled > 0);
+  const int batchSize = numBatchEltsFilled;
+  const int nnXLen = computeHandle->context->nnXLen;
+  const int nnYLen = computeHandle->context->nnYLen;
+  const int numSpatialFeatures = computeHandle->numInputChannels;
+  const int numGlobalFeatures = computeHandle->numInputGlobalChannels;
+  const int numPolicyChannels = computeHandle->numPolicyChannels;
+
+  // Fill input buffers
+  for(int nIdx = 0; nIdx < batchSize; nIdx++) {
+    float* rowSpatialInput = inputBuffers->spatialInput.data() + (inputBuffers->singleInputElts * nIdx);
+    float* rowGlobalInput = inputBuffers->globalInput.data() + (inputBuffers->singleInputGlobalElts * nIdx);
+
+    const float* rowGlobal = inputBufs[nIdx]->rowGlobalBuf.data();
+    const float* rowSpatial = inputBufs[nIdx]->rowSpatialBuf.data();
+    std::copy(rowGlobal, rowGlobal + numGlobalFeatures, rowGlobalInput);
+    SymmetryHelpers::copyInputsWithSymmetry(rowSpatial, rowSpatialInput, 1, nnYLen, nnXLen, numSpatialFeatures, false, inputBufs[nIdx]->symmetry);
+
+    if(computeHandle->numInputMetaChannels > 0) {
+      float* rowMetaInput = inputBuffers->metaInput.data() + (inputBuffers->singleInputMetaElts * nIdx);
+      const float* rowMeta = inputBufs[nIdx]->rowMetaBuf.data();
+      std::copy(rowMeta, rowMeta + computeHandle->numInputMetaChannels, rowMetaInput);
+    }
+  }
+
+  // Create ONNX tensors
+  Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
+
+  std::array<int64_t, 4> spatialShape = {batchSize, numSpatialFeatures, nnYLen, nnXLen};
+  Ort::Value spatialTensor = Ort::Value::CreateTensor<float>(
+    memInfo, inputBuffers->spatialInput.data(), inputBuffers->singleInputElts * batchSize,
+    spatialShape.data(), spatialShape.size()
+  );
+
+  std::array<int64_t, 2> globalShape = {batchSize, numGlobalFeatures};
+  Ort::Value globalTensor = Ort::Value::CreateTensor<float>(
+    memInfo, inputBuffers->globalInput.data(), inputBuffers->singleInputGlobalElts * batchSize,
+    globalShape.data(), globalShape.size()
+  );
+
+  // Match input ordering using configured node names
+  const ComputeContext* ctx = computeHandle->context;
+  int spatialIdx = findNameIndex(computeHandle->inputNames, {ctx->inputSpatialName});
+  int globalIdx = findNameIndex(computeHandle->inputNames, {ctx->inputGlobalName});
+  if(spatialIdx < 0 || globalIdx < 0)
+    throw StringError("ONNX backend: could not find expected input names");
+
+  int metaIdx = -1;
+  Ort::Value metaTensor(nullptr);
+  if(computeHandle->numInputMetaChannels > 0) {
+    metaIdx = findNameIndex(computeHandle->inputNames, {ctx->inputMetaName});
+    if(metaIdx < 0)
+      throw StringError("ONNX backend: model has metadata channels but could not find input_meta");
+    std::array<int64_t, 2> metaShape = {batchSize, computeHandle->numInputMetaChannels};
+    metaTensor = Ort::Value::CreateTensor<float>(
+      memInfo, inputBuffers->metaInput.data(), inputBuffers->singleInputMetaElts * batchSize,
+      metaShape.data(), metaShape.size()
+    );
+  }
+
+  vector<Ort::Value> inputTensors;
+  inputTensors.reserve(computeHandle->inputNames.size());
+  for(size_t i = 0; i < computeHandle->inputNames.size(); i++) {
+    if((int)i == spatialIdx)
+      inputTensors.push_back(std::move(spatialTensor));
+    else if((int)i == globalIdx)
+      inputTensors.push_back(std::move(globalTensor));
+    else if((int)i == metaIdx)
+      inputTensors.push_back(std::move(metaTensor));
+    else {
+      throw StringError("ONNX backend: unexpected input node '" + computeHandle->inputNames[i] +
+                         "' -- only spatial, global, and meta inputs are supported");
+    }
+  }
+
+  // Run inference
+  auto outputTensors = computeHandle->session->Run(
+    Ort::RunOptions{nullptr},
+    computeHandle->inputNamePtrs.data(),
+    inputTensors.data(),
+    inputTensors.size(),
+    computeHandle->outputNamePtrs.data(),
+    computeHandle->outputNamePtrs.size()
+  );
+
+  // Find output indices using configured node names
+  int policyOutputIdx = findNameIndex(computeHandle->outputNames, {ctx->outputPolicyName});
+  int valueOutputIdx = findNameIndex(computeHandle->outputNames, {ctx->outputValueName});
+  int miscvalueOutputIdx = findNameIndex(computeHandle->outputNames, {ctx->outputMiscvalueName});
+  int ownershipOutputIdx = findNameIndex(computeHandle->outputNames, {ctx->outputOwnershipName});
+
+  if(policyOutputIdx < 0)
+    throw StringError("ONNX backend: could not find policy output node '" + ctx->outputPolicyName + "'");
+  if(valueOutputIdx < 0)
+    throw StringError("ONNX backend: could not find value output node '" + ctx->outputValueName + "'");
+  if(miscvalueOutputIdx < 0)
+    throw StringError("ONNX backend: could not find miscvalue output node '" + ctx->outputMiscvalueName + "'");
+  if(ownershipOutputIdx < 0)
+    throw StringError("ONNX backend: could not find ownership output node '" + ctx->outputOwnershipName + "'");
+
+  const float* policyData = outputTensors[policyOutputIdx].GetTensorData<float>();
+  const float* valueData = outputTensors[valueOutputIdx].GetTensorData<float>();
+  const float* miscvalueData = outputTensors[miscvalueOutputIdx].GetTensorData<float>();
+  const float* ownershipData = outputTensors[ownershipOutputIdx].GetTensorData<float>();
+
+  assert(policyData != nullptr);
+  assert(valueData != nullptr);
+  assert(miscvalueData != nullptr);
+  assert(ownershipData != nullptr);
+  assert((int)outputs.size() == batchSize);
+
+  const int policyResultLen = computeHandle->policyResultLen;
+  const int spatialPolicyLen = nnXLen * nnYLen;
+  float policyProbsTmp[NNPos::MAX_NN_POLICY_SIZE];
+
+  for(int row = 0; row < batchSize; row++) {
+    NNOutput* output = outputs[row];
+    assert(output->nnXLen == nnXLen);
+    assert(output->nnYLen == nnYLen);
+    float policyOptimism = (float)inputBufs[row]->policyOptimism;
+
+    // Policy: [N, C, H*W+1]
+    {
+      const float* policyRowBase = policyData + row * numPolicyChannels * policyResultLen;
+      float* policyProbs = output->policyProbs;
+
+      if(numPolicyChannels >= 2) {
+        const float* ch0 = policyRowBase;
+        const float* ch1 = policyRowBase + policyResultLen;
+        for(int i = 0; i < spatialPolicyLen; i++) {
+          float p = ch0[i];
+          float pOpt = ch1[i];
+          policyProbsTmp[i] = p + (pOpt - p) * policyOptimism;
+        }
+        SymmetryHelpers::copyOutputsWithSymmetry(policyProbsTmp, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+        policyProbs[spatialPolicyLen] = ch0[spatialPolicyLen] + (ch1[spatialPolicyLen] - ch0[spatialPolicyLen]) * policyOptimism;
+      } else {
+        assert(numPolicyChannels == 1);
+        const float* ch0 = policyRowBase;
+        SymmetryHelpers::copyOutputsWithSymmetry(ch0, policyProbs, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+        policyProbs[spatialPolicyLen] = ch0[spatialPolicyLen];
+      }
+    }
+
+    // Value: [N, 3]
+    {
+      int numVC = computeHandle->numValueChannels;
+      assert(numVC == 3);
+      output->whiteWinProb = valueData[row * numVC];
+      output->whiteLossProb = valueData[row * numVC + 1];
+      output->whiteNoResultProb = valueData[row * numVC + 2];
+    }
+
+    // MiscValue: [N, numScoreValueChannels] -- version-dependent interpretation
+    {
+      int numScoreValueChannels = computeHandle->numScoreValueChannels;
+      if(computeHandle->modelVersion >= 9) {
+        assert(numScoreValueChannels >= 6);
+        output->whiteScoreMean = miscvalueData[row * numScoreValueChannels];
+        output->whiteScoreMeanSq = miscvalueData[row * numScoreValueChannels + 1];
+        output->whiteLead = miscvalueData[row * numScoreValueChannels + 2];
+        output->varTimeLeft = miscvalueData[row * numScoreValueChannels + 3];
+        output->shorttermWinlossError = miscvalueData[row * numScoreValueChannels + 4];
+        output->shorttermScoreError = miscvalueData[row * numScoreValueChannels + 5];
+      }
+      else if(computeHandle->modelVersion >= 8) {
+        assert(numScoreValueChannels >= 4);
+        output->whiteScoreMean = miscvalueData[row * numScoreValueChannels];
+        output->whiteScoreMeanSq = miscvalueData[row * numScoreValueChannels + 1];
+        output->whiteLead = miscvalueData[row * numScoreValueChannels + 2];
+        output->varTimeLeft = miscvalueData[row * numScoreValueChannels + 3];
+        output->shorttermWinlossError = 0;
+        output->shorttermScoreError = 0;
+      }
+      else if(computeHandle->modelVersion >= 4) {
+        assert(numScoreValueChannels >= 2);
+        output->whiteScoreMean = miscvalueData[row * numScoreValueChannels];
+        output->whiteScoreMeanSq = miscvalueData[row * numScoreValueChannels + 1];
+        output->whiteLead = output->whiteScoreMean;
+        output->varTimeLeft = 0;
+        output->shorttermWinlossError = 0;
+        output->shorttermScoreError = 0;
+      }
+      else if(computeHandle->modelVersion >= 3) {
+        assert(numScoreValueChannels >= 1);
+        output->whiteScoreMean = miscvalueData[row * numScoreValueChannels];
+        output->whiteScoreMeanSq = output->whiteScoreMean * output->whiteScoreMean;
+        output->whiteLead = output->whiteScoreMean;
+        output->varTimeLeft = 0;
+        output->shorttermWinlossError = 0;
+        output->shorttermScoreError = 0;
+      }
+      else {
+        ASSERT_UNREACHABLE;
+      }
+    }
+
+    // Ownership: [N, 1, H, W]
+    if(output->whiteOwnerMap != NULL) {
+      assert(computeHandle->numOwnershipChannels == 1);
+      const float* ownershipRowBuf = ownershipData + row * nnXLen * nnYLen;
+      SymmetryHelpers::copyOutputsWithSymmetry(ownershipRowBuf, output->whiteOwnerMap, 1, nnYLen, nnXLen, inputBufs[row]->symmetry);
+    }
+  }
+}
+
+void NeuralNet::printDevices() {
+  cout << "ONNX backend: device enumeration is provider-specific." << endl;
+  cout << "Use onnxProvider plus provider-specific settings in config." << endl;
+}
+
+//--------------------------------------------------------------
+// FOR TESTING -- all return false (not implemented for this backend)
+
+bool NeuralNet::testEvaluateConv(
+  const ConvLayerDesc* desc, int batchSize, int nnXLen, int nnYLen,
+  bool useFP16, bool useNHWC, const std::vector<float>& inputBuffer, std::vector<float>& outputBuffer
+) {
+  (void)desc; (void)batchSize; (void)nnXLen; (void)nnYLen;
+  (void)useFP16; (void)useNHWC; (void)inputBuffer; (void)outputBuffer;
+  return false;
+}
+
+bool NeuralNet::testEvaluateBatchNorm(
+  const BatchNormLayerDesc* desc, int batchSize, int nnXLen, int nnYLen,
+  bool useFP16, bool useNHWC, const std::vector<float>& inputBuffer,
+  const std::vector<float>& maskBuffer, std::vector<float>& outputBuffer
+) {
+  (void)desc; (void)batchSize; (void)nnXLen; (void)nnYLen;
+  (void)useFP16; (void)useNHWC; (void)inputBuffer; (void)maskBuffer; (void)outputBuffer;
+  return false;
+}
+
+bool NeuralNet::testEvaluateResidualBlock(
+  const ResidualBlockDesc* desc, int batchSize, int nnXLen, int nnYLen,
+  bool useFP16, bool useNHWC, const std::vector<float>& inputBuffer,
+  const std::vector<float>& maskBuffer, std::vector<float>& outputBuffer
+) {
+  (void)desc; (void)batchSize; (void)nnXLen; (void)nnYLen;
+  (void)useFP16; (void)useNHWC; (void)inputBuffer; (void)maskBuffer; (void)outputBuffer;
+  return false;
+}
+
+bool NeuralNet::testEvaluateGlobalPoolingResidualBlock(
+  const GlobalPoolingResidualBlockDesc* desc, int batchSize, int nnXLen, int nnYLen,
+  bool useFP16, bool useNHWC, const std::vector<float>& inputBuffer,
+  const std::vector<float>& maskBuffer, std::vector<float>& outputBuffer
+) {
+  (void)desc; (void)batchSize; (void)nnXLen; (void)nnYLen;
+  (void)useFP16; (void)useNHWC; (void)inputBuffer; (void)maskBuffer; (void)outputBuffer;
+  return false;
+}
diff --git a/cpp/neuralnet/onnxmodelbuilder.cpp b/cpp/neuralnet/onnxmodelbuilder.cpp
new file mode 100644
index 000000000..a301b7dc0
--- /dev/null
+++ b/cpp/neuralnet/onnxmodelbuilder.cpp
@@ -0,0 +1,774 @@
+// Builds an ONNX computational graph from a KataGo ModelDesc.
+// Uses the ONNX protobuf API (onnx-ml.pb.h) to construct a ModelProto
+// that can be loaded directly by ONNX Runtime.
+
+#include "../neuralnet/onnxmodelbuilder.h"
+#include "../neuralnet/activations.h"
+#include "../core/global.h"
+
+#include <onnx/onnx-ml.pb.h>
+
+#include <string>
+#include <vector>
+
+using namespace std;
+
+static string uniqueName(int& nameCounter, const string& prefix) {
+  return prefix + "_" + to_string(nameCounter++);
+}
+
+// =====================================================================
+// Helper: Add a float tensor initializer to the graph
+// =====================================================================
+static string addInitializer(
+  onnx::GraphProto* graph,
+  const string& name,
+  const vector<int64_t>& shape,
+  const float* data,
+  size_t numElements
+) {
+  onnx::TensorProto* tensor = graph->add_initializer();
+  tensor->set_name(name);
+  tensor->set_data_type(onnx::TensorProto_DataType_FLOAT);
+  for(int64_t d : shape)
+    tensor->add_dims(d);
+  tensor->set_raw_data(data, numElements * sizeof(float));
+  return name;
+}
+
+static string addInitializer(
+  onnx::GraphProto* graph,
+  const string& name,
+  const vector<int64_t>& shape,
+  const vector<float>& data
+) {
+  return addInitializer(graph, name, shape, data.data(), data.size());
+}
+
+// Add a scalar float constant
+static string addScalarInitializer(onnx::GraphProto* graph, const string& name, float value) {
+  return addInitializer(graph, name, {}, &value, 1);
+}
+
+// Add a 1D int64 constant tensor
+static string addInt64Initializer(
+  onnx::GraphProto* graph,
+  const string& name,
+  const vector<int64_t>& data
+) {
+  onnx::TensorProto* tensor = graph->add_initializer();
+  tensor->set_name(name);
+  tensor->set_data_type(onnx::TensorProto_DataType_INT64);
+  tensor->add_dims((int64_t)data.size());
+  tensor->set_raw_data(data.data(), data.size() * sizeof(int64_t));
+  return name;
+}
+
+// =====================================================================
+// Helper: Add ONNX graph node
+// =====================================================================
+
+// Generic node with n inputs, 1 output
+static onnx::NodeProto* addNode(
+  onnx::GraphProto* graph,
+  const string& opType,
+  const vector<string>& inputs,
+  const string& outputName
+) {
+  onnx::NodeProto* node = graph->add_node();
+  node->set_op_type(opType);
+  for(const auto& inp : inputs)
+    node->add_input(inp);
+  node->add_output(outputName);
+  return node;
+}
+
+// Add an attribute (int) to a node
+static void setAttrInt(onnx::NodeProto* node, const string& attrName, int64_t value) {
+  onnx::AttributeProto* attr = node->add_attribute();
+  attr->set_name(attrName);
+  attr->set_type(onnx::AttributeProto_AttributeType_INT);
+  attr->set_i(value);
+}
+
+// Add an attribute (ints) to a node
+static void setAttrInts(onnx::NodeProto* node, const string& attrName, const vector<int64_t>& values) {
+  onnx::AttributeProto* attr = node->add_attribute();
+  attr->set_name(attrName);
+  attr->set_type(onnx::AttributeProto_AttributeType_INTS);
+  for(int64_t v : values)
+    attr->add_ints(v);
+}
+
+// =====================================================================
+// Convolution: Conv with zero-padding
+// =====================================================================
+static string addConvNode(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const ConvLayerDesc& desc,
+  const string& prefix
+) {
+  string weightsName = addInitializer(
+    graph, prefix + "/w",
+    {desc.outChannels, desc.inChannels, desc.convYSize, desc.convXSize},
+    desc.weights
+  );
+
+  int padY = desc.convYSize / 2;
+  int padX = desc.convXSize / 2;
+  string output = uniqueName(nameCounter, prefix + "/out");
+
+  onnx::NodeProto* convNode = addNode(graph, "Conv", {input, weightsName}, output);
+  setAttrInts(convNode, "kernel_shape", {desc.convYSize, desc.convXSize});
+  setAttrInts(convNode, "pads", {padY, padX, padY, padX});
+  setAttrInts(convNode, "dilations", {desc.dilationY, desc.dilationX});
+  setAttrInts(convNode, "strides", {1, 1});
+
+  return output;
+}
+
+// =====================================================================
+// Merged Batch Norm: output = input * mergedScale + mergedBias
+// Applied channel-wise, broadcasting over [N, C, H, W]
+// =====================================================================
+static string addMergedBNNode(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const BatchNormLayerDesc& desc,
+  const string& prefix
+) {
+  int C = desc.numChannels;
+  string scaleName = addInitializer(graph, prefix + "/scale", {C, 1, 1}, desc.mergedScale);
+  string biasName = addInitializer(graph, prefix + "/bias", {C, 1, 1}, desc.mergedBias);
+
+  string scaled = uniqueName(nameCounter, prefix + "/scaled");
+  addNode(graph, "Mul", {input, scaleName}, scaled);
+
+  string output = uniqueName(nameCounter, prefix + "/bn_out");
+  addNode(graph, "Add", {scaled, biasName}, output);
+
+  return output;
+}
+
+// =====================================================================
+// Activation: ReLU, Mish (softplus->tanh->mul), or Identity
+// =====================================================================
+static string addActivationNode(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  int activationType,
+  const string& prefix
+) {
+  if(activationType == ACTIVATION_RELU) {
+    string output = uniqueName(nameCounter, prefix + "/relu");
+    addNode(graph, "Relu", {input}, output);
+    return output;
+  } else if(activationType == ACTIVATION_MISH) {
+    // Mish = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x)))
+    string sp = uniqueName(nameCounter, prefix + "/softplus");
+    addNode(graph, "Softplus", {input}, sp);
+
+    string th = uniqueName(nameCounter, prefix + "/tanh");
+    addNode(graph, "Tanh", {sp}, th);
+
+    string output = uniqueName(nameCounter, prefix + "/mish");
+    addNode(graph, "Mul", {input, th}, output);
+    return output;
+  } else {
+    // ACTIVATION_IDENTITY -- pass through
+    return input;
+  }
+}
+
+// =====================================================================
+// BN + Activation + Mask multiply
+// output = activation(input * scale + bias) * mask
+// =====================================================================
+static string addBNActivationMask(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const BatchNormLayerDesc& bnDesc,
+  const ActivationLayerDesc& actDesc,
+  const string& mask,
+  const string& prefix
+) {
+  string bn = addMergedBNNode(graph, nameCounter, input, bnDesc, prefix + "/bn");
+  string act = addActivationNode(graph, nameCounter, bn, actDesc.activation, prefix + "/act");
+  string output = uniqueName(nameCounter, prefix + "/masked");
+  addNode(graph, "Mul", {act, mask}, output);
+  return output;
+}
+
+// =====================================================================
+// MatMul: output = input @ W
+// W is [inC, outC]
+// =====================================================================
+static string addMatMulNode(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const MatMulLayerDesc& desc,
+  const string& prefix
+) {
+  string weightsName = addInitializer(graph, prefix + "/w", {desc.inChannels, desc.outChannels}, desc.weights);
+  string output = uniqueName(nameCounter, prefix + "/matmul");
+  addNode(graph, "MatMul", {input, weightsName}, output);
+  return output;
+}
+
+// =====================================================================
+// Bias addition: output = input + bias
+// bias is [C], broadcast over [N, C] or [N, C, H, W]
+// =====================================================================
+static string addBiasNode(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const MatBiasLayerDesc& desc,
+  const string& prefix
+) {
+  string biasName = addInitializer(graph, prefix + "/b", {desc.numChannels}, desc.weights);
+  string output = uniqueName(nameCounter, prefix + "/biased");
+  addNode(graph, "Add", {input, biasName}, output);
+  return output;
+}
+
+// =====================================================================
+// KataGPool: Global pooling producing 3 values per channel
+// Pool 1: mean = ReduceSum(x * mask, [2,3]) / maskSum
+// Pool 2: mean * (sqrt(maskSum) - 14.0) * 0.1
+// Pool 3: ReduceMax(x + (mask - 1.0), [2,3])
+// Output: [N, 3*C]
+// =====================================================================
+static string addGlobalPool(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const string& mask,
+  const string& maskSumHW,
+  const string& prefix
+) {
+  // x_masked = input * mask  (already masked, but let's be safe)
+  string xMasked = uniqueName(nameCounter, prefix + "/gpool_xm");
+  addNode(graph, "Mul", {input, mask}, xMasked);
+
+  // sum = ReduceSum(xMasked, axes=[2,3])
+  string axesName = addInt64Initializer(graph, uniqueName(nameCounter, prefix + "/axes23"), {2, 3});
+  string sumOut = uniqueName(nameCounter, prefix + "/gpool_sum");
+  onnx::NodeProto* sumNode = addNode(graph, "ReduceSum", {xMasked, axesName}, sumOut);
+  setAttrInt(sumNode, "keepdims", 0);
+
+  // mean = sum / maskSumFlat
+  // maskSumHW is [N,1,1,1], we need [N,1] for division
+  string maskSumFlat = uniqueName(nameCounter, prefix + "/gpool_msf");
+  string reshapeShape = addInt64Initializer(graph, uniqueName(nameCounter, prefix + "/shape_n1"), {0, 1});
+  addNode(graph, "Reshape", {maskSumHW, reshapeShape}, maskSumFlat);
+
+  string mean = uniqueName(nameCounter, prefix + "/gpool_mean");
+  addNode(graph, "Div", {sumOut, maskSumFlat}, mean);
+
+  // sqrtMaskSum = sqrt(maskSumFlat)
+  string sqrtMs = uniqueName(nameCounter, prefix + "/gpool_sqrt");
+  addNode(graph, "Sqrt", {maskSumFlat}, sqrtMs);
+
+  // sqrtMs - 14.0
+  string const14 = addScalarInitializer(graph, uniqueName(nameCounter, prefix + "/c14"), 14.0f);
+  string sqrtMsSub = uniqueName(nameCounter, prefix + "/gpool_sqrtsub");
+  addNode(graph, "Sub", {sqrtMs, const14}, sqrtMsSub);
+
+  // * 0.1
+  string const01 = addScalarInitializer(graph, uniqueName(nameCounter, prefix + "/c01"), 0.1f);
+  string scaledSqrt = uniqueName(nameCounter, prefix + "/gpool_ssm");
+  addNode(graph, "Mul", {sqrtMsSub, const01}, scaledSqrt);
+
+  // pool2 = mean * scaledSqrt
+  string pool2 = uniqueName(nameCounter, prefix + "/gpool_p2");
+  addNode(graph, "Mul", {mean, scaledSqrt}, pool2);
+
+  // Pool3: max over (x + mask - 1)
+  string constNeg1 = addScalarInitializer(graph, uniqueName(nameCounter, prefix + "/cn1"), -1.0f);
+  string maskBias = uniqueName(nameCounter, prefix + "/gpool_mb");
+  addNode(graph, "Add", {mask, constNeg1}, maskBias);
+
+  string xShifted = uniqueName(nameCounter, prefix + "/gpool_xs");
+  addNode(graph, "Add", {input, maskBias}, xShifted);
+
+  // ReduceMax over [2,3]
+  string axesName2 = addInt64Initializer(graph, uniqueName(nameCounter, prefix + "/axes23b"), {2, 3});
+  string pool3 = uniqueName(nameCounter, prefix + "/gpool_max");
+  onnx::NodeProto* maxNode = addNode(graph, "ReduceMax", {xShifted, axesName2}, pool3);
+  setAttrInt(maxNode, "keepdims", 0);
+
+  // Concat [mean, pool2, pool3] along axis=1
+  string output = uniqueName(nameCounter, prefix + "/gpool_out");
+  onnx::NodeProto* concatNode = addNode(graph, "Concat", {mean, pool2, pool3}, output);
+  setAttrInt(concatNode, "axis", 1);
+
+  return output;
+}
+
+// =====================================================================
+// KataValueHeadGPool: Different third pool from KataGPool
+// Pool 3: mean * ((sqrt(maskSum) - 14.0)^2 * 0.01 - 0.1)
+// =====================================================================
+static string addValueHeadGPool(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const string& mask,
+  const string& maskSumHW,
+  const string& prefix
+) {
+  // x for value head already has activation applied
+  // sum = ReduceSum(input * mask, [2,3])
+  string xMasked = uniqueName(nameCounter, prefix + "/vgpool_xm");
+  addNode(graph, "Mul", {input, mask}, xMasked);
+
+  string axesName = addInt64Initializer(graph, uniqueName(nameCounter, prefix + "/axes23"), {2, 3});
+  string sumOut = uniqueName(nameCounter, prefix + "/vgpool_sum");
+  onnx::NodeProto* sumNode = addNode(graph, "ReduceSum", {xMasked, axesName}, sumOut);
+  setAttrInt(sumNode, "keepdims", 0);
+
+  // mean
+  string maskSumFlat = uniqueName(nameCounter, prefix + "/vgpool_msf");
+  string reshapeShape = addInt64Initializer(graph, uniqueName(nameCounter, prefix + "/shape_n1"), {0, 1});
+  addNode(graph, "Reshape", {maskSumHW, reshapeShape}, maskSumFlat);
+
+  string mean = uniqueName(nameCounter, prefix + "/vgpool_mean");
+  addNode(graph, "Div", {sumOut, maskSumFlat}, mean);
+
+  // sqrt(maskSum)
+  string sqrtMs = uniqueName(nameCounter, prefix + "/vgpool_sqrt");
+  addNode(graph, "Sqrt", {maskSumFlat}, sqrtMs);
+
+  // (sqrt(maskSum) - 14.0)
+  string const14 = addScalarInitializer(graph, uniqueName(nameCounter, prefix + "/c14"), 14.0f);
+  string sqrtMsSub = uniqueName(nameCounter, prefix + "/vgpool_ss");
+  addNode(graph, "Sub", {sqrtMs, const14}, sqrtMsSub);
+
+  // pool2 = mean * (sqrtMsSub) * 0.1
+  string const01 = addScalarInitializer(graph, uniqueName(nameCounter, prefix + "/c01"), 0.1f);
+  string scaledSqrt = uniqueName(nameCounter, prefix + "/vgpool_ssm");
+  addNode(graph, "Mul", {sqrtMsSub, const01}, scaledSqrt);
+  string pool2 = uniqueName(nameCounter, prefix + "/vgpool_p2");
+  addNode(graph, "Mul", {mean, scaledSqrt}, pool2);
+
+  // pool3 = mean * ((sqrtMsSub)^2 * 0.01 - 0.1)
+  string sqrtMsSubSq = uniqueName(nameCounter, prefix + "/vgpool_sq");
+  addNode(graph, "Mul", {sqrtMsSub, sqrtMsSub}, sqrtMsSubSq);
+
+  string constP01 = addScalarInitializer(graph, uniqueName(nameCounter, prefix + "/cp01"), 0.01f);
+  string sqScaled = uniqueName(nameCounter, prefix + "/vgpool_sqs");
+  addNode(graph, "Mul", {sqrtMsSubSq, constP01}, sqScaled);
+
+  string constN01 = addScalarInitializer(graph, uniqueName(nameCounter, prefix + "/cn01"), -0.1f);
+  string sqShifted = uniqueName(nameCounter, prefix + "/vgpool_sqsh");
+  addNode(graph, "Add", {sqScaled, constN01}, sqShifted);
+
+  string pool3 = uniqueName(nameCounter, prefix + "/vgpool_p3");
+  addNode(graph, "Mul", {mean, sqShifted}, pool3);
+
+  // Concat [mean, pool2, pool3] along axis=1
+  string output = uniqueName(nameCounter, prefix + "/vgpool_out");
+  onnx::NodeProto* concatNode = addNode(graph, "Concat", {mean, pool2, pool3}, output);
+  setAttrInt(concatNode, "axis", 1);
+
+  return output;
+}
+
+// =====================================================================
+// Residual Block: BN->Act->Conv->BN->Act->Conv + skip
+// =====================================================================
+static string addResidualBlock(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const string& mask,
+  const ResidualBlockDesc& desc,
+  const string& prefix
+) {
+  string pre = addBNActivationMask(graph, nameCounter, input, desc.preBN, desc.preActivation, mask, prefix + "/pre");
+  string mid = addConvNode(graph, nameCounter, pre, desc.regularConv, prefix + "/conv1");
+  string midAct = addBNActivationMask(graph, nameCounter, mid, desc.midBN, desc.midActivation, mask, prefix + "/mid");
+  string final_ = addConvNode(graph, nameCounter, midAct, desc.finalConv, prefix + "/conv2");
+
+  // Residual add
+  string output = uniqueName(nameCounter, prefix + "/resadd");
+  addNode(graph, "Add", {input, final_}, output);
+  return output;
+}
+
+// =====================================================================
+// Global Pooling Residual Block
+// =====================================================================
+static string addGPoolResidualBlock(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const string& mask,
+  const string& maskSumHW,
+  const GlobalPoolingResidualBlockDesc& desc,
+  const string& prefix
+) {
+  string pre = addBNActivationMask(graph, nameCounter, input, desc.preBN, desc.preActivation, mask, prefix + "/pre");
+
+  // Regular path
+  string regOut = addConvNode(graph, nameCounter, pre, desc.regularConv, prefix + "/reg");
+
+  // Global pooling path
+  string gpoolConvOut = addConvNode(graph, nameCounter, pre, desc.gpoolConv, prefix + "/gconv");
+  string gpoolBNAct = addBNActivationMask(graph, nameCounter, gpoolConvOut, desc.gpoolBN, desc.gpoolActivation, mask, prefix + "/gbn");
+  string gpoolResult = addGlobalPool(graph, nameCounter, gpoolBNAct, mask, maskSumHW, prefix + "/gpool");
+
+  // gpoolToBiasMul: [N, 3*gpoolC] -> [N, regC]
+  string gpoolBias = addMatMulNode(graph, nameCounter, gpoolResult, desc.gpoolToBiasMul, prefix + "/g2b");
+
+  // Reshape bias to [N, C, 1, 1] for broadcasting
+  string biasShape = addInt64Initializer(graph, uniqueName(nameCounter, prefix + "/shape_nc11"), {0, -1, 1, 1});
+  string gpoolBiasReshaped = uniqueName(nameCounter, prefix + "/gbr");
+  addNode(graph, "Reshape", {gpoolBias, biasShape}, gpoolBiasReshaped);
+
+  // Add bias to regular conv output
+  string regPlusBias = uniqueName(nameCounter, prefix + "/rpb");
+  addNode(graph, "Add", {regOut, gpoolBiasReshaped}, regPlusBias);
+
+  // Second half: BN->Act->Conv
+  string midAct = addBNActivationMask(graph, nameCounter, regPlusBias, desc.midBN, desc.midActivation, mask, prefix + "/mid");
+  string final_ = addConvNode(graph, nameCounter, midAct, desc.finalConv, prefix + "/conv2");
+
+  // Residual add
+  string output = uniqueName(nameCounter, prefix + "/resadd");
+  addNode(graph, "Add", {input, final_}, output);
+  return output;
+}
+
+// =====================================================================
+// Nested Bottleneck Residual Block
+// Pre: BN->Act->Mask->1x1Conv (c_main->c_mid)
+// Inner: sequence of ordinary/gpool/nested_bottleneck sub-blocks at c_mid
+// Post: BN->Act->Mask->1x1Conv (c_mid->c_main) + residual add
+// =====================================================================
+static string addNestedBottleneckResidualBlock(
+  onnx::GraphProto* graph,
+  int& nameCounter,
+  const string& input,
+  const string& mask,
+  const string& maskSumHW,
+  const NestedBottleneckResidualBlockDesc& desc,
+  const string& prefix
+) {
+  // Pre: BN -> Act -> Mask -> 1x1 Conv (c_main -> c_mid)
+  string pre = addBNActivationMask(graph, nameCounter, input, desc.preBN, desc.preActivation, mask, prefix + "/pre");
+  string midOut = addConvNode(graph, nameCounter, pre, desc.preConv, prefix + "/preconv");
+
+  // Inner sub-blocks at c_mid channels
+  for(int i = 0; i < desc.numBlocks; i++) {
+    int kind = desc.blocks[i].first;
+    string sub = prefix + "/sub" + to_string(i);
+    if(kind == ORDINARY_BLOCK_KIND) {
+      midOut = addResidualBlock(graph, nameCounter, midOut, mask,
+        *((const ResidualBlockDesc*)desc.blocks[i].second.get()), sub);
+    } else if(kind == GLOBAL_POOLING_BLOCK_KIND) {
+      midOut = addGPoolResidualBlock(graph, nameCounter, midOut, mask, maskSumHW,
+        *((const GlobalPoolingResidualBlockDesc*)desc.blocks[i].second.get()), sub);
+    } else if(kind == NESTED_BOTTLENECK_BLOCK_KIND) {
+      midOut = addNestedBottleneckResidualBlock(graph, nameCounter, midOut, mask, maskSumHW,
+        *((const NestedBottleneckResidualBlockDesc*)desc.blocks[i].second.get()), sub);
+    } else {
+      throw StringError("ONNX backend: unknown sub-block kind " + to_string(kind));
+    }
+  }
+
+  // Post: BN -> Act -> Mask -> 1x1 Conv (c_mid -> c_main)
+  string post = addBNActivationMask(graph, nameCounter, midOut, desc.postBN, desc.postActivation, mask, prefix + "/post");
+  string postOut = addConvNode(graph, nameCounter, post, desc.postConv, prefix + "/postconv");
+
+  // Residual add: input + postOut
+  string output = uniqueName(nameCounter, prefix + "/resadd");
+  addNode(graph, "Add", {input, postOut}, output);
+  return output;
+}
+
+// =====================================================================
+// Add ValueInfo for graph input/output
+// =====================================================================
+static void addGraphInput(
+  onnx::GraphProto* graph,
+  const string& name,
+  const vector<int64_t>& shape
+) {
+  onnx::ValueInfoProto* input = graph->add_input();
+  input->set_name(name);
+  onnx::TypeProto* type = input->mutable_type();
+  onnx::TypeProto_Tensor* tensorType = type->mutable_tensor_type();
+  tensorType->set_elem_type(onnx::TensorProto_DataType_FLOAT);
+  onnx::TensorShapeProto* shapeProto = tensorType->mutable_shape();
+  for(int64_t d : shape) {
+    auto* dim = shapeProto->add_dim();
+    if(d < 0)
+      dim->set_dim_param("N");
+    else
+      dim->set_dim_value(d);
+  }
+}
+
+static void addGraphOutput(
+  onnx::GraphProto* graph,
+  const string& name,
+  const vector<int64_t>& shape
+) {
+  onnx::ValueInfoProto* output = graph->add_output();
+  output->set_name(name);
+  onnx::TypeProto* type = output->mutable_type();
+  onnx::TypeProto_Tensor* tensorType = type->mutable_tensor_type();
+  tensorType->set_elem_type(onnx::TensorProto_DataType_FLOAT);
+  onnx::TensorShapeProto* shapeProto = tensorType->mutable_shape();
+  for(int64_t d : shape) {
+    auto* dim = shapeProto->add_dim();
+    if(d < 0)
+      dim->set_dim_param("N");
+    else
+      dim->set_dim_value(d);
+  }
+}
+
+// =====================================================================
+// Main: Build the full ONNX model from ModelDesc
+// =====================================================================
+string OnnxModelBuilder::buildOnnxModel(const ModelDesc& modelDesc, int nnXLen, int nnYLen) {
+  int nameCounter = 0;
+
+  const int modelVersion = modelDesc.modelVersion;
+  const int numInputChannels = modelDesc.numInputChannels;
+  const int numInputGlobalChannels = modelDesc.numInputGlobalChannels;
+  const int numPolicyChannels = modelDesc.numPolicyChannels;
+  const int numValueChannels = modelDesc.numValueChannels;
+  const int numScoreValueChannels = modelDesc.numScoreValueChannels;
+  const int numOwnershipChannels = modelDesc.numOwnershipChannels;
+
+  const TrunkDesc& trunk = modelDesc.trunk;
+  const PolicyHeadDesc& policyHead = modelDesc.policyHead;
+  const ValueHeadDesc& valueHead = modelDesc.valueHead;
+
+  onnx::ModelProto model;
+  model.set_ir_version(8);
+  model.set_producer_name("KataGo");
+  model.set_domain("ai.katago");
+
+  auto* opset = model.add_opset_import();
+  opset->set_domain("");
+  opset->set_version(18);
+
+  onnx::GraphProto* graph = model.mutable_graph();
+  graph->set_name("katago");
+
+  // ------------------------------------------------------------------
+  // Graph Inputs
+  // ------------------------------------------------------------------
+  addGraphInput(graph, "input_spatial", {-1, numInputChannels, nnYLen, nnXLen});
+  addGraphInput(graph, "input_global", {-1, numInputGlobalChannels});
+  if(modelDesc.numInputMetaChannels > 0) {
+    addGraphInput(graph, "input_meta", {-1, modelDesc.numInputMetaChannels});
+  }
+
+  // ------------------------------------------------------------------
+  // Derive mask and maskSumHW from input_spatial.
+  // Channel 0 of the spatial input is the "on board" indicator: 1.0 for
+  // positions on the board, 0.0 for off-board padding.  This is Feature 0
+  // set by fillRowV3/V4/V5/V6/V7 in nninputs.cpp and holds across all
+  // supported input versions (V3-V7).
+  //
+  // mask = input_spatial[:, 0:1, :, :]  -> [N, 1, H, W]
+  // maskSumHW = ReduceSum(mask, [2, 3], keepdims=true) -> [N, 1, 1, 1]
+  // ------------------------------------------------------------------
+
+  // Slice channel 0 to get mask
+  string sliceStarts = addInt64Initializer(graph, "mask_starts", {0});
+  string sliceEnds = addInt64Initializer(graph, "mask_ends", {1});
+  string sliceAxes = addInt64Initializer(graph, "mask_axes", {1});
+  string mask = uniqueName(nameCounter, "mask");
+  addNode(graph, "Slice", {"input_spatial", sliceStarts, sliceEnds, sliceAxes}, mask);
+
+  // maskSumHW
+  string sumAxes = addInt64Initializer(graph, "mask_sum_axes", {2, 3});
+  string maskSumHW = uniqueName(nameCounter, "maskSumHW");
+  onnx::NodeProto* maskSumNode = addNode(graph, "ReduceSum", {mask, sumAxes}, maskSumHW);
+  setAttrInt(maskSumNode, "keepdims", 1);
+
+  // ------------------------------------------------------------------
+  // Trunk: Initial conv + matmul bias
+  // ------------------------------------------------------------------
+  string trunkOut = addConvNode(graph, nameCounter, "input_spatial", trunk.initialConv, "trunk/init_conv");
+
+  // initialMatMul: global features -> [N, trunkNumChannels]
+  string globalBias = addMatMulNode(graph, nameCounter, "input_global", trunk.initialMatMul, "trunk/init_matmul");
+
+  // Reshape to [N, C, 1, 1] for broadcasting
+  string biasShape = addInt64Initializer(graph, "trunk_bias_shape", {0, -1, 1, 1});
+  string globalBiasReshaped = uniqueName(nameCounter, "trunk/gbr");
+  addNode(graph, "Reshape", {globalBias, biasShape}, globalBiasReshaped);
+
+  // Add global bias to conv output
+  string trunkCombined = uniqueName(nameCounter, "trunk/combined");
+  addNode(graph, "Add", {trunkOut, globalBiasReshaped}, trunkCombined);
+  trunkOut = trunkCombined;
+
+  // ------------------------------------------------------------------
+  // Trunk: Metadata encoder (SGF metadata -> trunk bias)
+  // ------------------------------------------------------------------
+  if(trunk.metaEncoderVersion > 0) {
+    const SGFMetadataEncoderDesc& enc = trunk.sgfMetadataEncoder;
+    string metaOut = addMatMulNode(graph, nameCounter, "input_meta", enc.mul1, "trunk/meta_mul1");
+    metaOut = addBiasNode(graph, nameCounter, metaOut, enc.bias1, "trunk/meta_b1");
+    metaOut = addActivationNode(graph, nameCounter, metaOut, enc.act1.activation, "trunk/meta_a1");
+    metaOut = addMatMulNode(graph, nameCounter, metaOut, enc.mul2, "trunk/meta_mul2");
+    metaOut = addBiasNode(graph, nameCounter, metaOut, enc.bias2, "trunk/meta_b2");
+    metaOut = addActivationNode(graph, nameCounter, metaOut, enc.act2.activation, "trunk/meta_a2");
+    metaOut = addMatMulNode(graph, nameCounter, metaOut, enc.mul3, "trunk/meta_mul3");
+
+    // Reshape to [N, C, 1, 1] for spatial broadcasting
+    string metaBiasShape = addInt64Initializer(graph, "trunk_meta_bias_shape", {0, -1, 1, 1});
+    string metaBiasReshaped = uniqueName(nameCounter, "trunk/mbr");
+    addNode(graph, "Reshape", {metaOut, metaBiasShape}, metaBiasReshaped);
+
+    // Add to trunk
+    string trunkWithMeta = uniqueName(nameCounter, "trunk/with_meta");
+    addNode(graph, "Add", {trunkOut, metaBiasReshaped}, trunkWithMeta);
+    trunkOut = trunkWithMeta;
+  }
+
+  // ------------------------------------------------------------------
+  // Trunk: Residual blocks
+  // ------------------------------------------------------------------
+  for(int i = 0; i < trunk.numBlocks; i++) {
+    int blockKind = trunk.blocks[i].first;
+    string blockPrefix = "trunk/block" + to_string(i);
+
+    if(blockKind == ORDINARY_BLOCK_KIND) {
+      const ResidualBlockDesc& blockDesc = *((const ResidualBlockDesc*)trunk.blocks[i].second.get());
+      trunkOut = addResidualBlock(graph, nameCounter, trunkOut, mask, blockDesc, blockPrefix);
+    } else if(blockKind == GLOBAL_POOLING_BLOCK_KIND) {
+      const GlobalPoolingResidualBlockDesc& blockDesc = *((const GlobalPoolingResidualBlockDesc*)trunk.blocks[i].second.get());
+      trunkOut = addGPoolResidualBlock(graph, nameCounter, trunkOut, mask, maskSumHW, blockDesc, blockPrefix);
+    } else if(blockKind == NESTED_BOTTLENECK_BLOCK_KIND) {
+      const NestedBottleneckResidualBlockDesc& blockDesc = *((const NestedBottleneckResidualBlockDesc*)trunk.blocks[i].second.get());
+      trunkOut = addNestedBottleneckResidualBlock(graph, nameCounter, trunkOut, mask, maskSumHW, blockDesc, blockPrefix);
+    } else {
+      throw StringError("ONNX backend: unknown block kind " + to_string(blockKind));
+    }
+  }
+
+  // Trunk tip: BN + activation + mask
+  trunkOut = addBNActivationMask(graph, nameCounter, trunkOut, trunk.trunkTipBN, trunk.trunkTipActivation, mask, "trunk/tip");
+
+  // ------------------------------------------------------------------
+  // Policy Head
+  // ------------------------------------------------------------------
+
+  // p1Conv: spatial path
+  string p1Out = addConvNode(graph, nameCounter, trunkOut, policyHead.p1Conv, "policy/p1conv");
+
+  // g1Conv: global pooling path
+  string g1Out = addConvNode(graph, nameCounter, trunkOut, policyHead.g1Conv, "policy/g1conv");
+  string g1BNAct = addBNActivationMask(graph, nameCounter, g1Out, policyHead.g1BN, policyHead.g1Activation, mask, "policy/g1bn");
+  string g1Pool = addGlobalPool(graph, nameCounter, g1BNAct, mask, maskSumHW, "policy/g1pool");
+
+  // gpoolToBiasMul: [N, 3*g1C] -> [N, p1C]
+  string policyBias = addMatMulNode(graph, nameCounter, g1Pool, policyHead.gpoolToBiasMul, "policy/g2b");
+
+  // Reshape to [N, C, 1, 1]
+  string pBiasShape = addInt64Initializer(graph, uniqueName(nameCounter, "policy/bias_shape"), {0, -1, 1, 1});
+  string policyBiasReshaped = uniqueName(nameCounter, "policy/pbr");
+  addNode(graph, "Reshape", {policyBias, pBiasShape}, policyBiasReshaped);
+
+  // Add bias to p1
+  string p1PlusBias = uniqueName(nameCounter, "policy/p1pb");
+  addNode(graph, "Add", {p1Out, policyBiasReshaped}, p1PlusBias);
+
+  // p1BN + activation + mask
+  string p1BNAct = addBNActivationMask(graph, nameCounter, p1PlusBias, policyHead.p1BN, policyHead.p1Activation, mask, "policy/p1bn");
+
+  // p2Conv: [N, p1C, H, W] -> [N, policyChannels, H, W]
+  string p2Out = addConvNode(graph, nameCounter, p1BNAct, policyHead.p2Conv, "policy/p2conv");
+
+  // Reshape to [N, policyChannels, H*W]
+  string pSpatialShape = addInt64Initializer(graph, uniqueName(nameCounter, "policy/spat_shape"), {0, numPolicyChannels, -1});
+  string policySpatial = uniqueName(nameCounter, "policy/spatial");
+  addNode(graph, "Reshape", {p2Out, pSpatialShape}, policySpatial);
+
+  // Pass move: gpoolToPassMul
+  string passOut;
+  if(modelVersion >= 15) {
+    // gpoolToPassMul -> bias -> activation -> gpoolToPassMul2
+    string passMul1 = addMatMulNode(graph, nameCounter, g1Pool, policyHead.gpoolToPassMul, "policy/pass_mul1");
+    string passBiased = addBiasNode(graph, nameCounter, passMul1, policyHead.gpoolToPassBias, "policy/pass_bias");
+    string passAct = addActivationNode(graph, nameCounter, passBiased, policyHead.passActivation.activation, "policy/pass_act");
+    passOut = addMatMulNode(graph, nameCounter, passAct, policyHead.gpoolToPassMul2, "policy/pass_mul2");
+  } else {
+    passOut = addMatMulNode(graph, nameCounter, g1Pool, policyHead.gpoolToPassMul, "policy/pass_mul");
+  }
+
+  // Reshape pass to [N, policyChannels, 1]
+  string passShape = addInt64Initializer(graph, uniqueName(nameCounter, "policy/pass_shape"), {0, numPolicyChannels, 1});
+  string passReshaped = uniqueName(nameCounter, "policy/pass_r");
+  addNode(graph, "Reshape", {passOut, passShape}, passReshaped);
+
+  // Concat spatial + pass -> out_policy [N, policyChannels, H*W+1]
+  onnx::NodeProto* policyConcatNode = addNode(graph, "Concat", {policySpatial, passReshaped}, "out_policy");
+  setAttrInt(policyConcatNode, "axis", 2);
+
+  // ------------------------------------------------------------------
+  // Value Head
+  // ------------------------------------------------------------------
+
+  // v1Conv
+  string v1Out = addConvNode(graph, nameCounter, trunkOut, valueHead.v1Conv, "value/v1conv");
+
+  // v1BN + activation + mask
+  string v1BNAct = addBNActivationMask(graph, nameCounter, v1Out, valueHead.v1BN, valueHead.v1Activation, mask, "value/v1bn");
+
+  // Value head global pooling
+  string v1Pool = addValueHeadGPool(graph, nameCounter, v1BNAct, mask, maskSumHW, "value/vpool");
+
+  // v2Mul + v2Bias + v2Activation
+  string v2Out = addMatMulNode(graph, nameCounter, v1Pool, valueHead.v2Mul, "value/v2mul");
+  string v2Biased = addBiasNode(graph, nameCounter, v2Out, valueHead.v2Bias, "value/v2bias");
+  string v2Act = addActivationNode(graph, nameCounter, v2Biased, valueHead.v2Activation.activation, "value/v2act");
+
+  // v3Mul + v3Bias -> out_value [N, 3]
+  string v3Out = addMatMulNode(graph, nameCounter, v2Act, valueHead.v3Mul, "value/v3mul");
+  string v3Biased = addBiasNode(graph, nameCounter, v3Out, valueHead.v3Bias, "value/v3bias");
+  addNode(graph, "Identity", {v3Biased}, "out_value");
+
+  // sv3Mul + sv3Bias -> out_miscvalue [N, numScoreValueChannels]
+  string sv3Out = addMatMulNode(graph, nameCounter, v2Act, valueHead.sv3Mul, "value/sv3mul");
+  string sv3Biased = addBiasNode(graph, nameCounter, sv3Out, valueHead.sv3Bias, "value/sv3bias");
+  addNode(graph, "Identity", {sv3Biased}, "out_miscvalue");
+
+  // vOwnershipConv -> out_ownership [N, 1, H, W]
+  string ownOut = addConvNode(graph, nameCounter, v1BNAct, valueHead.vOwnershipConv, "value/own_conv");
+  addNode(graph, "Identity", {ownOut}, "out_ownership");
+
+  // ------------------------------------------------------------------
+  // Graph Outputs
+  // ------------------------------------------------------------------
+  int policyResultLen = nnXLen * nnYLen + 1;
+  addGraphOutput(graph, "out_policy", {-1, numPolicyChannels, policyResultLen});
+  addGraphOutput(graph, "out_value", {-1, numValueChannels});
+  addGraphOutput(graph, "out_miscvalue", {-1, numScoreValueChannels});
+  addGraphOutput(graph, "out_ownership", {-1, numOwnershipChannels, nnYLen, nnXLen});
+
+  // ------------------------------------------------------------------
+  // Serialize to string
+  // ------------------------------------------------------------------
+  string serialized;
+  if(!model.SerializeToString(&serialized))
+    throw StringError("ONNX backend: failed to serialize ONNX model to protobuf");
+
+  return serialized;
+}
diff --git a/cpp/neuralnet/onnxmodelbuilder.h b/cpp/neuralnet/onnxmodelbuilder.h
new file mode 100644
index 000000000..96bc8e07a
--- /dev/null
+++ b/cpp/neuralnet/onnxmodelbuilder.h
@@ -0,0 +1,14 @@
+#ifndef NEURALNET_ONNXMODELBUILDER_H_
+#define NEURALNET_ONNXMODELBUILDER_H_
+
+#include <string>
+#include "../neuralnet/desc.h"
+
+namespace OnnxModelBuilder {
+  // Builds a serialized ONNX ModelProto from a KataGo ModelDesc.
+  // The model is constructed for a fixed spatial size of nnXLen x nnYLen.
+  // Returns the protobuf-serialized bytes, ready for Ort::Session creation.
+  std::string buildOnnxModel(const ModelDesc& modelDesc, int nnXLen, int nnYLen);
+}
+
+#endif // NEURALNET_ONNXMODELBUILDER_H_
diff --git a/cpp/program/gtpconfig.cpp b/cpp/program/gtpconfig.cpp
index d8f1decf3..aa0e8393e 100644
--- a/cpp/program/gtpconfig.cpp
+++ b/cpp/program/gtpconfig.cpp
@@ -280,6 +280,8 @@ nnCacheSizePowerOfTwo = $$NN_CACHE_SIZE_POWER_OF_TWO
 # Size of mutex pool for nnCache is (2 ** this).
 nnMutexPoolSizePowerOfTwo = $$NN_MUTEX_POOL_SIZE_POWER_OF_TWO
 
+$$ONNX_PROVIDER
+
 $$MULTIPLE_GPUS
 
 # ===========================================================================
@@ -466,7 +468,8 @@ string GTPConfig::makeConfig(
   std::vector<int> deviceIdxs,
   int nnCacheSizePowerOfTwo,
   int nnMutexPoolSizePowerOfTwo,
-  int numSearchThreads
+  int numSearchThreads,
+  const string& onnxProvider
 ) {
   string config = gtpBasePart1 + gtpBasePart2;
   auto replace = [&](const string& key, const string& replacement) {
@@ -519,12 +522,27 @@ string GTPConfig::makeConfig(
   replace("$$NN_CACHE_SIZE_POWER_OF_TWO", Global::intToString(nnCacheSizePowerOfTwo));
   replace("$$NN_MUTEX_POOL_SIZE_POWER_OF_TWO", Global::intToString(nnMutexPoolSizePowerOfTwo));
 
+#ifdef USE_ONNX_BACKEND
+  string onnxProviderLower = Global::toLower(Global::trim(onnxProvider));
+  string onnxProviderConfigValue = onnxProviderLower.empty() ? "cpu" : onnxProviderLower;
+  replace("$$ONNX_PROVIDER", "onnxProvider = " + onnxProviderConfigValue);
+#else
+  (void)onnxProvider;
+  replace("$$ONNX_PROVIDER", "");
+#endif
+
   if(deviceIdxs.size() <= 0) {
     replace("$$MULTIPLE_GPUS", "");
   }
   else {
     string replacement = "";
     replacement += "numNNServerThreadsPerModel = " + Global::uint64ToString(deviceIdxs.size()) + "\n";
+#ifdef USE_ONNX_BACKEND
+    bool onnxProviderSupportsThreadDeviceMap =
+      onnxProviderConfigValue == "cuda" ||
+      onnxProviderConfigValue == "tensorrt" ||
+      onnxProviderConfigValue == "migraphx";
+#endif
 
     for(int i = 0; i<deviceIdxs.size(); i++) {
 #ifdef USE_CUDA_BACKEND
@@ -539,7 +557,18 @@ string GTPConfig::makeConfig(
 #ifdef USE_ROCM_BACKEND
       replacement += "rocmDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
 #endif
+#ifdef USE_ONNX_BACKEND
+      if(onnxProviderSupportsThreadDeviceMap)
+        replacement += "onnxDeviceToUseThread" + Global::intToString(i) + " = " + Global::intToString(deviceIdxs[i]) + "\n";
+#endif
+    }
+#ifdef USE_ONNX_BACKEND
+    if(!onnxProviderSupportsThreadDeviceMap) {
+      replacement +=
+        "# NOTE: onnxDeviceToUseThread* is mainly for onnxProvider = cuda / tensorrt / migraphx.\n"
+        "# For onnxProvider = " + onnxProviderConfigValue + ", per-thread device mapping is usually unnecessary.\n";
     }
+#endif
     replace("$$MULTIPLE_GPUS", replacement);
   }
 
diff --git a/cpp/program/gtpconfig.h b/cpp/program/gtpconfig.h
index f70e329b5..93a57b6cf 100644
--- a/cpp/program/gtpconfig.h
+++ b/cpp/program/gtpconfig.h
@@ -14,7 +14,8 @@ namespace GTPConfig {
     std::vector<int> deviceIdxs,
     int nnCacheSizePowerOfTwo,
     int nnMutexPoolSizePowerOfTwo,
-    int numSearchThreads
+    int numSearchThreads,
+    const std::string& onnxProvider = "cpu"
   );
 }
 
diff --git a/cpp/program/setup.cpp b/cpp/program/setup.cpp
index fe4e5d7c1..2c7ec0847 100644
--- a/cpp/program/setup.cpp
+++ b/cpp/program/setup.cpp
@@ -21,6 +21,7 @@ std::vector<std::string> Setup::getBackendPrefixes() {
   prefixes.push_back("opencl");
   prefixes.push_back("rocm");
   prefixes.push_back("eigen");
+  prefixes.push_back("onnx");
   prefixes.push_back("dummybackend");
   return prefixes;
 }
@@ -89,12 +90,29 @@ vector<NNEvaluator*> Setup::initializeNNEvaluators(
   string backendPrefix = "opencl";
   #elif defined(USE_ROCM_BACKEND)
   string backendPrefix = "rocm";
+  #elif defined(USE_ONNX_BACKEND)
+  string backendPrefix = "onnx";
   #elif defined(USE_EIGEN_BACKEND)
   string backendPrefix = "eigen";
   #else
   string backendPrefix = "dummybackend";
   #endif
 
+#if !defined(USE_ONNX_BACKEND)
+  // In non-ONNX builds, fail fast on any ONNX-specific config instead of silently ignoring it.
+  {
+    const vector<string> allKeys = cfg.unusedKeys();
+    for(const string& key : allKeys) {
+      if(Global::isPrefix(Global::toLower(key),"onnx")) {
+        throw StringError(
+          "Config key '" + key + "' requires ONNX backend, but this executable is not built with USE_BACKEND=ONNX. "
+          "Remove onnx* settings or rebuild with -DUSE_BACKEND=ONNX."
+        );
+      }
+    }
+  }
+#endif
+
   //Automatically flag keys that are for other backends as used so that we don't warn about unused keys
   //for those options
   for(const string& prefix: getBackendPrefixes()) {
@@ -144,7 +162,7 @@ vector<NNEvaluator*> Setup::initializeNNEvaluators(
         requireExactNNLen = cfg.getBool("requireMaxBoardSize");
     }
 
-    bool inputsUseNHWC = backendPrefix == "opencl" || backendPrefix == "trt" || backendPrefix == "metal" || backendPrefix == "rocm" ? false : true;
+    bool inputsUseNHWC = backendPrefix == "opencl" || backendPrefix == "trt" || backendPrefix == "metal" || backendPrefix == "rocm" || backendPrefix == "onnx" ? false : true;
     if(cfg.contains(backendPrefix+"InputsUseNHWC"+idxStr))
       inputsUseNHWC = cfg.getBool(backendPrefix+"InputsUseNHWC"+idxStr);
     else if(cfg.contains("inputsUseNHWC"+idxStr))
@@ -223,9 +241,38 @@ vector<NNEvaluator*> Setup::initializeNNEvaluators(
 
     string homeDataDirOverride = loadHomeDataDirOverride(cfg);
 
-    string openCLTunerFile;
+    string backendExtraParam;
+#if defined(USE_ONNX_BACKEND)
+    string onnxProvider = cfg.contains("onnxProvider") ? cfg.getString("onnxProvider") : "cpu";
+    backendExtraParam = "provider=" + onnxProvider;
+    if(cfg.contains("onnxInputSpatial"))
+      backendExtraParam += ";inputSpatial=" + cfg.getString("onnxInputSpatial");
+    if(cfg.contains("onnxInputGlobal"))
+      backendExtraParam += ";inputGlobal=" + cfg.getString("onnxInputGlobal");
+    if(cfg.contains("onnxInputMeta"))
+      backendExtraParam += ";inputMeta=" + cfg.getString("onnxInputMeta");
+    if(cfg.contains("onnxOutputPolicy"))
+      backendExtraParam += ";outputPolicy=" + cfg.getString("onnxOutputPolicy");
+    if(cfg.contains("onnxOutputValue"))
+      backendExtraParam += ";outputValue=" + cfg.getString("onnxOutputValue");
+    if(cfg.contains("onnxOutputMiscvalue"))
+      backendExtraParam += ";outputMiscvalue=" + cfg.getString("onnxOutputMiscvalue");
+    if(cfg.contains("onnxOutputOwnership"))
+      backendExtraParam += ";outputOwnership=" + cfg.getString("onnxOutputOwnership");
+    if(cfg.contains("onnxModelVersion"))
+      backendExtraParam += ";modelVersion=" + cfg.getString("onnxModelVersion");
+    if(cfg.contains("onnxOpenVINODeviceType"))
+      backendExtraParam += ";openvinoDeviceType=" + cfg.getString("onnxOpenVINODeviceType");
+    if(cfg.contains("onnxOpenVINODeviceId"))
+      backendExtraParam += ";openvinoDeviceId=" + cfg.getString("onnxOpenVINODeviceId");
+    if(cfg.contains("onnxOpenVINOEnableNPUFastCompile"))
+      backendExtraParam += ";openvinoEnableNPUFastCompile=" + cfg.getString("onnxOpenVINOEnableNPUFastCompile");
+    if(cfg.contains("onnxOpenVINOCacheDir"))
+      backendExtraParam += ";openvinoCacheDir=" + cfg.getString("onnxOpenVINOCacheDir");
+#else
     if(cfg.contains("openclTunerFile"))
-      openCLTunerFile = cfg.getString("openclTunerFile");
+      backendExtraParam = cfg.getString("openclTunerFile");
+#endif
     bool openCLReTunePerBoardSize = false;
     if(cfg.contains("openclReTunePerBoardSize"))
       openCLReTunePerBoardSize = cfg.getBool("openclReTunePerBoardSize");
@@ -318,7 +365,7 @@ vector<NNEvaluator*> Setup::initializeNNEvaluators(
       nnCacheSizePowerOfTwo,
       nnMutexPoolSizePowerOfTwo,
       debugSkipNeuralNet,
-      openCLTunerFile,
+      backendExtraParam,
       homeDataDirOverride,
       openCLReTunePerBoardSize,
       useFP16Mode,
diff --git a/cpp/runonnxtests.sh b/cpp/runonnxtests.sh
new file mode 100644
index 000000000..2aff64733
--- /dev/null
+++ b/cpp/runonnxtests.sh
@@ -0,0 +1,43 @@
+#!/bin/bash -eux
+set -o pipefail
+{
+# ---------------------------------------------------------------
+# ONNX backend integration tests
+#
+# Exercises three levels of the inference pipeline:
+#   1. runtinynntests       — tiny model, full pipeline (no external model)
+#   2. testgpuerror -quick  — FP32 unbatched vs batched comparison
+#   3. runnnevalcanarytests — sanity checks on real game positions
+# ---------------------------------------------------------------
+
+mkdir -p tests/scratch
+
+# 1. Tiny NN tests — self-contained, no external model needed
+echo "=== runtinynntests ==="
+./katago runtinynntests tests/scratch 1.0 \
+  | grep -v ': nnRandSeed0 = ' \
+  | grep -v 'finishing, processed'
+
+# 2. GPU error test (quick) — compares unbatched vs batched inference
+#    For CPU ONNX provider both paths are FP32, so errors should be near zero.
+#    Any ownership indexing bug would surface as large ownership error.
+echo "=== testgpuerror -quick ==="
+./katago testgpuerror \
+  -config configs/gtp_example.cfg \
+  -model tests/models/g170-b6c96-s175395328-d26788732.bin.gz \
+  -quick \
+  -override-config "nnRandSeed=forTesting,forDeterministicTesting=true"
+
+# 3. NN eval canary tests — sanity checks on 5 real game positions
+#    Uses symmetries 0, 3, 6 (same as runsearchtests.sh)
+echo "=== runnnevalcanarytests ==="
+./katago runnnevalcanarytests configs/gtp_example.cfg tests/models/g170e-b10c128-s1141046784-d204142634.bin.gz 0 \
+  | grep -v ': nnRandSeed0 = '
+./katago runnnevalcanarytests configs/gtp_example.cfg tests/models/g170e-b10c128-s1141046784-d204142634.bin.gz 3 \
+  | grep -v ': nnRandSeed0 = '
+./katago runnnevalcanarytests configs/gtp_example.cfg tests/models/g170e-b10c128-s1141046784-d204142634.bin.gz 6 \
+  | grep -v ': nnRandSeed0 = '
+
+echo "=== All ONNX tests passed ==="
+exit 0
+}

From d828f21fba094107d9987f96d849355778a73489 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Mon, 16 Mar 2026 18:39:44 +0800
Subject: [PATCH 20/24] Resume gitignore

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 94076bdce..2e933d553 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,7 +23,7 @@ cpp/mainopencl
 cpp/katago
 cpp/configs
 cpp/evalsgf
-# cpp/run*.sh
+cpp/run*.sh
 cpp/tests/scratch
 cpp/program/gitinfo.h
 

From 358dd84b7ef2efcc4e1225bbb7ac90cd5cbc0565 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Mon, 16 Mar 2026 20:59:28 +0800
Subject: [PATCH 21/24] Edit README.md and Compiling.md

---
 Compiling.md                  | 110 ++++++++++++++++++----------------
 README.md                     |   4 +-
 cpp/neuralnet/onnxbackend.cpp |   2 +-
 3 files changed, 60 insertions(+), 56 deletions(-)

diff --git a/Compiling.md b/Compiling.md
index 8fcea46f8..eda612dae 100644
--- a/Compiling.md
+++ b/Compiling.md
@@ -56,59 +56,6 @@ As also mentioned in the instructions below but repeated here for visibility, if
    * You will probably want to edit `configs/gtp_example.cfg` (see "Tuning for Performance" above).
    * If using OpenCL, you will want to verify that KataGo is picking up the correct device when you run it (e.g. some systems may have both an Intel CPU OpenCL and GPU OpenCL, if KataGo appears to pick the wrong one, you can correct this by specifying `openclGpuToUse` in `configs/gtp_example.cfg`).
 
-## ONNX Runtime Backend
-The ONNX backend uses ONNX Runtime for inference, and supports both:
-* `.onnx` models loaded directly.
-* `.bin.gz` KataGo models via internal conversion to ONNX graph (requires ONNX protobuf dependencies in CMake).
-
-### Windows Intel NPU (OpenVINO EP) Setup
-1. Install Visual Studio Community or Visual Studio 2026 Build Tools:
-   * https://visualstudio.microsoft.com/zh-hans/downloads/
-   * In installer workloads, select **Desktop development with C++**.
-2. Install Intel NPU driver:
-   * https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html
-3. Install OpenVINO 2026 archive package on Windows:
-   * https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html
-   * Typical install root looks like: `C:\Program Files (x86)\Intel\openvino_2026.0`
-4. Build ONNX Runtime with OpenVINO EP for NPU (follow official docs):
-   * https://onnxruntime.ai/docs/build/eps.html#openvino
-   * Set OpenVINO EP build option so `use_openvino` is `NPU` (for example `--use_openvino NPU` in ORT build.py).
-
-### Prepare `ONNXRUNTIME_ROOT` in KataGo
-Create:
-* `cpp/external/onnxruntime-win-x64-openvino/include`
-* `cpp/external/onnxruntime-win-x64-openvino/lib`
-
-Copy from your ONNX Runtime build/package output (`<ORT_PACKAGE_ROOT>`) to KataGo:
-* `<ORT_PACKAGE_ROOT>/include/*` -> `cpp/external/onnxruntime-win-x64-openvino/include/`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.lib`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.dll`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_shared.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_shared.dll`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_openvino.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_openvino.dll`
-
-Optional if present in your ORT output:
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_shared.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_openvino.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/`
-
-### Minimal KataGo Build Commands (Windows, ONNX backend)
-On Windows, `KATAGO_AUTO_FETCH_DEPS=ON` by default, so missing `zlib`, `onnx`, and `protobuf` dependencies are auto-fetched via vcpkg into `cpp/build/deps/vcpkg`.
-
-```
-cmake -S cpp -B cpp/build -G "Visual Studio 18 2026" -A x64 -DUSE_BACKEND=ONNX -DONNXRUNTIME_ROOT=cpp/external/onnxruntime-win-x64-openvino
-cmake --build cpp/build --config Release
-```
-
-If you want to disable auto-fetch and provide dependencies manually:
-* `-DKATAGO_AUTO_FETCH_DEPS=OFF`
-* plus `-DONNX_INCLUDE_DIR=... -DONNX_PROTO_LIB=... -DPROTOBUF_INCLUDE_DIR=... -DPROTOBUF_LIB=... -DZLIB_INCLUDE_DIR=... -DZLIB_LIBRARY=...`
-
-Typical run config for Intel NPU:
-* `onnxProvider = openvino`
-* `onnxOpenVINODeviceType = NPU`
-* `onnxOpenVINOEnableNPUFastCompile = true` (optional; may be ignored on ORT builds that do not support this key)
-
-Multi-device assignment is mainly for `onnxProvider=cuda/tensorrt/migraphx` (`onnxDeviceToUseThread*`).
-For `onnxProvider=openvino` on Intel NPU, a single device is typically used.
 
 ## Windows
    * TLDR:
@@ -168,6 +115,63 @@ For `onnxProvider=openvino` on Intel NPU, a single device is typically used.
    * You will probably want to edit `configs/gtp_example.cfg` (see "Tuning for Performance" above).
    * If using OpenCL, you will want to verify that KataGo is picking up the correct device (e.g. some systems may have both an Intel CPU OpenCL and GPU OpenCL, if KataGo appears to pick the wrong one, you can correct this by specifying `openclGpuToUse` in `configs/gtp_example.cfg`).
 
+##### ONNX Runtime Backend
+The ONNX backend uses ONNX Runtime for inference, and supports both:
+* `.onnx` models loaded directly.
+* `.bin.gz` KataGo models via internal conversion to ONNX graph (requires ONNX protobuf dependencies in CMake).
+
+##### Windows Intel NPU (OpenVINO EP) Setup
+1. Install Visual Studio 2026 Community or Visual Studio 2026 Build Tools:
+   * https://visualstudio.microsoft.com/zh-hans/downloads/
+   * In installer workloads, select **Desktop development with C++**.
+2. Install Intel NPU driver:
+   * https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html
+3. Install OpenVINO 2026 archive package on Windows:
+   * https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html
+   * Typical install root looks like: `C:\Program Files (x86)\Intel\openvino_2026.0`
+4. Add these to System PATH:
+   * `C:\Program Files (x86)\Intel\openvino_2026.0\runtime\bin\intel64\Release`
+   * `C:\Program Files (x86)\Intel\openvino_2026.0\runtime\3rdparty\tbb\bin`
+5. Build ONNX Runtime with OpenVINO EP for NPU (follow official docs):
+   * https://onnxruntime.ai/docs/build/eps.html#openvino
+   * Set OpenVINO EP build option so `use_openvino` is `NPU` (for example `--use_openvino NPU` in ORT build.py).
+
+##### Prepare `ONNXRUNTIME_ROOT` in KataGo
+Create:
+* `cpp/external/onnxruntime-win-x64-openvino/include`
+* `cpp/external/onnxruntime-win-x64-openvino/lib`
+
+Copy from your ONNX Runtime build/package output (`<ORT_PACKAGE_ROOT>`) to KataGo:
+* `<ORT_PACKAGE_ROOT>/include/*` -> `cpp/external/onnxruntime-win-x64-openvino/include/`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.lib`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.dll`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_shared.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_shared.dll`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_openvino.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_openvino.dll`
+
+Optional if present in your ORT output:
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_shared.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/`
+* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_openvino.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/`
+
+##### Minimal KataGo Build Commands (Windows, ONNX backend)
+On Windows, `KATAGO_AUTO_FETCH_DEPS=ON` by default, so missing `zlib`, `onnx`, and `protobuf` dependencies are auto-fetched via vcpkg into `cpp/build/deps/vcpkg`.
+
+```
+cmake -S cpp -B cpp/build -G "Visual Studio 18 2026" -A x64 -DUSE_BACKEND=ONNX -DONNXRUNTIME_ROOT=cpp/external/onnxruntime-win-x64-openvino
+cmake --build cpp/build --config Release
+```
+
+If you want to disable auto-fetch and provide dependencies manually:
+* `-DKATAGO_AUTO_FETCH_DEPS=OFF`
+* plus `-DONNX_INCLUDE_DIR=... -DONNX_PROTO_LIB=... -DPROTOBUF_INCLUDE_DIR=... -DPROTOBUF_LIB=... -DZLIB_INCLUDE_DIR=... -DZLIB_LIBRARY=...`
+
+Typical run config for Intel NPU:
+* `onnxProvider = openvino`
+* `onnxOpenVINODeviceType = NPU`
+* `onnxOpenVINOEnableNPUFastCompile = true` (optional; may be ignored on ORT builds that do not support this key)
+
+Multi-device assignment is mainly for `onnxProvider=cuda/tensorrt/migraphx` (`onnxDeviceToUseThread*`).
+For `onnxProvider=openvino` on Intel NPU, a single device is typically used.
+
 ## MacOS
    * TLDR:
      ```
diff --git a/README.md b/README.md
index 04560bb7a..068ab22e5 100644
--- a/README.md
+++ b/README.md
@@ -145,8 +145,8 @@ path/to/katago.exe gtp -model path/to/<NEURALNET>.bin.gz -config path/to/gtp_cus
 If you want to use ONNX Runtime + OpenVINO on Intel NPU:
 * Install Intel NPU driver: https://www.intel.com/content/www/us/en/download/794734/intel-npu-driver-windows.html
 * Install OpenVINO archive package (Windows): https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html
-* Build ONNX Runtime with OpenVINO EP for NPU (`use_openvino=NPU`): https://onnxruntime.ai/docs/build/eps.html#openvino
-* See [Compiling.md](Compiling.md) for the full build/copy steps for `cpp/external/onnxruntime-win-x64-openvino`.
+* Typical install root looks like: `C:\Program Files (x86)\Intel\openvino_2026.0`
+* Add `C:\Program Files (x86)\Intel\openvino_2026.0\runtime\bin\intel64\Release` and `C:\Program Files (x86)\Intel\openvino_2026.0\runtime\3rdparty\tbb\bin` to System PATH
 
 Minimal commands:
 ```
diff --git a/cpp/neuralnet/onnxbackend.cpp b/cpp/neuralnet/onnxbackend.cpp
index 551fc5d4b..4a537a977 100644
--- a/cpp/neuralnet/onnxbackend.cpp
+++ b/cpp/neuralnet/onnxbackend.cpp
@@ -236,7 +236,7 @@ struct ComputeContext {
       nnXLen(xLen),
       nnYLen(yLen),
       providerName(provider),
-      openvinoDeviceType("CPU"),
+      openvinoDeviceType("NPU"),
       openvinoDeviceId(""),
       openvinoEnableNPUFastCompile(false),
       openvinoCacheDir(""),

From 496bb964460e7b659bb38da6fb30d1159f3883ca Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Tue, 17 Mar 2026 07:29:50 +0800
Subject: [PATCH 22/24] Add Linux Intel NPU support

---
 Compiling.md       | 130 ++++++++++++++++++++++++++++++++++++++-----
 README.md          |  30 ++++++++--
 cpp/CMakeLists.txt | 136 ++++++++++++++++++++++++++++++++++-----------
 3 files changed, 246 insertions(+), 50 deletions(-)

diff --git a/Compiling.md b/Compiling.md
index eda612dae..3908e8350 100644
--- a/Compiling.md
+++ b/Compiling.md
@@ -56,6 +56,79 @@ As also mentioned in the instructions below but repeated here for visibility, if
    * You will probably want to edit `configs/gtp_example.cfg` (see "Tuning for Performance" above).
    * If using OpenCL, you will want to verify that KataGo is picking up the correct device when you run it (e.g. some systems may have both an Intel CPU OpenCL and GPU OpenCL, if KataGo appears to pick the wrong one, you can correct this by specifying `openclGpuToUse` in `configs/gtp_example.cfg`).
 
+##### ONNX Runtime Backend (Linux)
+The ONNX backend uses ONNX Runtime for inference, and supports both:
+* `.onnx` models loaded directly.
+* `.bin.gz` KataGo models via internal conversion to ONNX graph (requires ONNX protobuf dependencies in CMake).
+
+##### Linux Intel NPU (OpenVINO EP) Setup
+1. Install Intel NPU driver on Linux:
+   * https://github.com/intel/linux-npu-driver
+2. Install OpenVINO via system package manager (APT example):
+   * https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-apt.html
+3. Build ONNX Runtime with OpenVINO EP for NPU (same ORT flow as Windows):
+   * https://onnxruntime.ai/docs/build/eps.html#openvino
+   * Set OpenVINO EP build option so `use_openvino` is `NPU` (for example `--use_openvino NPU` in ORT build.py).
+
+##### Prepare `ONNXRUNTIME_ROOT` in KataGo (Linux)
+Use package root:
+* `cpp/external/onnxruntime-linux-x64-openvino`
+
+Linux one-to-one mapping (`<ORT_PACKAGE_ROOT>` -> KataGo):
+
+Include files:
+
+| Source (`<ORT_PACKAGE_ROOT>`) | Destination (KataGo) |
+| --- | --- |
+| `include/core/*` | `cpp/external/onnxruntime-linux-x64-openvino/include/core/` |
+| `include/cpu_provider_factory.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/cpu_provider_factory.h` |
+| `include/provider_options.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/provider_options.h` |
+| `include/onnxruntime_c_api.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_c_api.h` |
+| `include/onnxruntime_cxx_api.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_cxx_api.h` |
+| `include/onnxruntime_cxx_inline.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_cxx_inline.h` |
+| `include/onnxruntime_env_config_keys.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_env_config_keys.h` |
+| `include/onnxruntime_ep_c_api.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_ep_c_api.h` |
+| `include/onnxruntime_ep_device_ep_metadata_keys.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_ep_device_ep_metadata_keys.h` |
+| `include/onnxruntime_float16.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_float16.h` |
+| `include/onnxruntime_lite_custom_op.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_lite_custom_op.h` |
+| `include/onnxruntime_run_options_config_keys.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_run_options_config_keys.h` |
+| `include/onnxruntime_session_options_config_keys.h` | `cpp/external/onnxruntime-linux-x64-openvino/include/onnxruntime_session_options_config_keys.h` |
+
+Library/config/pkgconfig files:
+
+| Source (`<ORT_PACKAGE_ROOT>`) | Destination (KataGo) |
+| --- | --- |
+| `lib/libonnxruntime_providers_openvino.so` | `cpp/external/onnxruntime-linux-x64-openvino/lib/libonnxruntime_providers_openvino.so` |
+| `lib/libonnxruntime_providers_shared.so` | `cpp/external/onnxruntime-linux-x64-openvino/lib/libonnxruntime_providers_shared.so` |
+| `lib/libonnxruntime.so.1.24.3` | `cpp/external/onnxruntime-linux-x64-openvino/lib/libonnxruntime.so.1.24.3` |
+| `lib/libonnxruntime.so.1` (symlink to `.1.24.3`) | `cpp/external/onnxruntime-linux-x64-openvino/lib/libonnxruntime.so.1` |
+| `lib/libonnxruntime.so` (symlink to `.1`) | `cpp/external/onnxruntime-linux-x64-openvino/lib/libonnxruntime.so` |
+| `lib/cmake/onnxruntime/onnxruntimeConfig.cmake` | `cpp/external/onnxruntime-linux-x64-openvino/lib/cmake/onnxruntime/onnxruntimeConfig.cmake` |
+| `lib/cmake/onnxruntime/onnxruntimeConfigVersion.cmake` | `cpp/external/onnxruntime-linux-x64-openvino/lib/cmake/onnxruntime/onnxruntimeConfigVersion.cmake` |
+| `lib/cmake/onnxruntime/onnxruntimeTargets.cmake` | `cpp/external/onnxruntime-linux-x64-openvino/lib/cmake/onnxruntime/onnxruntimeTargets.cmake` |
+| `lib/cmake/onnxruntime/onnxruntimeTargets-release.cmake` | `cpp/external/onnxruntime-linux-x64-openvino/lib/cmake/onnxruntime/onnxruntimeTargets-release.cmake` |
+| `lib/pkgconfig/libonnxruntime.pc` | `cpp/external/onnxruntime-linux-x64-openvino/lib/pkgconfig/libonnxruntime.pc` |
+
+##### Minimal KataGo Build Commands (Linux, ONNX backend)
+On Linux, `KATAGO_AUTO_FETCH_DEPS=ON` can auto-fetch missing `zlib`, `onnx`, and `protobuf` dependencies via vcpkg into `cpp/build/deps/vcpkg`.
+
+```bash
+cmake -S cpp -B cpp/build -G Ninja -DUSE_BACKEND=ONNX -DONNXRUNTIME_ROOT=cpp/external/onnxruntime-linux-x64-openvino
+cmake --build cpp/build -j
+```
+
+If you want to disable auto-fetch and provide dependencies manually:
+* `-DKATAGO_AUTO_FETCH_DEPS=OFF`
+* plus `-DONNX_INCLUDE_DIR=... -DONNX_PROTO_LIB=... -DPROTOBUF_INCLUDE_DIR=... -DPROTOBUF_LIB=... -DZLIB_INCLUDE_DIR=... -DZLIB_LIBRARY=...`
+
+Typical run config for Intel NPU:
+* `onnxProvider = openvino`
+* `onnxOpenVINODeviceType = NPU`
+* `onnxOpenVINOEnableNPUFastCompile = true` (optional; may be ignored on ORT builds that do not support this key)
+
+Multi-device assignment is mainly for `onnxProvider=cuda/tensorrt/migraphx` (`onnxDeviceToUseThread*`).
+For `onnxProvider=openvino` on Intel NPU, a single device is typically used.
+
 
 ## Windows
    * TLDR:
@@ -136,21 +209,48 @@ The ONNX backend uses ONNX Runtime for inference, and supports both:
    * https://onnxruntime.ai/docs/build/eps.html#openvino
    * Set OpenVINO EP build option so `use_openvino` is `NPU` (for example `--use_openvino NPU` in ORT build.py).
 
-##### Prepare `ONNXRUNTIME_ROOT` in KataGo
-Create:
-* `cpp/external/onnxruntime-win-x64-openvino/include`
-* `cpp/external/onnxruntime-win-x64-openvino/lib`
-
-Copy from your ONNX Runtime build/package output (`<ORT_PACKAGE_ROOT>`) to KataGo:
-* `<ORT_PACKAGE_ROOT>/include/*` -> `cpp/external/onnxruntime-win-x64-openvino/include/`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.lib`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.dll`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_shared.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_shared.dll`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_openvino.dll` -> `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_openvino.dll`
-
-Optional if present in your ORT output:
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_shared.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/`
-* `<ORT_PACKAGE_ROOT>/lib/onnxruntime_providers_openvino.lib` -> `cpp/external/onnxruntime-win-x64-openvino/lib/`
+##### Prepare `ONNXRUNTIME_ROOT` in KataGo (Windows)
+Use package root:
+* `cpp/external/onnxruntime-win-x64-openvino`
+
+Windows one-to-one mapping (`<ORT_PACKAGE_ROOT>` -> KataGo):
+
+Windows and Linux generally share the same ORT include/config layout.
+The main differences are binary file names/extensions (`.dll/.lib` vs `.so`).
+
+Include files:
+
+| Source (`<ORT_PACKAGE_ROOT>`) | Destination (KataGo) |
+| --- | --- |
+| `include/core/*` | `cpp/external/onnxruntime-win-x64-openvino/include/core/` |
+| `include/cpu_provider_factory.h` | `cpp/external/onnxruntime-win-x64-openvino/include/cpu_provider_factory.h` |
+| `include/provider_options.h` | `cpp/external/onnxruntime-win-x64-openvino/include/provider_options.h` |
+| `include/onnxruntime_c_api.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_c_api.h` |
+| `include/onnxruntime_cxx_api.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_cxx_api.h` |
+| `include/onnxruntime_cxx_inline.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_cxx_inline.h` |
+| `include/onnxruntime_env_config_keys.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_env_config_keys.h` |
+| `include/onnxruntime_ep_c_api.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_ep_c_api.h` |
+| `include/onnxruntime_ep_device_ep_metadata_keys.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_ep_device_ep_metadata_keys.h` |
+| `include/onnxruntime_float16.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_float16.h` |
+| `include/onnxruntime_lite_custom_op.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_lite_custom_op.h` |
+| `include/onnxruntime_run_options_config_keys.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_run_options_config_keys.h` |
+| `include/onnxruntime_session_options_config_keys.h` | `cpp/external/onnxruntime-win-x64-openvino/include/onnxruntime_session_options_config_keys.h` |
+
+Library/config/pkgconfig files:
+
+| Source (`<ORT_PACKAGE_ROOT>`) | Destination (KataGo) |
+| --- | --- |
+| `lib/onnxruntime.lib` | `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.lib` |
+| `lib/onnxruntime.dll` | `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime.dll` |
+| `lib/onnxruntime_providers_shared.dll` | `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_shared.dll` |
+| `lib/onnxruntime_providers_openvino.dll` | `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_openvino.dll` |
+| `lib/onnxruntime_providers_shared.lib` (optional import lib) | `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_shared.lib` |
+| `lib/onnxruntime_providers_openvino.lib` (optional import lib) | `cpp/external/onnxruntime-win-x64-openvino/lib/onnxruntime_providers_openvino.lib` |
+| `lib/cmake/onnxruntime/onnxruntimeConfig.cmake` | `cpp/external/onnxruntime-win-x64-openvino/lib/cmake/onnxruntime/onnxruntimeConfig.cmake` |
+| `lib/cmake/onnxruntime/onnxruntimeConfigVersion.cmake` | `cpp/external/onnxruntime-win-x64-openvino/lib/cmake/onnxruntime/onnxruntimeConfigVersion.cmake` |
+| `lib/cmake/onnxruntime/onnxruntimeTargets.cmake` | `cpp/external/onnxruntime-win-x64-openvino/lib/cmake/onnxruntime/onnxruntimeTargets.cmake` |
+| `lib/cmake/onnxruntime/onnxruntimeTargets-release.cmake` | `cpp/external/onnxruntime-win-x64-openvino/lib/cmake/onnxruntime/onnxruntimeTargets-release.cmake` |
+| `lib/pkgconfig/libonnxruntime.pc` (optional) | `cpp/external/onnxruntime-win-x64-openvino/lib/pkgconfig/libonnxruntime.pc` |
 
 ##### Minimal KataGo Build Commands (Windows, ONNX backend)
 On Windows, `KATAGO_AUTO_FETCH_DEPS=ON` by default, so missing `zlib`, `onnx`, and `protobuf` dependencies are auto-fetched via vcpkg into `cpp/build/deps/vcpkg`.
diff --git a/README.md b/README.md
index 068ab22e5..bd9d86b42 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@
     - [OpenCL vs CUDA vs TensorRT vs ROCm vs Eigen vs ONNX](#opencl-vs-cuda-vs-tensorrt-vs-rocm-vs-eigen-vs-onnx)
     - [How To Use](#how-to-use)
       - [ONNX/OpenVINO Intel NPU Quick Start (Windows)](#onnxopenvino-intel-npu-quick-start-windows)
+      - [ONNX/OpenVINO Intel NPU Quick Start (Linux)](#onnxopenvino-intel-npu-quick-start-linux)
       - [Human-style Play and Analysis](#human-style-play-and-analysis)
       - [Other Commands:](#other-commands)
     - [Tuning for Performance](#tuning-for-performance)
@@ -151,16 +152,37 @@ If you want to use ONNX Runtime + OpenVINO on Intel NPU:
 Minimal commands:
 ```
 # 1) Export .bin/.bin.gz to ONNX (default export size is 19x19)
-./katago.exe exportonnx -model <NEURALNET>.bin.gz -output <NEURALNET>.19x19.onnx
+./katago.exe exportonnx -model <NEURALNET>.bin.gz -output <NEURALNET>.onnx
 
 # 2) Benchmark on Intel NPU (OpenVINO provider)
-./katago.exe benchmark -config cpp/configs/gtp_example.cfg -model <NEURALNET>.19x19.onnx -visits 32 -threads 1 -n 2 -override-config onnxProvider=openvino,onnxOpenVINODeviceType=NPU,numSearchThreads=1,numNNServerThreadsPerModel=1
+./katago.exe benchmark -config cpp/configs/gtp_example.cfg -model <NEURALNET>.onnx
 
 # 3) Run GTP for GUI tools (Sabaki/Lizzie/q5Go/etc)
-./katago.exe gtp -config cpp/configs/gtp_example.cfg -model <NEURALNET>.19x19.onnx
+./katago.exe gtp -config cpp/configs/gtp_example.cfg -model <NEURALNET>.onnx
 
 If you don't prepare config file, then use -override-config args, like:
-./katago.exe gtp -config cpp/configs/gtp_example.cfg -model <NEURALNET>.19x19.onnx -override-config onnxProvider=openvino,onnxOpenVINODeviceType=NPU
+./katago.exe gtp -config cpp/configs/gtp_example.cfg -model <NEURALNET>.onnx -override-config onnxProvider=openvino,onnxOpenVINODeviceType=NPU
+```
+
+#### ONNX/OpenVINO Intel NPU Quick Start (Linux)
+
+If you want to use ONNX Runtime + OpenVINO on Intel NPU:
+* Install Intel NPU driver (Linux): https://github.com/intel/linux-npu-driver
+* Install OpenVINO via system package manager (APT example): https://docs.openvino.ai/2025/get-started/install-openvino/install-openvino-apt.html
+
+Minimal commands:
+```bash
+# 1) Export .bin/.bin.gz to ONNX (default export size is 19x19)
+./katago exportonnx -model <NEURALNET>.bin.gz -output <NEURALNET>.onnx
+
+# 2) Benchmark on Intel NPU (OpenVINO provider)
+./katago benchmark -config cpp/configs/gtp_example.cfg -model <NEURALNET>.onnx
+
+# 3) Run GTP for GUI tools (Sabaki/Lizzie/q5Go/etc)
+./katago gtp -config cpp/configs/gtp_example.cfg -model <NEURALNET>.onnx
+
+# If you don't prepare config file, use -override-config:
+./katago gtp -config cpp/configs/gtp_example.cfg -model <NEURALNET>.onnx -override-config onnxProvider=openvino,onnxOpenVINODeviceType=NPU
 ```
 
 #### Human-style Play and Analysis
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index b30bb964a..1dd8ba5f7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -58,20 +58,31 @@ set(USE_BIGGER_BOARDS_EXPENSIVE 0 CACHE BOOL "Allow boards up to size 50. Compil
 set(USE_CACHE_TENSORRT_PLAN 0 CACHE BOOL "Use TENSORRT plan cache. May use a lot of disk space. Only applies when USE_BACKEND is TENSORRT.")
 mark_as_advanced(USE_CACHE_TENSORRT_PLAN)
 
-if(WIN32)
+if(WIN32 OR (UNIX AND NOT APPLE))
   set(_katago_auto_fetch_default ON)
 else()
   set(_katago_auto_fetch_default OFF)
 endif()
-option(KATAGO_AUTO_FETCH_DEPS "Automatically fetch missing dependencies into build/deps (Windows uses vcpkg)." ${_katago_auto_fetch_default})
+option(KATAGO_AUTO_FETCH_DEPS "Automatically fetch missing dependencies into build/deps (Windows/Linux use vcpkg)." ${_katago_auto_fetch_default})
 set(KATAGO_DEPS_DIR "${CMAKE_SOURCE_DIR}/build/deps" CACHE PATH "Directory for auto-fetched third-party dependencies")
-set(KATAGO_VCPKG_TRIPLET "x64-windows" CACHE STRING "vcpkg triplet used by KATAGO_AUTO_FETCH_DEPS on Windows")
+if(WIN32)
+  set(_katago_vcpkg_triplet_default "x64-windows")
+elseif(UNIX AND NOT APPLE)
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64)$")
+    set(_katago_vcpkg_triplet_default "arm64-linux")
+  else()
+    set(_katago_vcpkg_triplet_default "x64-linux")
+  endif()
+else()
+  set(_katago_vcpkg_triplet_default "x64-windows")
+endif()
+set(KATAGO_VCPKG_TRIPLET "${_katago_vcpkg_triplet_default}" CACHE STRING "vcpkg triplet used by KATAGO_AUTO_FETCH_DEPS")
 set(KATAGO_VCPKG_ROOT "${KATAGO_DEPS_DIR}/vcpkg" CACHE PATH "Path to local vcpkg clone used by KATAGO_AUTO_FETCH_DEPS")
 mark_as_advanced(KATAGO_VCPKG_TRIPLET KATAGO_VCPKG_ROOT)
 
 function(katago_vcpkg_bootstrap_if_needed)
-  if(NOT WIN32)
-    message(FATAL_ERROR "katago_vcpkg_bootstrap_if_needed is only supported on Windows")
+  if(NOT WIN32 AND NOT (UNIX AND NOT APPLE))
+    message(FATAL_ERROR "katago_vcpkg_bootstrap_if_needed is only supported on Windows and Linux")
   endif()
 
   if(NOT KATAGO_AUTO_FETCH_DEPS)
@@ -80,7 +91,13 @@ function(katago_vcpkg_bootstrap_if_needed)
 
   file(MAKE_DIRECTORY "${KATAGO_DEPS_DIR}")
 
-  if(NOT EXISTS "${KATAGO_VCPKG_ROOT}/vcpkg.exe")
+  if(WIN32)
+    set(_katago_vcpkg_exe "${KATAGO_VCPKG_ROOT}/vcpkg.exe")
+  else()
+    set(_katago_vcpkg_exe "${KATAGO_VCPKG_ROOT}/vcpkg")
+  endif()
+
+  if(NOT EXISTS "${_katago_vcpkg_exe}")
     if(NOT EXISTS "${KATAGO_VCPKG_ROOT}/.git")
       find_package(Git QUIET)
       if(NOT GIT_FOUND)
@@ -99,11 +116,19 @@ function(katago_vcpkg_bootstrap_if_needed)
     endif()
 
     message(STATUS "Auto-fetch deps: bootstrapping vcpkg")
-    execute_process(
-      COMMAND "${KATAGO_VCPKG_ROOT}/bootstrap-vcpkg.bat" -disableMetrics
-      WORKING_DIRECTORY "${KATAGO_VCPKG_ROOT}"
-      RESULT_VARIABLE _bootstrap_result
-    )
+    if(WIN32)
+      execute_process(
+        COMMAND "${KATAGO_VCPKG_ROOT}/bootstrap-vcpkg.bat" -disableMetrics
+        WORKING_DIRECTORY "${KATAGO_VCPKG_ROOT}"
+        RESULT_VARIABLE _bootstrap_result
+      )
+    else()
+      execute_process(
+        COMMAND sh "${KATAGO_VCPKG_ROOT}/bootstrap-vcpkg.sh" -disableMetrics
+        WORKING_DIRECTORY "${KATAGO_VCPKG_ROOT}"
+        RESULT_VARIABLE _bootstrap_result
+      )
+    endif()
     if(NOT _bootstrap_result EQUAL 0)
       message(FATAL_ERROR "Failed to bootstrap vcpkg")
     endif()
@@ -111,16 +136,21 @@ function(katago_vcpkg_bootstrap_if_needed)
 endfunction()
 
 function(katago_vcpkg_install_if_needed package_name)
-  if(NOT WIN32)
-    message(FATAL_ERROR "katago_vcpkg_install_if_needed is only supported on Windows")
-  endif()
-
   katago_vcpkg_bootstrap_if_needed()
 
+  if(WIN32)
+    set(_katago_vcpkg_exe "${KATAGO_VCPKG_ROOT}/vcpkg.exe")
+  else()
+    set(_katago_vcpkg_exe "${KATAGO_VCPKG_ROOT}/vcpkg")
+  endif()
+  if(NOT EXISTS "${_katago_vcpkg_exe}")
+    message(FATAL_ERROR "vcpkg executable not found after bootstrap: ${_katago_vcpkg_exe}")
+  endif()
+
   set(_spec "${package_name}:${KATAGO_VCPKG_TRIPLET}")
   message(STATUS "Auto-fetch deps: ensuring ${_spec} via vcpkg")
   execute_process(
-    COMMAND "${KATAGO_VCPKG_ROOT}/vcpkg.exe" install "${_spec}" --disable-metrics
+    COMMAND "${_katago_vcpkg_exe}" install "${_spec}" --disable-metrics
     WORKING_DIRECTORY "${KATAGO_VCPKG_ROOT}"
     RESULT_VARIABLE _install_result
   )
@@ -682,6 +712,8 @@ elseif(USE_BACKEND STREQUAL "ONNX")
 
   if(WIN32)
     set(_onnx_default_root "${CMAKE_CURRENT_SOURCE_DIR}/external/onnxruntime-win-x64-openvino")
+  elseif(UNIX AND NOT APPLE)
+    set(_onnx_default_root "${CMAKE_CURRENT_SOURCE_DIR}/external/onnxruntime-linux-x64-openvino")
   else()
     set(_onnx_default_root "")
   endif()
@@ -714,6 +746,8 @@ elseif(USE_BACKEND STREQUAL "ONNX")
   set(ONNX_PROTO_LIB "" CACHE FILEPATH "Path to onnx_proto library (required for .bin.gz -> ONNX conversion)")
   set(PROTOBUF_INCLUDE_DIR "" CACHE PATH "Directory containing google/protobuf/message.h (required for .bin.gz -> ONNX conversion)")
   set(PROTOBUF_LIB "" CACHE FILEPATH "Path to protobuf library (protobuf-lite or libprotobuf, required for .bin.gz -> ONNX conversion)")
+  set(ONNX_PROTO_TARGET "" CACHE STRING "Imported CMake target for onnx_proto (optional, preferred over ONNX_PROTO_LIB)")
+  set(PROTOBUF_TARGET "" CACHE STRING "Imported CMake target for protobuf (optional, preferred over PROTOBUF_LIB)")
   mark_as_advanced(CLEAR ONNX_INCLUDE_DIR ONNX_PROTO_LIB PROTOBUF_INCLUDE_DIR PROTOBUF_LIB)
 
   # Backward compatibility with older cache variable name.
@@ -721,7 +755,11 @@ elseif(USE_BACKEND STREQUAL "ONNX")
     set(PROTOBUF_LIB "${PROTOBUF_LITE_LIB}")
   endif()
 
-  if(WIN32 AND KATAGO_AUTO_FETCH_DEPS)
+  if(KATAGO_AUTO_FETCH_DEPS)
+    set(_katago_vcpkg_installed_root "${KATAGO_VCPKG_ROOT}/installed/${KATAGO_VCPKG_TRIPLET}")
+  endif()
+
+  if(KATAGO_AUTO_FETCH_DEPS)
     set(_need_onnx_proto_deps FALSE)
     if(NOT ONNX_INCLUDE_DIR OR NOT ONNX_PROTO_LIB OR NOT PROTOBUF_INCLUDE_DIR OR NOT PROTOBUF_LIB)
       set(_need_onnx_proto_deps TRUE)
@@ -733,41 +771,76 @@ elseif(USE_BACKEND STREQUAL "ONNX")
       if(EXISTS "${_katago_vcpkg_installed_root}/include/onnx/onnx-ml.pb.h")
         set(ONNX_INCLUDE_DIR "${_katago_vcpkg_installed_root}/include" CACHE PATH "Directory containing onnx/onnx-ml.pb.h (required for .bin.gz -> ONNX conversion)" FORCE)
       endif()
-      if(EXISTS "${_katago_vcpkg_installed_root}/lib/onnx_proto.lib")
-        set(ONNX_PROTO_LIB "${_katago_vcpkg_installed_root}/lib/onnx_proto.lib" CACHE FILEPATH "Path to onnx_proto library (required for .bin.gz -> ONNX conversion)" FORCE)
+      find_library(_katago_onnx_proto_lib NAMES onnx_proto HINTS "${_katago_vcpkg_installed_root}/lib" "${_katago_vcpkg_installed_root}/debug/lib" NO_DEFAULT_PATH)
+      if(_katago_onnx_proto_lib)
+        set(ONNX_PROTO_LIB "${_katago_onnx_proto_lib}" CACHE FILEPATH "Path to onnx_proto library (required for .bin.gz -> ONNX conversion)" FORCE)
       endif()
       if(EXISTS "${_katago_vcpkg_installed_root}/include/google/protobuf/message.h")
         set(PROTOBUF_INCLUDE_DIR "${_katago_vcpkg_installed_root}/include" CACHE PATH "Directory containing google/protobuf/message.h (required for .bin.gz -> ONNX conversion)" FORCE)
       endif()
-      if(EXISTS "${_katago_vcpkg_installed_root}/lib/libprotobuf-lite.lib")
-        set(PROTOBUF_LIB "${_katago_vcpkg_installed_root}/lib/libprotobuf-lite.lib" CACHE FILEPATH "Path to protobuf library (protobuf-lite or libprotobuf, required for .bin.gz -> ONNX conversion)" FORCE)
-      elseif(EXISTS "${_katago_vcpkg_installed_root}/lib/libprotobuf.lib")
-        set(PROTOBUF_LIB "${_katago_vcpkg_installed_root}/lib/libprotobuf.lib" CACHE FILEPATH "Path to protobuf library (protobuf-lite or libprotobuf, required for .bin.gz -> ONNX conversion)" FORCE)
+      find_library(_katago_protobuf_lib NAMES protobuf-lite libprotobuf-lite protobuf libprotobuf HINTS "${_katago_vcpkg_installed_root}/lib" "${_katago_vcpkg_installed_root}/debug/lib" NO_DEFAULT_PATH)
+      if(_katago_protobuf_lib)
+        set(PROTOBUF_LIB "${_katago_protobuf_lib}" CACHE FILEPATH "Path to protobuf library (protobuf-lite or libprotobuf, required for .bin.gz -> ONNX conversion)" FORCE)
       endif()
     endif()
   endif()
 
+  # Prefer config-mode packages (vcpkg provides these) so transitive dependencies
+  # like absl/utf8_range are linked automatically.
+  if(_katago_vcpkg_installed_root)
+    list(APPEND CMAKE_PREFIX_PATH "${_katago_vcpkg_installed_root}")
+  endif()
+  if(NOT ONNX_PROTO_TARGET)
+    find_package(ONNX CONFIG QUIET)
+    if(TARGET ONNX::onnx_proto)
+      set(ONNX_PROTO_TARGET "ONNX::onnx_proto")
+    endif()
+  endif()
+  if(NOT PROTOBUF_TARGET)
+    if(_katago_vcpkg_installed_root AND EXISTS "${_katago_vcpkg_installed_root}/share/protobuf/protobuf-config.cmake")
+      set(protobuf_DIR "${_katago_vcpkg_installed_root}/share/protobuf")
+    endif()
+    find_package(protobuf CONFIG QUIET)
+    if(NOT TARGET protobuf::libprotobuf AND NOT TARGET protobuf::libprotobuf-lite)
+      find_package(Protobuf CONFIG QUIET)
+    endif()
+    if(TARGET protobuf::libprotobuf)
+      set(PROTOBUF_TARGET "protobuf::libprotobuf")
+    elseif(TARGET protobuf::libprotobuf-lite)
+      set(PROTOBUF_TARGET "protobuf::libprotobuf-lite")
+    endif()
+  endif()
+
   if(NOT ONNX_INCLUDE_DIR)
     find_path(ONNX_INCLUDE_DIR onnx/onnx-ml.pb.h)
   endif()
-  if(NOT ONNX_PROTO_LIB)
+  if(NOT ONNX_PROTO_LIB AND NOT ONNX_PROTO_TARGET)
     find_library(ONNX_PROTO_LIB onnx_proto)
   endif()
   if(NOT PROTOBUF_INCLUDE_DIR)
     find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h)
   endif()
-  if(NOT PROTOBUF_LIB)
-    find_library(PROTOBUF_LIB protobuf-lite protobuf-lite32 libprotobuf protobuf)
+  if(NOT PROTOBUF_LIB AND NOT PROTOBUF_TARGET)
+    find_library(PROTOBUF_LIB libprotobuf protobuf protobuf-lite protobuf-lite32)
   endif()
 
-  if(NOT ONNX_INCLUDE_DIR OR NOT ONNX_PROTO_LIB OR NOT PROTOBUF_INCLUDE_DIR OR NOT PROTOBUF_LIB)
+  if(NOT ONNX_INCLUDE_DIR OR NOT PROTOBUF_INCLUDE_DIR OR (NOT ONNX_PROTO_TARGET AND NOT ONNX_PROTO_LIB) OR (NOT PROTOBUF_TARGET AND NOT PROTOBUF_LIB))
     message(FATAL_ERROR
       "ONNX backend requires ONNX protobuf dependencies for .bin.gz model conversion. "
       "Set ONNX_INCLUDE_DIR (contains onnx/onnx-ml.pb.h), ONNX_PROTO_LIB, PROTOBUF_INCLUDE_DIR, and PROTOBUF_LIB.")
   endif()
   target_include_directories(katago SYSTEM PRIVATE "${ONNX_INCLUDE_DIR}")
   target_include_directories(katago SYSTEM PRIVATE "${PROTOBUF_INCLUDE_DIR}")
-  target_link_libraries(katago ${ONNX_PROTO_LIB} ${PROTOBUF_LIB})
+  if(ONNX_PROTO_TARGET)
+    target_link_libraries(katago ${ONNX_PROTO_TARGET})
+  else()
+    target_link_libraries(katago ${ONNX_PROTO_LIB})
+  endif()
+  if(PROTOBUF_TARGET)
+    target_link_libraries(katago ${PROTOBUF_TARGET})
+  else()
+    target_link_libraries(katago ${PROTOBUF_LIB})
+  endif()
 
   if(WIN32 AND ONNXRUNTIME_DLLS)
     foreach(_onnxruntime_dll IN LISTS ONNXRUNTIME_DLLS)
@@ -809,7 +882,7 @@ if(NO_GIT_REVISION AND (NOT BUILD_DISTRIBUTED))
   target_compile_definitions(katago PRIVATE NO_GIT_REVISION)
 endif()
 
-if(WIN32 AND KATAGO_AUTO_FETCH_DEPS)
+if(KATAGO_AUTO_FETCH_DEPS)
   set(_need_zlib_deps FALSE)
   if(NOT ZLIB_INCLUDE_DIR OR NOT ZLIB_LIBRARY)
     set(_need_zlib_deps TRUE)
@@ -820,8 +893,9 @@ if(WIN32 AND KATAGO_AUTO_FETCH_DEPS)
     if(EXISTS "${_katago_vcpkg_installed_root}/include/zlib.h")
       set(ZLIB_INCLUDE_DIR "${_katago_vcpkg_installed_root}/include" CACHE PATH "Path to directory with zlib.h and other header files" FORCE)
     endif()
-    if(EXISTS "${_katago_vcpkg_installed_root}/lib/zlib.lib")
-      set(ZLIB_LIBRARY "${_katago_vcpkg_installed_root}/lib/zlib.lib" CACHE FILEPATH "Path to 'libz.so' on Linux or 'libz.lib' on Windows" FORCE)
+    find_library(_katago_zlib_lib NAMES zlib z HINTS "${_katago_vcpkg_installed_root}/lib" "${_katago_vcpkg_installed_root}/debug/lib" NO_DEFAULT_PATH)
+    if(_katago_zlib_lib)
+      set(ZLIB_LIBRARY "${_katago_zlib_lib}" CACHE FILEPATH "Path to 'libz.so' on Linux or 'libz.lib' on Windows" FORCE)
     endif()
   endif()
 endif()

From 115e6daba5f8063fd70d7c89631f123cccced902 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Tue, 17 Mar 2026 00:00:25 +0800
Subject: [PATCH 23/24] Edit Compiling.md

---
 Compiling.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Compiling.md b/Compiling.md
index 3908e8350..d8bab07cf 100644
--- a/Compiling.md
+++ b/Compiling.md
@@ -257,7 +257,7 @@ On Windows, `KATAGO_AUTO_FETCH_DEPS=ON` by default, so missing `zlib`, `onnx`, a
 
 ```
 cmake -S cpp -B cpp/build -G "Visual Studio 18 2026" -A x64 -DUSE_BACKEND=ONNX -DONNXRUNTIME_ROOT=cpp/external/onnxruntime-win-x64-openvino
-cmake --build cpp/build --config Release
+cmake --build cpp/build --config Release -j
 ```
 
 If you want to disable auto-fetch and provide dependencies manually:

From 11433e6c13de8a93b7d65d0d5d9a66d8ea1a3644 Mon Sep 17 00:00:00 2001
From: Looong01 <lizelongdd@hotmail.com>
Date: Wed, 25 Mar 2026 23:50:22 +0800
Subject: [PATCH 24/24] Fix a bug

---
 Compiling.md                       |  2 +-
 cpp/CMakeLists.txt                 | 10 +++++-----
 cpp/neuralnet/onnxmodelbuilder.cpp |  4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Compiling.md b/Compiling.md
index d8bab07cf..9e952534e 100644
--- a/Compiling.md
+++ b/Compiling.md
@@ -35,7 +35,7 @@ As also mentioned in the instructions below but repeated here for visibility, if
       * If using the TensorRT backend, in addition to a compatible CUDA Toolkit (https://developer.nvidia.com/cuda-toolkit), you also need TensorRT (https://developer.nvidia.com/tensorrt) that is at least version 8.5.
       * If using the ROCm backend, ROCm 6.4 or later and a GPU capable of supporting them. More information about installation(https://rocm.docs.amd.com/projects/install-on-linux/en/latest/) and please install all possiable ROCm developer packages, instead of just ROCm runtime packages.
       * If using the Eigen backend, Eigen3. With Debian packages, (i.e. apt or apt-get), this should be `libeigen3-dev`.
-      * If using the ONNX backend, ONNX Runtime headers/libs and ONNX protobuf dependencies (`onnx/onnx-ml.pb.h`, `onnx_proto`, `protobuf-lite`) for `.bin.gz` model conversion support.
+      * If using the ONNX backend, ONNX Runtime headers/libs and ONNX protobuf dependencies (`onnx/onnx_pb.h`, `onnx_proto`, `protobuf-lite`) for `.bin.gz` model conversion support.
       * zlib, libzip. With Debian packages (i.e. apt or apt-get), these should be `zlib1g-dev`, `libzip-dev`.
       * If you want to do self-play training and research, probably Google perftools `libgoogle-perftools-dev` for TCMalloc or some other better malloc implementation. For unknown reasons, the allocation pattern in self-play with large numbers of threads and parallel games causes a lot of memory fragmentation under glibc malloc that will eventually run your machine out of memory, but better mallocs handle it fine.
       * If compiling to contribute to public distributed training runs, OpenSSL is required (`libssl-dev`).
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1dd8ba5f7..66b093911 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -742,7 +742,7 @@ elseif(USE_BACKEND STREQUAL "ONNX")
 
   # Required by onnxmodelbuilder.cpp for building ONNX graphs from .bin.gz.
   # These are intentionally configurable because package layouts vary.
-  set(ONNX_INCLUDE_DIR "" CACHE PATH "Directory containing onnx/onnx-ml.pb.h (required for .bin.gz -> ONNX conversion)")
+  set(ONNX_INCLUDE_DIR "" CACHE PATH "Directory containing onnx/onnx_pb.h (required for .bin.gz -> ONNX conversion)")
   set(ONNX_PROTO_LIB "" CACHE FILEPATH "Path to onnx_proto library (required for .bin.gz -> ONNX conversion)")
   set(PROTOBUF_INCLUDE_DIR "" CACHE PATH "Directory containing google/protobuf/message.h (required for .bin.gz -> ONNX conversion)")
   set(PROTOBUF_LIB "" CACHE FILEPATH "Path to protobuf library (protobuf-lite or libprotobuf, required for .bin.gz -> ONNX conversion)")
@@ -768,8 +768,8 @@ elseif(USE_BACKEND STREQUAL "ONNX")
       katago_vcpkg_install_if_needed("onnx")
       katago_vcpkg_install_if_needed("protobuf")
       set(_katago_vcpkg_installed_root "${KATAGO_VCPKG_ROOT}/installed/${KATAGO_VCPKG_TRIPLET}")
-      if(EXISTS "${_katago_vcpkg_installed_root}/include/onnx/onnx-ml.pb.h")
-        set(ONNX_INCLUDE_DIR "${_katago_vcpkg_installed_root}/include" CACHE PATH "Directory containing onnx/onnx-ml.pb.h (required for .bin.gz -> ONNX conversion)" FORCE)
+      if(EXISTS "${_katago_vcpkg_installed_root}/include/onnx/onnx_pb.h")
+        set(ONNX_INCLUDE_DIR "${_katago_vcpkg_installed_root}/include" CACHE PATH "Directory containing onnx/onnx_pb.h (required for .bin.gz -> ONNX conversion)" FORCE)
       endif()
       find_library(_katago_onnx_proto_lib NAMES onnx_proto HINTS "${_katago_vcpkg_installed_root}/lib" "${_katago_vcpkg_installed_root}/debug/lib" NO_DEFAULT_PATH)
       if(_katago_onnx_proto_lib)
@@ -812,7 +812,7 @@ elseif(USE_BACKEND STREQUAL "ONNX")
   endif()
 
   if(NOT ONNX_INCLUDE_DIR)
-    find_path(ONNX_INCLUDE_DIR onnx/onnx-ml.pb.h)
+    find_path(ONNX_INCLUDE_DIR onnx/onnx_pb.h)
   endif()
   if(NOT ONNX_PROTO_LIB AND NOT ONNX_PROTO_TARGET)
     find_library(ONNX_PROTO_LIB onnx_proto)
@@ -827,7 +827,7 @@ elseif(USE_BACKEND STREQUAL "ONNX")
   if(NOT ONNX_INCLUDE_DIR OR NOT PROTOBUF_INCLUDE_DIR OR (NOT ONNX_PROTO_TARGET AND NOT ONNX_PROTO_LIB) OR (NOT PROTOBUF_TARGET AND NOT PROTOBUF_LIB))
     message(FATAL_ERROR
       "ONNX backend requires ONNX protobuf dependencies for .bin.gz model conversion. "
-      "Set ONNX_INCLUDE_DIR (contains onnx/onnx-ml.pb.h), ONNX_PROTO_LIB, PROTOBUF_INCLUDE_DIR, and PROTOBUF_LIB.")
+      "Set ONNX_INCLUDE_DIR (contains onnx/onnx_pb.h), ONNX_PROTO_LIB, PROTOBUF_INCLUDE_DIR, and PROTOBUF_LIB.")
   endif()
   target_include_directories(katago SYSTEM PRIVATE "${ONNX_INCLUDE_DIR}")
   target_include_directories(katago SYSTEM PRIVATE "${PROTOBUF_INCLUDE_DIR}")
diff --git a/cpp/neuralnet/onnxmodelbuilder.cpp b/cpp/neuralnet/onnxmodelbuilder.cpp
index a301b7dc0..d0a2ddb39 100644
--- a/cpp/neuralnet/onnxmodelbuilder.cpp
+++ b/cpp/neuralnet/onnxmodelbuilder.cpp
@@ -1,12 +1,12 @@
 // Builds an ONNX computational graph from a KataGo ModelDesc.
-// Uses the ONNX protobuf API (onnx-ml.pb.h) to construct a ModelProto
+// Uses the ONNX protobuf API (onnx_pb.h) to construct a ModelProto
 // that can be loaded directly by ONNX Runtime.
 
 #include "../neuralnet/onnxmodelbuilder.h"
 #include "../neuralnet/activations.h"
 #include "../core/global.h"
 
-#include <onnx/onnx-ml.pb.h>
+#include <onnx/onnx_pb.h>
 
 #include <string>
 #include <vector>