Improve reference implementation of quantization

dsharletg · xnnpack-bot · commit 57c3b8e15ca3 · 2025-12-02T18:44:35.000-08:00
- Add a helper `clamp_float_to_int` that avoids issues with converting large integers to floats and losing information.
- Add more information when tests fail.

PiperOrigin-RevId: 839514077
diff --git a/ynnpack/base/arithmetic.h b/ynnpack/base/arithmetic.h
@@ -13,11 +13,25 @@
 #include <cstdint>
 #include <limits>
 
-#include "ynnpack/base/base.h"
 #include "ynnpack/base/type.h"
 
 namespace ynn {
 
+// Clamp x to the range of Int (integer or quantized integer); NaN stays NaN.
+template <typename Int>
+float clamp_float_to_int(float x) {
+  using Unwrapped = typename unwrap_quantized<Int>::type;
+  // Clamping against the raw integer limits with std::max/std::min is not
+  // enough: the max of a wide type may not be exactly representable as a float
+  // and can round up out of bounds. Subtracting half_mantissa from the max
+  // gives an exactly representable bound (2^31 - 128 for int32_t).
+  // NOTE(review): 127 looks tuned for int32_t; confirm for uint32_t/int64_t.
+  constexpr int half_mantissa = sizeof(Unwrapped) * 8 > 23 ? 127 : 0;
+  x = std::max<float>(x, std::numeric_limits<Unwrapped>::min());
+  x = std::min<float>(x, std::numeric_limits<Unwrapped>::max() - half_mantissa);
+  return x;
+}
+
 // A cast that:
 // - Rounds to nearest integer
 // - Replaces NaN with 0
@@ -27,14 +41,7 @@ Result round_float_to_int(float x) {
   using Unwrapped = typename unwrap_quantized<Result>::type;
   x = std::isnan(x) ? 0.0f : x;
   x = std::round(x);
-  // It's tricky to do this with std::max/std::min, because the min/max values
-  // might not be exactly representable as floats, and so are ineffective to
-  // avoid converting to an out of bounds integer. To avoid this problem, we've
-  // determined a constant that when added to the min/max float values, results
-  // in the upper bound of the integer range.
-  constexpr int half_mantissa = sizeof(Unwrapped) * 8 > 23 ? 127 : 0;
-  x = std::max<float>(x, std::numeric_limits<Unwrapped>::min());
-  x = std::min<float>(x, std::numeric_limits<Unwrapped>::max() - half_mantissa);
+  x = clamp_float_to_int<Result>(x);
   return static_cast<Unwrapped>(x);
 }
 
diff --git a/ynnpack/kernels/binary/reference.h b/ynnpack/kernels/binary/reference.h
@@ -152,8 +152,7 @@ void check_results(const OpInfo& op, const Tensor<quantized<A>>& a,
     const float b_i = dequantize(b(i), b_quantization);
     float expected = op(a_i, b_i);
     expected = fake_quantize(expected, x_quantization);
-    expected = std::max<float>(expected, type_info<X>::min());
-    expected = std::min<float>(expected, type_info<X>::max());
+    expected = clamp_float_to_int<X>(expected);
     if (std::isnan(expected)) {
       // We don't know how to represent NaN for quantized types.
     } else {
diff --git a/ynnpack/kernels/unary/reference.h b/ynnpack/kernels/unary/reference.h
@@ -473,15 +473,15 @@ void check_results(const unary_op_info& op, Tensor<A> a, Tensor<X> x,
       const float input_i = dequantize(a(i), a_quantization);
       float expected = op(input_i);
       expected = fake_quantize(expected, x_quantization);
-      expected = std::max<float>(expected, type_info<X>::min());
-      expected = std::min<float>(expected, type_info<X>::max());
+      expected = clamp_float_to_int<X>(expected);
       if (std::isnan(expected)) {
         // This is expected to overflow.
       } else {
         ASSERT_NEAR(expected, x(i), op.tolerance(expected, type_of<X>()))
             << "i = " << index_to_string(i) << ", a(i) = " << input_i << " ("
             << static_cast<float>(a(i)) << ")"
-            << ", x(i) = " << static_cast<int32_t>(x(i));
+            << ", x(i) = " << static_cast<int32_t>(x(i)) << " ("
+            << dequantize(x(i), x_quantization) << ")" << std::endl;
       }
     } else {
       const float input_i = dequantize(a(i), a_quantization);
diff --git a/ynnpack/xnnpack/dynamic_quantization_test.cc b/ynnpack/xnnpack/dynamic_quantization_test.cc
@@ -83,7 +83,9 @@ void TestImpl(T, size_t rank) {
       broadcast_extent_1(zero_point);
       for (const auto& i : EnumerateIndices(shape)) {
         ASSERT_NEAR(quantize<int8_t>(input(i), 1.0f / scale(i), zero_point(i)),
-                    output(i), 1);
+                    output(i), 1)
+            << "input=" << input(i) << ", scale=" << scale(i)
+            << ", zero_point=" << zero_point(i);
       }
     }
   }

Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,9 @@ void TestImpl(T, size_t rank) {`
`83`	`83`	`broadcast_extent_1(zero_point);`
`84`	`84`	`for (const auto& i : EnumerateIndices(shape)) {`
`85`	`85`	`ASSERT_NEAR(quantize<int8_t>(input(i), 1.0f / scale(i), zero_point(i)),`
`86`		`- output(i), 1);`
	`86`	`+ output(i), 1)`
	`87`	`+ << "input=" << input(i) << ", scale=" << scale(i)`
	`88`	`+ << ", zero_point=" << zero_point(i);`
`87`	`89`	`}`
`88`	`90`	`}`
`89`	`91`	`}`