 #endif
 using namespace std;
 namespace onnxruntime {
+
 #define REGISTER_UNARY_ELEMENTWISE_KERNEL(x, sinceVersion) \
   ONNX_CPU_OPERATOR_TYPED_KERNEL( \
       x, \
@@ -759,10 +760,14 @@ bool CommonFastReduceCopy(OpKernelContext* ctx, TensorShapeVector& input_axes, b
     } else {
       input_axes.clear();
     }
-    // noop_with_empty_axes is handled upstream by ApplyNoopEmptyAxesElementwise().
-    // Return false for clarity and to prevent unsafe memcpy fallback.
+
     if (input_axes.empty() && noop_with_empty_axes) {
-      return false;
+      const Tensor* input = ctx->Input<Tensor>(0);
+      auto* output = ctx->Output(0, input->Shape());
+      memcpy(output->MutableDataRaw(),
+             input->DataRaw(),
+             input->SizeInBytes());
+      return true;
     }
   }
   return false;
@@ -795,6 +800,7 @@ bool CommonFastReduceSwitch(OpKernelContext* ctx,
   fast_kind = OptimizeShapeForFastReduce(
       reduced_dims, input_axes.empty() ? axes_ : input_axes,
       fast_shape, output_shape, fast_axes, keepdims_ != 0, noop_with_empty_axes);
+
   if (which_fast_reduce != FastReduceKind::kNone) {
     if (IsFastReduceKindAvailable(fast_kind, which_fast_reduce)) {
       Tensor* output = ctx->Output(0, output_shape);
@@ -914,30 +920,6 @@ bool check_and_reduce_empty_set_input(OpKernelContext* ctx, const gsl::span<cons
   return true;
 }
 
-template <typename AGG>
-inline void ApplyNoopEmptyAxesElementwise(OpKernelContext* ctx) {
-  const Tensor* X = ctx->Input<Tensor>(0);
-  const auto& shape = X->Shape();
-  Tensor* Y = ctx->Output(0, shape);
-
-  if constexpr (!ReduceAggTraits<AGG>::kHasPreOp && !ReduceAggTraits<AGG>::kHasPostOp) {
-    std::memcpy(Y->MutableDataRaw(), X->DataRaw(), X->SizeInBytes());
-
-  } else {
-    using Tin = typename AGG::input_type;
-    using Tacc = typename AGG::value_type;
-    const Tin* x = X->Data<Tin>();
-    Tacc* y = Y->MutableData<Tacc>();
-    const int64_t n = shape.Size();
-
-    for (int64_t i = 0; i < n; ++i) {
-      AGG agg(1, x[i]);
-      agg.update(x[i]);
-      y[i] = agg.get_value();
-    }
-  }
-}
-
 template <typename AGG>
 void CommonReduce1Loop(OpKernelContext* ctx,
                        const gsl::span<const int64_t>& axes_, int64_t keepdims_,
@@ -946,11 +928,6 @@ void CommonReduce1Loop(OpKernelContext* ctx,
     return;
   }
 
-  if (axes_.empty() && noop_with_empty_axes) {
-    ApplyNoopEmptyAxesElementwise<AGG>(ctx);
-    return;
-  }
-
   FastReduceKind fast_kind;
   TensorShapeVector fast_shape;
   TensorShapeVector output_shape;
@@ -962,7 +939,6 @@ void CommonReduce1Loop(OpKernelContext* ctx,
 
   const Tensor* input = ctx->Input<Tensor>(0);
   Tensor* output = ctx->Output(0, output_shape);
-
   if (fast_kind == FastReduceKind::kEmpty) {
     const TensorShape& input_shape = input->Shape();
     if (input_shape.Size() == 1) {
@@ -989,11 +965,6 @@ void CommonReduce2Loops(OpKernelContext* ctx,
     return;
   }
 
-  if (axes_.empty() && noop_with_empty_axes) {
-    ApplyNoopEmptyAxesElementwise<AGG>(ctx);
-    return;
-  }
-
  FastReduceKind fast_kind;
  TensorShapeVector fast_shape, output_shape, fast_axes;
  if (CommonFastReduce<AGG>(ctx, axes_, keepdims_, noop_with_empty_axes,
@@ -1017,6 +988,7 @@ void CommonReduce2Loops(OpKernelContext* ctx,
     }
     return;
   }
+
   ResultsNoTransposePrepareForReduce last_results;
   NoTransposeReduce2Loops<AGG>(output, fast_shape, *input, fast_axes, ctx->GetOperatorThreadPool(), last_results);
 }
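
Not part of the diff above: for reference, a minimal sketch of a unit test that exercises the restored identity-copy path (ReduceSum with an empty axes input and noop_with_empty_axes = 1, which the memcpy branch in CommonFastReduceCopy now handles). It assumes onnxruntime's OpTester helper from the provider test utilities; the test name, opset version, and include paths are illustrative only.

// Sketch only: assumes the OpTester helper from
// onnxruntime/test/providers/provider_test_utils.h and opset-13 ReduceSum.
#include "gtest/gtest.h"
#include "test/providers/provider_test_utils.h"

namespace onnxruntime {
namespace test {

TEST(ReductionOpTest, ReduceSum_noop_with_empty_axes_sketch) {
  OpTester test("ReduceSum", 13);
  test.AddAttribute("noop_with_empty_axes", static_cast<int64_t>(1));
  // Empty "axes" input plus noop_with_empty_axes = 1: the op is expected to
  // pass the input through unchanged, i.e. output == input.
  test.AddInput<float>("data", {2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
  test.AddInput<int64_t>("axes", {0}, {});
  test.AddOutput<float>("reduced", {2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
  test.Run();
}

}  // namespace test
}  // namespace onnxruntime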