apache
diff --git a/‎.github/workflows/velox_backend_x86.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/velox_backend_x86.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends-velox/src/main/java/org/apache/gluten/utils/GpuBufferBatchResizerJniWrapper.java‎
Lines changed: 40 additions & 0 deletions b/‎backends-velox/src/main/java/org/apache/gluten/utils/GpuBufferBatchResizerJniWrapper.java‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxMetricsApi.scala‎
Lines changed: 2 additions & 2 deletions b/‎backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxMetricsApi.scala‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala‎
Lines changed: 2 additions & 0 deletions b/‎backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala‎
Lines changed: 8 additions & 0 deletions b/‎backends-velox/src/main/scala/org/apache/gluten/config/VeloxConfig.scala‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎backends-velox/src/main/scala/org/apache/gluten/execution/GpuResizeBufferColumnarBatchExec.scala‎
Lines changed: 68 additions & 0 deletions b/‎backends-velox/src/main/scala/org/apache/gluten/execution/GpuResizeBufferColumnarBatchExec.scala‎
Lines changed: 68 additions & 0 deletions
diff --git a/‎backends-velox/src/main/scala/org/apache/gluten/extension/AppendBatchResizeForShuffleInputAndOutput.scala‎
Lines changed: 3 additions & 0 deletions b/‎backends-velox/src/main/scala/org/apache/gluten/extension/AppendBatchResizeForShuffleInputAndOutput.scala‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala‎
Lines changed: 22 additions & 21 deletions b/‎backends-velox/src/main/scala/org/apache/gluten/extension/CudfNodeValidationRule.scala‎
Lines changed: 22 additions & 21 deletions
diff --git a/‎backends-velox/src/main/scala/org/apache/gluten/extension/GpuBufferBatchResizeForShuffleInputOutput.scala‎
Lines changed: 80 additions & 0 deletions b/‎backends-velox/src/main/scala/org/apache/gluten/extension/GpuBufferBatchResizeForShuffleInputOutput.scala‎
Lines changed: 80 additions & 0 deletions
diff --git a/‎backends-velox/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala‎
Lines changed: 13 additions & 2 deletions b/‎backends-velox/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala‎
Lines changed: 13 additions & 2 deletions
@@ -1318,7 +1318,7 @@ jobs:
           dnf config-manager --add-repo "$repo_url"
           dnf install -y libnvjitlink-devel-12-8
           df -a
-          bash dev/buildbundle-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --spark_version=3.4 --enable_gpu=ON
+          bash dev/buildbundle-veloxbe.sh --run_setup_script=OFF --build_arrow=OFF --spark_version=3.4 --build_tests=ON --build_benchmarks=ON --enable_gpu=ON
           ccache -s
 
   spark-test-spark40:
 
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.utils;
+
+import org.apache.gluten.runtime.Runtime;
+import org.apache.gluten.runtime.RuntimeAware;
+import org.apache.gluten.vectorized.ColumnarBatchInIterator;
+
+public class GpuBufferBatchResizerJniWrapper implements RuntimeAware {
+  private final Runtime runtime;
+
+  private GpuBufferBatchResizerJniWrapper(Runtime runtime) {
+    this.runtime = runtime;
+  }
+
+  public static GpuBufferBatchResizerJniWrapper create(Runtime runtime) {
+    return new GpuBufferBatchResizerJniWrapper(runtime);
+  }
+
+  @Override
+  public long rtHandle() {
+    return runtime.getHandle();
+  }
+
+  public native long create(int minOutputBatchSize, ColumnarBatchInIterator itr);
+}
@@ -17,7 +17,7 @@
 package org.apache.gluten.backendsapi.velox
 
 import org.apache.gluten.backendsapi.MetricsApi
-import org.apache.gluten.config.{HashShuffleWriterType, RssSortShuffleWriterType, ShuffleWriterType, SortShuffleWriterType}
+import org.apache.gluten.config.{GpuHashShuffleWriterType, HashShuffleWriterType, RssSortShuffleWriterType, ShuffleWriterType, SortShuffleWriterType}
 import org.apache.gluten.metrics._
 import org.apache.gluten.substrait.{AggregationParams, JoinParams}
 
@@ -370,7 +370,7 @@ class VeloxMetricsApi extends MetricsApi with Logging {
       "peakBytes" -> SQLMetrics.createSizeMetric(sparkContext, "peak bytes allocated")
     )
     shuffleWriterType match {
-      case HashShuffleWriterType =>
+      case HashShuffleWriterType | GpuHashShuffleWriterType =>
         baseMetrics ++ Map(
           "splitTime" -> SQLMetrics.createNanoTimingMetric(sparkContext, "time to split"),
           "avgDictionaryFields" -> SQLMetrics
 
@@ -109,6 +109,7 @@ object VeloxRuleApi {
 
     // Legacy: Post-transform rules.
     injector.injectPostTransform(_ => AppendBatchResizeForShuffleInputAndOutput())
+    injector.injectPostTransform(_ => GpuBufferBatchResizeForShuffleInputOutput())
     injector.injectPostTransform(_ => UnionTransformerRule())
     injector.injectPostTransform(c => PartialProjectRule.apply(c.session))
     injector.injectPostTransform(_ => PartialGenerateRule())
@@ -209,6 +210,7 @@ object VeloxRuleApi {
     // Gluten RAS: Post rules.
     injector.injectPostTransform(_ => DistinguishIdenticalScans)
     injector.injectPostTransform(_ => AppendBatchResizeForShuffleInputAndOutput())
+    injector.injectPostTransform(_ => GpuBufferBatchResizeForShuffleInputOutput())
     injector.injectPostTransform(_ => RemoveTransitions)
     injector.injectPostTransform(_ => UnionTransformerRule())
     injector.injectPostTransform(c => PartialProjectRule.apply(c.session))
 
@@ -80,6 +80,8 @@ class VeloxConfig(conf: SQLConf) extends GlutenConfig(conf) {
 
   def cudfEnableValidation: Boolean = getConf(CUDF_ENABLE_VALIDATION)
 
+  def cudfBatchSize: Int = getConf(CUDF_BATCH_SIZE)
+
   def orcUseColumnNames: Boolean = getConf(ORC_USE_COLUMN_NAMES)
 
   def parquetUseColumnNames: Boolean = getConf(PARQUET_USE_COLUMN_NAMES)
@@ -634,6 +636,12 @@ object VeloxConfig extends ConfigRegistry {
       .booleanConf
       .createWithDefault(true)
 
+  val CUDF_BATCH_SIZE =
+    buildConf("spark.gluten.sql.columnar.backend.velox.cudf.batchSize")
+      .doc("Cudf input batch size after shuffle reader")
+      .intConf
+      .createWithDefault(Integer.MAX_VALUE)
+
   val MEMORY_DUMP_ON_EXIT =
     buildConf("spark.gluten.monitor.memoryDumpOnExit")
       .internal()
 
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.execution
+
+import org.apache.gluten.backendsapi.BackendsApiManager
+import org.apache.gluten.backendsapi.velox.VeloxBatchType
+import org.apache.gluten.extension.columnar.transition.Convention
+import org.apache.gluten.iterator.ClosableIterator
+import org.apache.gluten.runtime.Runtimes
+import org.apache.gluten.utils.GpuBufferBatchResizerJniWrapper
+import org.apache.gluten.vectorized.{ColumnarBatchInIterator, ColumnarBatchOutIterator}
+
+import org.apache.spark.sql.catalyst.expressions.SortOrder
+import org.apache.spark.sql.catalyst.plans.physical.Partitioning
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+import scala.collection.JavaConverters._
+
+/**
+ * An operator that resizes BufferBatches from the shuffle reader and converts them to cudf tables.
+ */
+case class GpuResizeBufferColumnarBatchExec(override val child: SparkPlan, minOutputBatchSize: Int)
+  extends ColumnarToColumnarExec(child) {
+
+  override protected def mapIterator(in: Iterator[ColumnarBatch]): Iterator[ColumnarBatch] = {
+    val runtime =
+      Runtimes.contextInstance(BackendsApiManager.getBackendName, "GpuBufferColumnarBatchResizer")
+    val outHandle = GpuBufferBatchResizerJniWrapper
+      .create(runtime)
+      .create(
+        minOutputBatchSize,
+        new ColumnarBatchInIterator(BackendsApiManager.getBackendName, in.asJava))
+    new ColumnarBatchOutIterator(runtime, outHandle).asScala
+  }
+
+  override protected def closeIterator(out: Iterator[ColumnarBatch]): Unit = {
+    out.asJava match {
+      case c: ClosableIterator[ColumnarBatch] => c.close()
+      case _ =>
+    }
+  }
+
+  override protected def needRecyclePayload: Boolean = true
+
+  override def outputPartitioning: Partitioning = child.outputPartitioning
+  override def outputOrdering: Seq[SortOrder] = child.outputOrdering
+  override protected def withNewChildInternal(newChild: SparkPlan): SparkPlan =
+    copy(child = newChild)
+
+  override def batchType(): Convention.BatchType = VeloxBatchType
+
+  override def rowType0(): Convention.RowType = Convention.RowType.None
+}
@@ -30,6 +30,9 @@ import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
  */
 case class AppendBatchResizeForShuffleInputAndOutput() extends Rule[SparkPlan] {
   override def apply(plan: SparkPlan): SparkPlan = {
+    if (VeloxConfig.get.enableColumnarCudf) {
+      return plan
+    }
     val resizeBatchesShuffleInputEnabled = VeloxConfig.get.veloxResizeBatchesShuffleInput
     val resizeBatchesShuffleOutputEnabled = VeloxConfig.get.veloxResizeBatchesShuffleOutput
     if (!resizeBatchesShuffleInputEnabled && !resizeBatchesShuffleOutputEnabled) {
 
@@ -18,8 +18,9 @@ package org.apache.gluten.extension
 
 import org.apache.gluten.config.{GlutenConfig, VeloxConfig}
 import org.apache.gluten.cudf.VeloxCudfPlanValidatorJniWrapper
-import org.apache.gluten.execution.{CudfTag, LeafTransformSupport, TransformSupport, VeloxResizeBatchesExec, WholeStageTransformer}
-import org.apache.gluten.extension.CudfNodeValidationRule.setTagForWholeStageTransformer
+import org.apache.gluten.exception.GlutenNotSupportException
+import org.apache.gluten.execution._
+import org.apache.gluten.extension.CudfNodeValidationRule.{createGPUColumnarExchange, setTagForWholeStageTransformer}
 
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.execution.{ColumnarShuffleExchangeExec, GPUColumnarShuffleExchangeExec, SparkPlan}
@@ -31,37 +32,23 @@ case class CudfNodeValidationRule(glutenConf: GlutenConfig) extends Rule[SparkPl
     if (!glutenConf.enableColumnarCudf) {
       return plan
     }
-    plan.transformUp {
+    val transformedPlan = plan.transformUp {
       case shuffle @ ColumnarShuffleExchangeExec(
             _,
-            v @ VeloxResizeBatchesExec(w: WholeStageTransformer, _, _),
+            VeloxResizeBatchesExec(w: WholeStageTransformer, _, _),
             _,
             _,
             _) =>
         setTagForWholeStageTransformer(w)
-        if (w.isCudf) {
-          log.info("VeloxResizeBatchesExec is not supported in GPU")
-        }
-        GPUColumnarShuffleExchangeExec(
-          shuffle.outputPartitioning,
-          w,
-          shuffle.shuffleOrigin,
-          shuffle.projectOutputAttributes,
-          shuffle.advisoryPartitionSize)
-
+        createGPUColumnarExchange(shuffle)
       case shuffle @ ColumnarShuffleExchangeExec(_, w: WholeStageTransformer, _, _, _) =>
         setTagForWholeStageTransformer(w)
-        GPUColumnarShuffleExchangeExec(
-          shuffle.outputPartitioning,
-          w,
-          shuffle.shuffleOrigin,
-          shuffle.projectOutputAttributes,
-          shuffle.advisoryPartitionSize)
-
+        createGPUColumnarExchange(shuffle)
       case transformer: WholeStageTransformer =>
         setTagForWholeStageTransformer(transformer)
         transformer
     }
+    transformedPlan
   }
 }
 
@@ -93,4 +80,18 @@ object CudfNodeValidationRule {
       transformer.setTagValue(CudfTag.CudfTag, true)
     }
   }
+
+  def createGPUColumnarExchange(shuffle: ColumnarShuffleExchangeExec): SparkPlan = {
+    val exec = GPUColumnarShuffleExchangeExec(
+      shuffle.outputPartitioning,
+      shuffle.child,
+      shuffle.shuffleOrigin,
+      shuffle.projectOutputAttributes,
+      shuffle.advisoryPartitionSize)
+    val res = exec.doValidate()
+    if (!res.ok()) {
+      throw new GlutenNotSupportException(res.reason())
+    }
+    exec
+  }
 }
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.extension
+
+import org.apache.gluten.config.{HashShuffleWriterType, VeloxConfig}
+import org.apache.gluten.execution.{GpuResizeBufferColumnarBatchExec, VeloxResizeBatchesExec}
+
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.execution.{ColumnarShuffleExchangeExec, ColumnarShuffleExchangeExecBase, SparkPlan}
+import org.apache.spark.sql.execution.adaptive.{AQEShuffleReadExec, ShuffleQueryStageExec}
+import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
+
+/**
+ * Try to append [[VeloxResizeBatchesExec]] to shuffle input and
+ * [[GpuResizeBufferColumnarBatchExec]] to shuffle output to keep the batch sizes in good shape.
+ */
+case class GpuBufferBatchResizeForShuffleInputOutput() extends Rule[SparkPlan] {
+  override def apply(plan: SparkPlan): SparkPlan = {
+    if (!VeloxConfig.get.enableColumnarCudf) {
+      return plan
+    }
+    val range = VeloxConfig.get.veloxResizeBatchesShuffleInputOutputRange
+    val batchSize = VeloxConfig.get.cudfBatchSize
+    plan.transformUp {
+      case shuffle: ColumnarShuffleExchangeExec
+          if shuffle.shuffleWriterType == HashShuffleWriterType &&
+            VeloxConfig.get.veloxResizeBatchesShuffleInput =>
+        val appendBatches =
+          VeloxResizeBatchesExec(shuffle.child, range.min, range.max)
+        shuffle.withNewChildren(Seq(appendBatches))
+      case a @ AQEShuffleReadExec(
+            ShuffleQueryStageExec(_, _: ColumnarShuffleExchangeExecBase, _),
+            _) =>
+        GpuResizeBufferColumnarBatchExec(a, batchSize)
+      case a @ AQEShuffleReadExec(
+            ShuffleQueryStageExec(_, ReusedExchangeExec(_, _: ColumnarShuffleExchangeExecBase), _),
+            _) =>
+        GpuResizeBufferColumnarBatchExec(a, batchSize)
+      // Since the plan is transformed bottom-up, we may first encounter
+      // ShuffleQueryStageExec, which is transformed to
+      // GpuResizeBufferColumnarBatchExec(ShuffleQueryStageExec), and then see AQEShuffleReadExec.
+      case a @ AQEShuffleReadExec(
+            GpuResizeBufferColumnarBatchExec(
+              s @ ShuffleQueryStageExec(_, _: ColumnarShuffleExchangeExecBase, _),
+              _),
+            _) =>
+        GpuResizeBufferColumnarBatchExec(a.copy(child = s), batchSize)
+      case a @ AQEShuffleReadExec(
+            GpuResizeBufferColumnarBatchExec(
+              s @ ShuffleQueryStageExec(
+                _,
+                ReusedExchangeExec(_, _: ColumnarShuffleExchangeExecBase),
+                _),
+              _),
+            _) =>
+        GpuResizeBufferColumnarBatchExec(a.copy(child = s), batchSize)
+      case s @ ShuffleQueryStageExec(_, _: ColumnarShuffleExchangeExecBase, _) =>
+        GpuResizeBufferColumnarBatchExec(s, batchSize)
+      case s @ ShuffleQueryStageExec(
+            _,
+            ReusedExchangeExec(_, _: ColumnarShuffleExchangeExecBase),
+            _) =>
+        GpuResizeBufferColumnarBatchExec(s, batchSize)
+    }
+  }
+}
@@ -18,7 +18,7 @@ package org.apache.spark.shuffle
 
 import org.apache.gluten.backendsapi.BackendsApiManager
 import org.apache.gluten.columnarbatch.ColumnarBatches
-import org.apache.gluten.config.{GlutenConfig, HashShuffleWriterType, SortShuffleWriterType}
+import org.apache.gluten.config.{GlutenConfig, GpuHashShuffleWriterType, HashShuffleWriterType, SortShuffleWriterType}
 import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller}
 import org.apache.gluten.runtime.Runtimes
 import org.apache.gluten.vectorized._
@@ -44,7 +44,7 @@ class ColumnarShuffleWriter[K, V](
   private val dep = handle.dependency.asInstanceOf[ColumnarShuffleDependency[K, V, V]]
 
   dep.shuffleWriterType match {
-    case HashShuffleWriterType | SortShuffleWriterType =>
+    case HashShuffleWriterType | SortShuffleWriterType | GpuHashShuffleWriterType =>
     // Valid shuffle writer types
     case _ =>
       throw new IllegalArgumentException(
@@ -171,6 +171,17 @@ class ColumnarShuffleWriter[K, V](
               conf.get(SHUFFLE_SORT_USE_RADIXSORT),
               partitionWriterHandle
             )
+          } else if (dep.shuffleWriterType == GpuHashShuffleWriterType) {
+            shuffleWriterJniWrapper.createGpuHashShuffleWriter(
+              numPartitions,
+              dep.nativePartitioning.getShortName,
+              GlutenShuffleUtils.getStartPartitionId(
+                dep.nativePartitioning,
+                taskContext.partitionId),
+              nativeBufferSize,
+              reallocThreshold,
+              partitionWriterHandle
+            )
           } else {
             shuffleWriterJniWrapper.createHashShuffleWriter(
               numPartitions,