[Improvement](external) Increase scanner concurrency

Gabriel39 · Gabriel39 · commit 920b8eed3a74 · 2025-11-24T16:30:56.000+08:00
diff --git a/be/src/pipeline/exec/file_scan_operator.cpp b/be/src/pipeline/exec/file_scan_operator.cpp
@@ -31,6 +31,24 @@
 
 namespace doris::pipeline {
 #include "common/compile_check_begin.h"
+
+// Upper bound on the number of scanners this file scan operator may run concurrently.
+// A non-zero max_file_scanners_concurrency is used as the per-instance budget; otherwise the
+// budget is 2 * (remote scan scheduler max threads) / parallelism, clamped to at least 1 so
+// that integer division cannot yield a zero budget when parallelism exceeds twice the thread
+// count. The per-instance budget is then multiplied by the operator parallelism.
+int FileScanLocalState::max_scanners_concurrency(RuntimeState* state) {
+    const int parallelism = _parent->parallelism(state);
+    int per_instance = state->max_file_scanners_concurrency();
+    if (per_instance == 0) {
+        per_instance =
+                state->get_query_ctx()->get_remote_scan_scheduler()->get_max_threads() * 2 /
+                parallelism;
+        per_instance = per_instance == 0 ? 1 : per_instance;
+    }
+    return per_instance * parallelism;
+}
+
 Status FileScanLocalState::_init_scanners(std::list<vectorized::ScannerSPtr>* scanners) {
     if (_split_source->num_scan_ranges() == 0) {
         _eos = true;
@@ -44,9 +53,9 @@ Status FileScanLocalState::_init_scanners(std::list<vectorized::ScannerSPtr>* sc
 
     auto& p = _parent->cast<FileScanOperatorX>();
     // There's only one scan range for each backend in batch split mode. Each backend only starts up one ScanNode instance.
-    uint32_t shard_num = std::min(vectorized::ScannerScheduler::get_remote_scan_thread_num() /
-                                          p.query_parallel_instance_num(),
-                                  _max_scanners);
+    uint32_t shard_num = std::min(
+            vectorized::ScannerScheduler::get_remote_scan_thread_num() / p.parallelism(state()),
+            _max_scanners);
     shard_num = std::max(shard_num, 1U);
     _kv_cache.reset(new vectorized::ShardedKVCache(shard_num));
     for (int i = 0; i < _max_scanners; ++i) {
@@ -85,15 +94,15 @@ void FileScanLocalState::set_scan_ranges(RuntimeState* state,
             auto split_source = scan_range.split_source;
             RuntimeProfile::Counter* get_split_timer = ADD_TIMER(custom_profile(), "GetSplitTime");
 
-            _max_scanners = calc_max_scanners(p.query_parallel_instance_num());
+            _max_scanners = calc_max_scanners(p.parallelism(state));
             _split_source = std::make_shared<vectorized::RemoteSplitSourceConnector>(
                     state, get_split_timer, split_source.split_source_id, split_source.num_splits,
                     _max_scanners);
         }
     }
 
     if (!p._batch_split_mode) {
-        _max_scanners = calc_max_scanners(p.query_parallel_instance_num());
+        _max_scanners = calc_max_scanners(p.parallelism(state));
         if (_split_source == nullptr) {
             _split_source = std::make_shared<vectorized::LocalSplitSourceConnector>(scan_ranges,
                                                                                     _max_scanners);
diff --git a/be/src/pipeline/exec/file_scan_operator.h b/be/src/pipeline/exec/file_scan_operator.h
@@ -54,6 +54,7 @@ class FileScanLocalState final : public ScanLocalState<FileScanLocalState> {
                          const std::vector<TScanRangeParams>& scan_ranges) override;
     int parent_id() { return _parent->node_id(); }
     std::string name_suffix() const override;
+    int max_scanners_concurrency(RuntimeState* state) override;
 
 private:
     friend class vectorized::FileScanner;
@@ -83,8 +84,8 @@ class FileScanOperatorX final : public ScanOperatorX<FileScanLocalState> {
     bool is_file_scan_operator() const override { return true; }
 
     // There's only one scan range for each backend in batch split mode. Each backend only starts up one ScanNode instance.
-    int query_parallel_instance_num() const override {
-        return _batch_split_mode ? 1 : _query_parallel_instance_num;
+    int parallelism(RuntimeState* state) const override {
+        return _batch_split_mode ? 1 : ScanOperatorX<FileScanLocalState>::parallelism(state);
     }
 
 private:
diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h
@@ -113,6 +113,9 @@ class OperatorBase {
     [[nodiscard]] virtual Status terminate(RuntimeState* state) = 0;
     [[nodiscard]] virtual Status close(RuntimeState* state);
     [[nodiscard]] virtual int node_id() const = 0;
+    [[nodiscard]] virtual int parallelism(RuntimeState* state) const {
+        return _is_serial_operator ? 1 : state->query_parallel_instance_num();
+    }
 
     [[nodiscard]] virtual Status set_child(OperatorPtr child) {
         if (_child && child != nullptr) {
diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp
@@ -72,6 +72,22 @@ bool ScanLocalState<Derived>::should_run_serial() const {
     return _parent->cast<typename Derived::Parent>()._should_run_serial;
 }
 
+// Upper bound on the number of scanners this scan operator may run concurrently.
+// A non-zero num_scanner_threads is used as the per-instance budget; otherwise the budget is
+// 2 * (scan scheduler max threads) / parallelism, clamped to at least 1 so that integer
+// division cannot yield a zero budget when parallelism exceeds twice the thread count.
+// The per-instance budget is then multiplied by the operator parallelism.
+int ScanLocalStateBase::max_scanners_concurrency(RuntimeState* state) {
+    const int parallelism = _parent->parallelism(state);
+    int per_instance = state->num_scanner_threads();
+    if (per_instance == 0) {
+        per_instance =
+                state->get_query_ctx()->get_scan_scheduler()->get_max_threads() * 2 / parallelism;
+        per_instance = per_instance == 0 ? 1 : per_instance;
+    }
+    return per_instance * parallelism;
+}
+
 template <typename Derived>
 Status ScanLocalState<Derived>::init(RuntimeState* state, LocalStateInfo& info) {
     RETURN_IF_ERROR(PipelineXLocalState<>::init(state, info));
@@ -1052,19 +1060,14 @@ template <typename Derived>
 Status ScanLocalState<Derived>::_start_scanners(
         const std::list<std::shared_ptr<vectorized::ScannerDelegate>>& scanners) {
     auto& p = _parent->cast<typename Derived::Parent>();
-    // If scan operator is serial operator(like topn), its real parallelism is 1.
-    // Otherwise, its real parallelism is query_parallel_instance_num.
-    // query_parallel_instance_num of olap table is usually equal to session var parallel_pipeline_task_num.
-    // for file scan operator, its real parallelism will be 1 if it is in batch mode.
-    // Related pr:
-    // https://github.com/apache/doris/pull/42460
-    // https://github.com/apache/doris/pull/44635
-    const int parallism_of_scan_operator =
-            p.is_serial_operator() ? 1 : p.query_parallel_instance_num();
-
-    _scanner_ctx = vectorized::ScannerContext::create_shared(
-            state(), this, p._output_tuple_desc, p.output_row_descriptor(), scanners, p.limit(),
-            _scan_dependency, parallism_of_scan_operator);
+    _scanner_ctx = vectorized::ScannerContext::create_shared(state(), this, p._output_tuple_desc,
+                                                             p.output_row_descriptor(), scanners,
+                                                             p.limit(), _scan_dependency
+#ifdef BE_TEST
+                                                             ,
+                                                             max_scanners_concurrency(state())
+#endif
+    );
     return Status::OK();
 }
 
@@ -1273,8 +1276,6 @@ Status ScanOperatorX<LocalStateType>::init(const TPlanNode& tnode, RuntimeState*
         }
     }
 
-    _query_parallel_instance_num = state->query_parallel_instance_num();
-
     return Status::OK();
 }
 
diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h
@@ -83,6 +83,14 @@ class ScanLocalStateBase : public PipelineXLocalState<> {
     virtual TPushAggOp::type get_push_down_agg_type() = 0;
 
     virtual int64_t get_push_down_count() = 0;
+    // Max number of scanners this operator may run concurrently. By default: a per-instance
+    // budget (num_scanner_threads if non-zero, else derived from the scan scheduler's max
+    // threads) multiplied by the operator's real parallelism. That parallelism is 1 for a serial
+    // operator (like topn) or a file scan operator in batch mode; otherwise it is
+    // query_parallel_instance_num (usually parallel_pipeline_task_num for olap tables).
+    // Related pr: https://github.com/apache/doris/pull/42460
+    //             https://github.com/apache/doris/pull/44635
+    [[nodiscard]] virtual int max_scanners_concurrency(RuntimeState* state);
 
     [[nodiscard]] std::string get_name() { return _parent->get_name(); }
 
@@ -363,10 +371,6 @@ class ScanOperatorX : public OperatorX<LocalStateType> {
 
     [[nodiscard]] virtual bool is_file_scan_operator() const { return false; }
 
-    [[nodiscard]] virtual int query_parallel_instance_num() const {
-        return _query_parallel_instance_num;
-    }
-
     [[nodiscard]] size_t get_reserve_mem_size(RuntimeState* state) override;
 
     const std::vector<TRuntimeFilterDesc>& runtime_filter_descs() override {
@@ -443,8 +447,6 @@ class ScanOperatorX : public OperatorX<LocalStateType> {
     int64_t _push_down_count = -1;
     const int _parallel_tasks = 0;
 
-    int _query_parallel_instance_num = 0;
-
     std::vector<int> _topn_filter_source_node_ids;
 };
 
diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h
@@ -145,6 +145,11 @@ class RuntimeState {
     int num_scanner_threads() const {
         return _query_options.__isset.num_scanner_threads ? _query_options.num_scanner_threads : 0;
     }
+    int max_file_scanners_concurrency() const {
+        return _query_options.__isset.max_file_scanners_concurrency
+                       ? _query_options.max_file_scanners_concurrency
+                       : num_scanner_threads();
+    }
     int min_scan_concurrency_of_scan_scheduler() const {
         return _query_options.__isset.min_scan_scheduler_concurrency
                        ? _query_options.min_scan_scheduler_concurrency
diff --git a/be/src/vec/exec/scan/scanner_context.cpp b/be/src/vec/exec/scan/scanner_context.cpp
@@ -56,7 +56,12 @@ ScannerContext::ScannerContext(
         RuntimeState* state, pipeline::ScanLocalStateBase* local_state,
         const TupleDescriptor* output_tuple_desc, const RowDescriptor* output_row_descriptor,
         const std::list<std::shared_ptr<vectorized::ScannerDelegate>>& scanners, int64_t limit_,
-        std::shared_ptr<pipeline::Dependency> dependency, int parallism_of_scan_operator)
+        std::shared_ptr<pipeline::Dependency> dependency
+#ifdef BE_TEST
+        ,
+        int num_parallel_instances
+#endif
+        )
         : HasTaskExecutionCtx(state),
           _state(state),
           _local_state(local_state),
@@ -68,9 +73,14 @@ ScannerContext::ScannerContext(
           limit(limit_),
           _scanner_scheduler_global(state->exec_env()->scanner_scheduler()),
           _all_scanners(scanners.begin(), scanners.end()),
-          _parallism_of_scan_operator(parallism_of_scan_operator),
           _min_scan_concurrency_of_scan_scheduler(_state->min_scan_concurrency_of_scan_scheduler()),
           _min_scan_concurrency(_state->min_scan_concurrency_of_scanner()) {
+#ifndef BE_TEST
+    _max_scan_concurrency =
+            std::min(local_state->max_scanners_concurrency(state), cast_set<int>(scanners.size()));
+#else
+    _max_scan_concurrency = num_parallel_instances;
+#endif
     DCHECK(_state != nullptr);
     DCHECK(_output_row_descriptor == nullptr ||
            _output_row_descriptor->tuple_descriptors().size() == 1);
@@ -143,33 +153,6 @@ Status ScannerContext::init() {
         _set_scanner_done();
     }
 
-    // The overall target of our system is to make full utilization of the resources.
-    // At the same time, we dont want too many tasks are queued by scheduler, that is not necessary.
-    // Each scan operator can submit _max_scan_concurrency scanner to scheduelr if scheduler has enough resource.
-    // So that for a single query, we can make sure it could make full utilization of the resource.
-    _max_scan_concurrency = _state->num_scanner_threads();
-    if (_max_scan_concurrency == 0) {
-        // Why this is safe:
-        /*
-            1. If num cpu cores is less than or equal to 24:
-                _max_concurrency_of_scan_scheduler will be 96. _parallism_of_scan_operator will be 1 or C/2.
-                so _max_scan_concurrency will be 96 or (96 * 2 / C).
-                For a single scan node, most scanner it can submit will be 96 or (96 * 2 / C) * (C / 2) which is 96 too.
-                So a single scan node could make full utilization of the resource without sumbiting all its tasks.
-            2. If num cpu cores greater than 24:
-                _max_concurrency_of_scan_scheduler will be 4 * C. _parallism_of_scan_operator will be 1 or C/2.
-                so _max_scan_concurrency will be 4 * C or (4 * C * 2 / C).
-                For a single scan node, most scanner it can submit will be 4 * C or (4 * C * 2 / C) * (C / 2) which is 4 * C too.
-
-            So, in all situations, when there is only one scan node, it could make full utilization of the resource.
-        */
-        _max_scan_concurrency =
-                _min_scan_concurrency_of_scan_scheduler / _parallism_of_scan_operator;
-        _max_scan_concurrency = _max_scan_concurrency == 0 ? 1 : _max_scan_concurrency;
-    }
-
-    _max_scan_concurrency = std::min(_max_scan_concurrency, (int32_t)_pending_scanners.size());
-
     // when user not specify scan_thread_num, so we can try downgrade _max_thread_num.
     // becaue we found in a table with 5k columns, column reader may ocuppy too much memory.
     // you can refer https://github.com/apache/doris/issues/35340 for details.
diff --git a/be/src/vec/exec/scan/scanner_context.h b/be/src/vec/exec/scan/scanner_context.h
@@ -123,8 +123,12 @@ class ScannerContext : public std::enable_shared_from_this<ScannerContext>,
                    const TupleDescriptor* output_tuple_desc,
                    const RowDescriptor* output_row_descriptor,
                    const std::list<std::shared_ptr<vectorized::ScannerDelegate>>& scanners,
-                   int64_t limit_, std::shared_ptr<pipeline::Dependency> dependency,
-                   int num_parallel_instances);
+                   int64_t limit_, std::shared_ptr<pipeline::Dependency> dependency
+#ifdef BE_TEST
+                   ,
+                   int num_parallel_instances
+#endif
+    );
 
     ~ScannerContext() override;
     Status init();
@@ -206,7 +210,6 @@ class ScannerContext : public std::enable_shared_from_this<ScannerContext>,
     /// 3. `_free_blocks_memory_usage` < `_max_bytes_in_queue`, remains enough memory to scale up
     /// 4. At most scale up `MAX_SCALE_UP_RATIO` times to `_max_thread_num`
     void _set_scanner_done();
-    Status _try_to_scale_up();
 
     RuntimeState* _state = nullptr;
     pipeline::ScanLocalStateBase* _local_state = nullptr;
@@ -247,7 +250,6 @@ class ScannerContext : public std::enable_shared_from_this<ScannerContext>,
     RuntimeProfile::Counter* _scale_up_scanners_counter = nullptr;
     std::shared_ptr<ResourceContext> _resource_ctx;
     std::shared_ptr<pipeline::Dependency> _dependency = nullptr;
-    const int _parallism_of_scan_operator;
     std::shared_ptr<doris::vectorized::TaskHandle> _task_handle;
 
     std::atomic<int64_t> _block_memory_usage = 0;
@@ -256,6 +258,10 @@ class ScannerContext : public std::enable_shared_from_this<ScannerContext>,
 
     int32_t _min_scan_concurrency_of_scan_scheduler = 0;
     int32_t _min_scan_concurrency = 1;
+    // The overall target of our system is to make full utilization of the resources.
+    // At the same time, we don't want too many tasks queued by the scheduler; that is unnecessary.
+    // Each scan operator can submit up to _max_scan_concurrency scanners to the scheduler if the
+    // scheduler has enough resources, so that a single query can fully utilize the resources.
     int32_t _max_scan_concurrency = 0;
 
     std::shared_ptr<ScanTask> _pull_next_scan_task(std::shared_ptr<ScanTask> current_scan_task,
diff --git a/be/test/scan/scanner_context_test.cpp b/be/test/scan/scanner_context_test.cpp
@@ -176,16 +176,13 @@ TEST_F(ScannerContextTest, test_init) {
     Status st = scanner_context->init();
     ASSERT_TRUE(st.ok());
-    // actual max_scan_concurrency will be 2 since user specified num_scanner_threads is 2.
+    // Under BE_TEST _max_scan_concurrency comes from the ctor's num_parallel_instances argument.
-    ASSERT_EQ(scanner_context->_max_scan_concurrency, 2);
+    ASSERT_EQ(scanner_context->_max_scan_concurrency, 1);
 
     query_options.__set_num_scanner_threads(0);
     state->set_query_options(query_options);
 
     st = scanner_context->init();
     ASSERT_TRUE(st.ok());
-
-    ASSERT_EQ(scanner_context->_max_scan_concurrency,
-              scanner_context->_min_scan_concurrency_of_scan_scheduler / parallel_tasks);
 }
 
 TEST_F(ScannerContextTest, test_serial_run) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -94,6 +94,7 @@ public class SessionVariable implements Serializable, Writable {
     public static final String LOCAL_EXCHANGE_FREE_BLOCKS_LIMIT = "local_exchange_free_blocks_limit";
     public static final String SCAN_QUEUE_MEM_LIMIT = "scan_queue_mem_limit";
     public static final String NUM_SCANNER_THREADS = "num_scanner_threads";
+    public static final String MAX_FILE_SCANNERS_CONCURRENCY = "max_file_scanners_concurrency";
     public static final String MIN_SCANNER_CONCURRENCY = "min_scanner_concurrnency";
     public static final String MIN_SCAN_SCHEDULER_CONCURRENCY = "min_scan_scheduler_concurrency";
     public static final String QUERY_TIMEOUT = "query_timeout";
@@ -982,6 +983,11 @@ public static double getHotValueThreshold() {
     })
     public int numScannerThreads = 0;
 
+    @VariableMgr.VarAttr(name = MAX_FILE_SCANNERS_CONCURRENCY, needForward = true, description = {
+            "FileScanNode 扫描数据的最大并发",
+            "The max threads to read data of FileScanNode"})
+    public int maxFileScannersConcurrency = 16;
+
     @VariableMgr.VarAttr(name = LOCAL_EXCHANGE_FREE_BLOCKS_LIMIT)
     public int localExchangeFreeBlocksLimit = 4;
 
@@ -4727,6 +4733,7 @@ public TQueryOptions toThrift() {
         tResult.setLocalExchangeFreeBlocksLimit(localExchangeFreeBlocksLimit);
         tResult.setScanQueueMemLimit(maxScanQueueMemByte);
         tResult.setNumScannerThreads(numScannerThreads);
+        tResult.setMaxFileScannersConcurrency(maxFileScannersConcurrency);
         tResult.setMaxColumnReaderNum(maxColumnReaderNum);
         tResult.setParallelPrepareThreshold(parallelPrepareThreshold);
 
diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift
@@ -415,6 +415,7 @@ struct TQueryOptions {
   // Target file size in bytes for Iceberg write operations
   // Default 0 means use config::iceberg_sink_max_file_size
   178: optional i64 iceberg_write_target_file_size_bytes = 0;
+  179: optional i32 max_file_scanners_concurrency = 0;
 
   // For cloud, to control if the content would be written into file cache
   // In write path, to control if the content would be written into file cache.