################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
"""
Module to read a Paimon table into a Ray Dataset using the Ray Datasource API.
"""
import heapq
import itertools
import logging
from functools import partial
from typing import Iterable, List, Optional

import pyarrow
from ray.data.datasource import Datasource

from pypaimon.read.split import Split
from pypaimon.read.table_read import TableRead
from pypaimon.schema.data_types import PyarrowFieldParser

logger = logging.getLogger(__name__)


class PaimonDatasource(Datasource):
    """
    Ray Data Datasource implementation for reading Paimon tables.

    This datasource enables distributed parallel reading of Paimon table splits,
    allowing Ray to read multiple splits concurrently across the cluster.
    """
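    # Illustrative usage (a sketch, assuming the standard Ray Data API): an instance
    # of this class is typically consumed through ray.data.read_datasource, e.g.
    #
    #     import ray
    #     ds = ray.data.read_datasource(PaimonDatasource(table_read, splits))
    #     ds.show(5)
    #
    # See the end of this module for a sketch of how `table_read` and `splits`
    # might be obtained from pypaimon.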

    def __init__(self, table_read: TableRead, splits: List[Split]):
        """
        Initialize PaimonDatasource.

        Args:
            table_read: TableRead instance for reading data
            splits: List of splits to read
        """
        self.table_read = table_read
        self.splits = splits
        self._schema = None

    def get_name(self) -> str:
        identifier = self.table_read.table.identifier
        table_name = identifier.get_full_name() if hasattr(identifier, 'get_full_name') else str(identifier)
        return f"PaimonTable({table_name})"

    def estimate_inmemory_data_size(self) -> Optional[int]:
        if not self.splits:
            return 0

        # Sum up file sizes from all splits
        total_size = sum(split.file_size for split in self.splits)
        return total_size if total_size > 0 else None

    @staticmethod
    def _distribute_splits_into_equal_chunks(
        splits: Iterable[Split], n_chunks: int
    ) -> List[List[Split]]:
        """
        Greedily distribute the splits across chunks (one chunk per read task) by
        file size, assigning each split, largest first, to the currently smallest
        chunk, so that the chunk totals are as even as possible.
        """
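        # Illustrative example (not from the codebase): splits with file sizes
        # [9, 7, 5, 3] distributed into n_chunks=2 are assigned largest-first to
        # the currently smallest chunk: 9 -> chunk 0, 7 -> chunk 1, 5 -> chunk 1,
        # 3 -> chunk 0, giving two chunks of total size 12 each.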
        chunks = [list() for _ in range(n_chunks)]
        chunk_sizes = [(0, chunk_id) for chunk_id in range(n_chunks)]
        heapq.heapify(chunk_sizes)

        # From largest to smallest, add the splits to the smallest chunk one at a time
        for split in sorted(
            splits, key=lambda s: s.file_size if hasattr(s, 'file_size') and s.file_size > 0 else 0, reverse=True
        ):
            smallest_chunk = heapq.heappop(chunk_sizes)
            chunks[smallest_chunk[1]].append(split)
            split_size = split.file_size if hasattr(split, 'file_size') and split.file_size > 0 else 0
            heapq.heappush(
                chunk_sizes,
                (smallest_chunk[0] + split_size, smallest_chunk[1]),
            )

        return chunks

    def get_read_tasks(self, parallelism: int) -> List:
        """Return a list of read tasks that can be executed in parallel."""
        from ray.data.datasource import ReadTask
        from ray.data.block import BlockMetadata

        # Validate parallelism parameter
        if parallelism < 1:
            raise ValueError(f"parallelism must be at least 1, got {parallelism}")

        # Get schema for metadata
        if self._schema is None:
            self._schema = PyarrowFieldParser.from_paimon_schema(self.table_read.read_type)

        # Adjust parallelism if it exceeds the number of splits
        if parallelism > len(self.splits):
            parallelism = len(self.splits)
            logger.warning(
                f"Reducing the parallelism to {parallelism}, as that is the number of splits"
            )

        # Store necessary information for creating readers in Ray workers
        # Extract these to avoid serializing the entire self object in closures
        table = self.table_read.table
        predicate = self.table_read.predicate
        read_type = self.table_read.read_type
        schema = self._schema

        # Create a partial function to avoid capturing self in closure
        # This reduces serialization overhead (see https://github.com/ray-project/ray/issues/49107)
        def _get_read_task(
            splits: List[Split],
            table=table,
            predicate=predicate,
            read_type=read_type,
            schema=schema,
        ) -> Iterable[pyarrow.Table]:
            """Read function that will be executed by Ray workers."""
            from pypaimon.read.table_read import TableRead
            worker_table_read = TableRead(table, predicate, read_type)

            # Read all splits in this chunk
            arrow_table = worker_table_read.to_arrow(splits)

            # Return as a list to allow Ray to split into multiple blocks if needed
            if arrow_table is not None and arrow_table.num_rows > 0:
                return [arrow_table]
            else:
                # Return empty table with correct schema
                empty_table = pyarrow.Table.from_arrays(
                    [pyarrow.array([], type=field.type) for field in schema],
                    schema=schema
                )
                return [empty_table]

        # Use partial to create read function without capturing self
        get_read_task = partial(
            _get_read_task,
            table=table,
            predicate=predicate,
            read_type=read_type,
            schema=schema,
        )

        read_tasks = []

        # Distribute splits across tasks using load balancing algorithm
        for chunk_splits in self._distribute_splits_into_equal_chunks(self.splits, parallelism):
            if not chunk_splits:
                continue

            # Calculate metadata for this chunk
            total_rows = 0
            total_size = 0

            for split in chunk_splits:
                if predicate is None:
                    # Only estimate rows if no predicate (predicate filtering changes row count)
                    if hasattr(split, 'row_count') and split.row_count > 0:
                        total_rows += split.row_count
                if hasattr(split, 'file_size') and split.file_size > 0:
                    total_size += split.file_size

            input_files = list(itertools.chain.from_iterable(
                split.file_paths
                for split in chunk_splits
                if hasattr(split, 'file_paths') and split.file_paths
            ))

            # For PrimaryKey tables, we can't accurately estimate num_rows before merge
            if table and table.is_primary_key_table:
                num_rows = None  # Let Ray calculate actual row count after merge
            elif predicate is not None:
                num_rows = None  # Can't estimate with predicate filtering
            else:
                num_rows = total_rows if total_rows > 0 else None
            size_bytes = total_size if total_size > 0 else None

            metadata = BlockMetadata(
                num_rows=num_rows,
                size_bytes=size_bytes,
                input_files=input_files if input_files else None,
                exec_stats=None,  # Will be populated by Ray during execution
            )

            # TODO: per_task_row_limit is not supported in Ray 2.48.0, will be added in future versions
            read_tasks.append(
                ReadTask(
                    read_fn=lambda splits=chunk_splits: get_read_task(splits),
                    metadata=metadata,
                    schema=schema,
                )
            )

        return read_tasks
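
# Example wiring (a sketch, assuming the usual pypaimon read-builder API; `table`
# is assumed to be a pypaimon Table obtained from a catalog and is not defined in
# this module):
#
#     read_builder = table.new_read_builder()
#     table_read = read_builder.new_read()
#     splits = read_builder.new_scan().plan().splits()
#
#     import ray
#     dataset = ray.data.read_datasource(PaimonDatasource(table_read, splits))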