From e54f3ad619d82f1cd66ec1664dfb8a26fdb5c19c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz>
Date: Tue, 17 Mar 2026 16:14:48 +0000
Subject: [PATCH 1/6] feat: update docs

---
 .../get-started/quick-start/tiled-dataset.md  | 79 ++++++++++++++++---
 ratiopath/ray/aggregate/tensor_mean.py        |  2 -
 ratiopath/ray/aggregate/tensor_std.py         |  2 -
 3 files changed, 69 insertions(+), 14 deletions(-)

diff --git a/docs/learn/get-started/quick-start/tiled-dataset.md b/docs/learn/get-started/quick-start/tiled-dataset.md
index 82a1143..29b1b61 100644
--- a/docs/learn/get-started/quick-start/tiled-dataset.md
+++ b/docs/learn/get-started/quick-start/tiled-dataset.md
@@ -5,7 +5,7 @@ In the [tiling tutorial](./tiling.md), we discussed how to create a tiled datase
 Before writing the data loaders, we must consider the structure and approximate size of our dataset. A tiled dataset typically consists of two highly interrelated components:
 
 1. **The Parent Dataset:** Contains high-level metadata about the source files (e.g., Whole Slide Image file paths, original dimensions, patient IDs, or slide-level labels).
-2. **The Tile Dataset:** Contains metadata about the individual chunks derived from those parents (e.g., $x$ and $y$ coordinates, the parent `slide_id`, and sometimes precomputed tile embeddings).
+2. **The Tile Dataset:** Contains metadata about the individual chunks derived from those parents (e.g., `x` and `y` coordinates, the parent `slide_id`, and sometimes precomputed tile embeddings).
 
 If your Parquet files are small enough, you can safely load the entire dataset into RAM using Pandas. However, in digital pathology and large-scale computer vision, a tile dataset can easily span hundreds of gigabytes across multiple Parquet partitions. Loading this entirely into memory will crash your system. Instead, we rely on **lazy loading** techniques to fetch only the necessary data points exactly when the model needs them.
 
@@ -23,7 +23,7 @@ When you load a Parquet file using Hugging Face `datasets`, the library translat
 **Why it is efficient:**
 
 * **Zero RAM Overhead:** You can interact with a 200GB dataset while consuming mere megabytes of actual RAM.
-* **$O(1)$ Random Access:** Reading a specific row is virtually instantaneous.
+* *x`O(1)` Random Access:** Reading a specific row is virtually instantaneous.
 * **Smart Caching:** When you filter the dataset to find tiles belonging to a specific slide, Hugging Face streams the data, finds the matches, and caches the result on disk.
 
 ---
@@ -32,7 +32,7 @@ When you load a Parquet file using Hugging Face `datasets`, the library translat
 
 At the lowest level, we need a standard PyTorch `Dataset` that takes a subset of our tiled data eg. fetches the actual pixel data, or the precomputed embeddings.
 
-In our WSI use case, we use the `openslide` library to dynamically read pixel patches from the WSIs based on the $x$ and $y$ coordinates stored in our Arrow-mapped tile dataset.
+In our WSI use case, we use the `openslide` library to dynamically read pixel patches from the WSIs based on the `x` and `y` coordinates stored in our Arrow-mapped tile dataset.
 
 ```python
 from pathlib import Path
@@ -98,8 +98,10 @@ class SlideDataset(ConcatDataset[TileDataset]):
         slides_parquet_path: str,
         tiles_parquet_path: str,
     ) -> None:
-        slides_dataset = load_dataset("parquet", data_files=slides_parquet_path, split="train") # Train is default split name for Hugging Face datasets, even if we don't have multiple splits
-        tiles_dataset = load_dataset("parquet", data_files=tiles_parquet_path, split="train")
+        self.slides_dataset = load_dataset("parquet", data_files=slides_parquet_path, split="train") # Train is default split name for Hugging Face datasets, even if we don't have multiple splits
+        self.tiles_dataset = load_dataset("parquet", data_files=tiles_parquet_path, split="train").sort("slide_id") # Sort by slide_id for faster filtering
+
+        self._slide_id_to_indices = self._build_tile_index(self.tiles_dataset)
 
         datasets = [
             TileDataset(
@@ -107,15 +109,72 @@ class SlideDataset(ConcatDataset[TileDataset]):
                 level=slide["level"],
                 extent_x=slide["extent_x"],
                 extent_y=slide["extent_y"],
-                tiles=tiles_dataset.filter(
-                    lambda row: row["slide_id"] == slide["slide_id"],
-                    keep_in_memory=False,
-                ),
+                tiles=self.filter_tiles_by_slide(slide.id),
             )
-            for slide in slides_dataset
+            for slide in self.slides_dataset
         ]
 
         super().__init__(datasets)
+
+    
+    @staticmethod
+    def _build_tile_index(tiles: HFDataset) -> dict[str, range]:
+        """Creates a fast lookup table for slide indices.
+
+        This function builds a mapping from `slide_id` to the range of indices in the
+        `tiles` dataset that correspond to that slide. It assumes that the `tiles` dataset
+        is sorted by `slide_id`, which allows for efficient retrieval of tile indices
+        for each slide without needing to scan the entire dataset for each slide.
+
+        Args:
+            tiles: A dataset containing a `slide_id` column, sorted by `slide_id`.
+
+        Returns:
+            A dictionary mapping each `slide_id` to a range of indices in the `tiles` dataset.
+        """
+        if len(tiles) == 0:
+            return {}
+
+        # Get the underlying Arrow table (zero-copy)
+        table = tiles.data.table
+
+        # Since it's sorted, we only care about where 'slide_id' changes.
+        slide_ids = table.column("slide_id")
+
+        # Find unique values and their counts
+        counts = pc.value_counts(slide_ids)  # pyright: ignore[reportAttributeAccessIssue]
+
+        index_map = {}
+        current_offset = 0
+
+        # counts is a StructArray of {values: T, counts: int64}
+        for count in counts:
+            pair = count.as_py()
+            sid = pair["values"]
+            offset = pair["counts"]
+
+            index_map[sid] = range(current_offset, current_offset + offset)
+            current_offset += offset
+
+        return index_map
+
+    def filter_tiles_by_slide(self, slide_id: str) -> HFDataset:
+        """Returns a view of the dataset using a slice or indices.
+
+        This function creates a view of the `self.tiles` dataset that contains only
+        the tiles belonging to the specified slide. It uses the precomputed
+        `_slide_id_to_indices` mapping to efficiently retrieve the relevant tiles
+        without copying data.
+
+        Args:
+            slide_id: The ID of the slide to filter tiles.
+
+        Returns:
+            A view of the tiles dataset containing only the tiles for the specified slide.
+        """
+        tile_range = self._slide_id_to_indices.get(slide_id, range(0))
+        return self.tiles_dataset.select(tile_range)
+
 ```
 
 ### Using the Dataset
diff --git a/ratiopath/ray/aggregate/tensor_mean.py b/ratiopath/ray/aggregate/tensor_mean.py
index 44b090b..cd9a452 100644
--- a/ratiopath/ray/aggregate/tensor_mean.py
+++ b/ratiopath/ray/aggregate/tensor_mean.py
@@ -48,10 +48,8 @@ class TensorMean(AggregateFnV2[dict, np.ndarray | float]):
         ... )
         >>> # 1. Global Mean (axis=None) -> Result: 2.0
         >>> ds.aggregate(TensorMean(on="m", axis=None))
-        >>>
         >>> # 2. Batch Mean (axis=0) -> Result: np.array([[2, 2], [2, 2]])
         >>> ds.aggregate(TensorMean(on="m", axis=0))
-        >>>
         >>> # 3. Mean across Batch and Rows (axis=(0, 1)) -> Result: np.array([2, 2])
         >>> ds.aggregate(TensorMean(on="m", axis=(0, 1)))
     """
diff --git a/ratiopath/ray/aggregate/tensor_std.py b/ratiopath/ray/aggregate/tensor_std.py
index 554b9a2..1c11722 100644
--- a/ratiopath/ray/aggregate/tensor_std.py
+++ b/ratiopath/ray/aggregate/tensor_std.py
@@ -52,11 +52,9 @@ class TensorStd(AggregateFnV2[dict, np.ndarray | float]):
         ... )
         >>> # 1. Global Std (axis=None) -> All elements reduced to one scalar
         >>> ds.aggregate(TensorStd(on="m", axis=None))
-        >>>
         >>> # 2. Batch Std (axis=0) -> Result is a 2x2 matrix of std values
         >>> # calculated across the dataset rows.
         >>> ds.aggregate(TensorStd(on="m", axis=0))
-        >>>
         >>> # 3. Int shorthand (axis=1) -> Internally uses axis=(0, 1)
         >>> # Collapses batch and the first dimension of the tensor.
         >>> ds.aggregate(TensorStd(on="m", axis=1))

From 8175179a8cd3dd6612882f112b2c668bdc80f220 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz>
Date: Thu, 19 Mar 2026 09:43:41 +0000
Subject: [PATCH 2/6] feat: tiled dataset tutorial

---
 .../get-started/quick-start/tiled-dataset.md  | 93 +++++++++----------
 1 file changed, 42 insertions(+), 51 deletions(-)

diff --git a/docs/learn/get-started/quick-start/tiled-dataset.md b/docs/learn/get-started/quick-start/tiled-dataset.md
index 29b1b61..a11eed5 100644
--- a/docs/learn/get-started/quick-start/tiled-dataset.md
+++ b/docs/learn/get-started/quick-start/tiled-dataset.md
@@ -7,32 +7,17 @@ Before writing the data loaders, we must consider the structure and approximate
 1. **The Parent Dataset:** Contains high-level metadata about the source files (e.g., Whole Slide Image file paths, original dimensions, patient IDs, or slide-level labels).
 2. **The Tile Dataset:** Contains metadata about the individual chunks derived from those parents (e.g., `x` and `y` coordinates, the parent `slide_id`, and sometimes precomputed tile embeddings).
 
-If your Parquet files are small enough, you can safely load the entire dataset into RAM using Pandas. However, in digital pathology and large-scale computer vision, a tile dataset can easily span hundreds of gigabytes across multiple Parquet partitions. Loading this entirely into memory will crash your system. Instead, we rely on **lazy loading** techniques to fetch only the necessary data points exactly when the model needs them.
+If your Parquet files are small enough, you can safely load the entire dataset into RAM using Pandas. However, in digital pathology and large-scale computer vision, a tile dataset can easily span hundreds of gigabytes across multiple Parquet partitions. Loading this entirely into memory will crash your system.
 
----
+Let's build our data loading pipeline from the ground up to handle this efficiently.
 
-## 1. Lazy Loading: The Hugging Face `datasets` Backend
+-----
 
-To handle massive tabular metadata, we use the Hugging Face [datasets](https://huggingface.co/docs/datasets/index) library. It is vastly superior to standard Pandas DataFrames for deep learning data loaders because of how it handles memory.
+## 1. The Core Building Block: `TileDataset`
 
-**How it works:**
-Parquet is a heavily compressed, columnar storage format. It is great for saving disk space but terrible for the random row access required by PyTorch (e.g., `dataset[idx]`).
-
-When you load a Parquet file using Hugging Face `datasets`, the library translates the Parquet data into an uncompressed **Apache Arrow** format on your disk. It then utilizes **memory mapping** (`mmap`) to treat that file on your hard drive as if it were in your RAM.
-
-**Why it is efficient:**
-
-* **Zero RAM Overhead:** You can interact with a 200GB dataset while consuming mere megabytes of actual RAM.
-* *x`O(1)` Random Access:** Reading a specific row is virtually instantaneous.
-* **Smart Caching:** When you filter the dataset to find tiles belonging to a specific slide, Hugging Face streams the data, finds the matches, and caches the result on disk.
-
----
+At the lowest level, we need a standard PyTorch `Dataset` that represents a single Whole Slide Image. Its job is simple: take a list of tile coordinates and fetch the actual pixel data (or precomputed embeddings) for those coordinates.
 
-## 2. The Tile Dataset (Reading Individual Tiles)
-
-At the lowest level, we need a standard PyTorch `Dataset` that takes a subset of our tiled data eg. fetches the actual pixel data, or the precomputed embeddings.
-
-In our WSI use case, we use the `openslide` library to dynamically read pixel patches from the WSIs based on the `x` and `y` coordinates stored in our Arrow-mapped tile dataset.
+In our WSI use case, we use the `openslide` library to dynamically read pixel patches from the WSIs based on the `x` and `y` metadata.
 
 ```python
 from pathlib import Path
@@ -57,7 +42,7 @@ class TileDataset(Dataset):
     ) -> None:
         super().__init__()
         self.slide_path = Path(slide_path)
-        self.tiles = tiles
+        self.tiles = tiles  # We will discuss how to efficiently provide this next
         self.level = level
         self.extent_x = extent_x
         self.extent_y = extent_y
@@ -77,16 +62,34 @@ class TileDataset(Dataset):
 
 ```
 
----
+Notice that our `TileDataset` expects a `tiles` object containing the metadata. If we pass a standard Pandas DataFrame here, our RAM will quickly max out as we scale up to thousands of slides.
+
+-----
+
+## 2. Managing the Metadata: The Hugging Face `datasets` Backend
+
+To feed our `TileDataset` without crashing our system, we use the Hugging Face [datasets](https://huggingface.co/docs/datasets/index) library. Its `Dataset` class is what the `HFDataset` type hint above refers to, and it is vastly superior to standard Pandas DataFrames for deep learning because of how it handles memory via **lazy loading**.
+
+**How it works:**
+Parquet is a heavily compressed, columnar storage format. It is great for saving disk space but terrible for the random row access required by PyTorch (`dataset[idx]`). When you load a Parquet file using Hugging Face `datasets`, the library translates the Parquet data into an uncompressed **Apache Arrow** format on your disk. It then utilizes **memory mapping** (`mmap`) to treat that file on your hard drive as if it were in your RAM.
+
+**Why it is efficient:**
 
-## 3. The Main Torch Dataset (Linking Slides and Tiles)
+  * **Zero RAM Overhead:** You can interact with a 200GB dataset while consuming mere megabytes of actual RAM.
+  * **O(1) Random Access:** Reading a specific row (for example, the coordinates of one tile) for our `TileDataset` is virtually instantaneous.
+  * **Smart Caching:** When you filter the massive tile dataset to find only the chunks belonging to a specific slide, Hugging Face streams the data, finds the matches, and caches the view on disk.
 
-Now we need a unified approach that combines our parent dataset (the slides) with our tile dataset (the patches). We achieve this through **relative tile splitting**—iterating through the parent metadata and dynamically filtering the massive tile dataset to extract only the chunks relative to that specific parent.
+-----
 
-By utilizing PyTorch's `ConcatDataset`, we can seamlessly chain these individual `SlideTileDataset` instances together into one massive, unified training set.
+## 3. The Orchestrator: `SlideDataset`
+
+Now we need a unified approach that links our parent metadata (the slides) with our lazily-loaded tile metadata (the patches). We achieve this through **relative tile splitting**—iterating through the parent metadata and dynamically filtering the massive Hugging Face tile dataset to extract only the chunks relative to that specific slide.
+
+By utilizing PyTorch's `ConcatDataset`, we can seamlessly chain our individual `TileDataset` instances together into one massive, unified training set.
 
 ```python
-from datasets import load_dataset
+import pyarrow.compute as pc
+from datasets import load_dataset, Dataset as HFDataset
 from torch.utils.data import ConcatDataset
 
 
@@ -98,8 +101,10 @@ class SlideDataset(ConcatDataset[TileDataset]):
         slides_parquet_path: str,
         tiles_parquet_path: str,
     ) -> None:
-        self.slides_dataset = load_dataset("parquet", data_files=slides_parquet_path, split="train") # Train is default split name for Hugging Face datasets, even if we don't have multiple splits
-        self.tiles_dataset = load_dataset("parquet", data_files=tiles_parquet_path, split="train").sort("slide_id") # Sort by slide_id for faster filtering
+        # 'train' is the default split name for Hugging Face datasets.
+        self.slides_dataset = load_dataset("parquet", data_files=slides_parquet_path, split="train") 
+        # Sort by slide_id for much faster filtering
+        self.tiles_dataset = load_dataset("parquet", data_files=tiles_parquet_path, split="train").sort("slide_id") 
 
         self._slide_id_to_indices = self._build_tile_index(self.tiles_dataset)
 
@@ -109,7 +114,7 @@ class SlideDataset(ConcatDataset[TileDataset]):
                 level=slide["level"],
                 extent_x=slide["extent_x"],
                 extent_y=slide["extent_y"],
-                tiles=self.filter_tiles_by_slide(slide.id),
+                tiles=self.filter_tiles_by_slide(slide["slide_id"]),
             )
             for slide in self.slides_dataset
         ]
@@ -122,32 +127,27 @@ class SlideDataset(ConcatDataset[TileDataset]):
         """Creates a fast lookup table for slide indices.
 
         This function builds a mapping from `slide_id` to the range of indices in the
-        `tiles` dataset that correspond to that slide. It assumes that the `tiles` dataset
-        is sorted by `slide_id`, which allows for efficient retrieval of tile indices
-        for each slide without needing to scan the entire dataset for each slide.
-
+        `tiles` dataset that correspond to that slide. It assumes the dataset is sorted.
+        
         Args:
             tiles: A dataset containing a `slide_id` column, sorted by `slide_id`.
 
         Returns:
-            A dictionary mapping each `slide_id` to a range of indices in the `tiles` dataset.
+            A dictionary mapping each `slide_id` to a range of indices.
         """
         if len(tiles) == 0:
             return {}
 
         # Get the underlying Arrow table (zero-copy)
         table = tiles.data.table
-
-        # Since it's sorted, we only care about where 'slide_id' changes.
         slide_ids = table.column("slide_id")
 
         # Find unique values and their counts
-        counts = pc.value_counts(slide_ids)  # pyright: ignore[reportAttributeAccessIssue]
+        counts = pc.value_counts(slide_ids)
 
         index_map = {}
         current_offset = 0
 
-        # counts is a StructArray of {values: T, counts: int64}
         for count in counts:
             pair = count.as_py()
             sid = pair["values"]
@@ -161,16 +161,8 @@ class SlideDataset(ConcatDataset[TileDataset]):
     def filter_tiles_by_slide(self, slide_id: str) -> HFDataset:
         """Returns a view of the dataset using a slice or indices.
 
-        This function creates a view of the `self.tiles` dataset that contains only
-        the tiles belonging to the specified slide. It uses the precomputed
-        `_slide_id_to_indices` mapping to efficiently retrieve the relevant tiles
-        without copying data.
-
-        Args:
-            slide_id: The ID of the slide to filter tiles.
-
-        Returns:
-            A view of the tiles dataset containing only the tiles for the specified slide.
+        This uses the precomputed `_slide_id_to_indices` mapping to efficiently 
+        retrieve the relevant tiles without copying data.
         """
         tile_range = self._slide_id_to_indices.get(slide_id, range(0))
         return self.tiles_dataset.select(tile_range)
@@ -179,7 +171,7 @@ class SlideDataset(ConcatDataset[TileDataset]):
 
 ### Using the Dataset
 
-Once constructed, you can pass this `ConcatDataset` directly into a standard PyTorch `DataLoader`. PyTorch will automatically calculate the cumulative length and map global batch indices to the correct underlying slide and tile.
+Once constructed, you can pass this `SlideDataset` directly into a standard PyTorch `DataLoader`. PyTorch will automatically calculate the cumulative length and map global batch indices to the correct underlying slide and tile.
 
 ```python
 from torch.utils.data import DataLoader
@@ -197,5 +189,4 @@ dataloader = DataLoader(
     shuffle=True, 
     num_workers=8
 )
-
 ```
\ No newline at end of file

From d17c407e33642eae8c694ad86f8af3d34ada4e33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz>
Date: Thu, 19 Mar 2026 14:49:49 +0000
Subject: [PATCH 3/6] feat: bug fix

---
 .../get-started/quick-start/tiled-dataset.md  | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/docs/learn/get-started/quick-start/tiled-dataset.md b/docs/learn/get-started/quick-start/tiled-dataset.md
index a11eed5..dcbd7dd 100644
--- a/docs/learn/get-started/quick-start/tiled-dataset.md
+++ b/docs/learn/get-started/quick-start/tiled-dataset.md
@@ -102,9 +102,9 @@ class SlideDataset(ConcatDataset[TileDataset]):
         tiles_parquet_path: str,
     ) -> None:
         # 'train' is the default split name for Hugging Face datasets.
-        self.slides_dataset = load_dataset("parquet", data_files=slides_parquet_path, split="train") 
+        self.slides_dataset = load_dataset("parquet", data_files=slides_parquet_path, split="train")
         # Sort by slide_id for much faster filtering
-        self.tiles_dataset = load_dataset("parquet", data_files=tiles_parquet_path, split="train").sort("slide_id") 
+        self.tiles_dataset = load_dataset("parquet", data_files=tiles_parquet_path, split="train").sort("slide_id")
 
         self._slide_id_to_indices = self._build_tile_index(self.tiles_dataset)
 
@@ -142,19 +142,19 @@ class SlideDataset(ConcatDataset[TileDataset]):
         table = tiles.data.table
         slide_ids = table.column("slide_id")
 
-        # Find unique values and their counts
-        counts = pc.value_counts(slide_ids)
+        # Since the dataset is sorted by 'slide_id', we can use
+        # run-end encoding to find group boundaries efficiently.
+        run_ends = pc.run_end_encode(slide_ids)
+
+        values = run_ends.field("values")
+        counts = run_ends.field("run_lengths")
 
         index_map = {}
         current_offset = 0
 
-        for count in counts:
-            pair = count.as_py()
-            sid = pair["values"]
-            offset = pair["counts"]
-
-            index_map[sid] = range(current_offset, current_offset + offset)
-            current_offset += offset
+        for sid, count in zip(values.to_pylist(), counts.to_pylist()):
+            index_map[sid] = range(current_offset, current_offset + count)
+            current_offset += count
 
         return index_map
 

From 443d1555b1c34af26e1416972cdfa7189ab31c90 Mon Sep 17 00:00:00 2001
From: Jakub Pekar <46449289+JakubPekar@users.noreply.github.com>
Date: Fri, 20 Mar 2026 00:23:57 +0100
Subject: [PATCH 4/6] Update
 docs/learn/get-started/quick-start/tiled-dataset.md

---
 docs/learn/get-started/quick-start/tiled-dataset.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/learn/get-started/quick-start/tiled-dataset.md b/docs/learn/get-started/quick-start/tiled-dataset.md
index dcbd7dd..9724dfe 100644
--- a/docs/learn/get-started/quick-start/tiled-dataset.md
+++ b/docs/learn/get-started/quick-start/tiled-dataset.md
@@ -147,7 +147,7 @@ class SlideDataset(ConcatDataset[TileDataset]):
         run_ends = pc.run_end_encode(slide_ids)
 
         values = run_ends.field("values")
-        counts = run_ends.field("run_lengths")
+        counts = run_ends.field("run_ends")
 
         index_map = {}
         current_offset = 0

From 7e207dc2b2f1e594db65ff600f4ec763243d7b44 Mon Sep 17 00:00:00 2001
From: Jakub Pekar <46449289+JakubPekar@users.noreply.github.com>
Date: Fri, 20 Mar 2026 00:34:56 +0100
Subject: [PATCH 5/6] Update
 docs/learn/get-started/quick-start/tiled-dataset.md

---
 docs/learn/get-started/quick-start/tiled-dataset.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/learn/get-started/quick-start/tiled-dataset.md b/docs/learn/get-started/quick-start/tiled-dataset.md
index 9724dfe..ad0b692 100644
--- a/docs/learn/get-started/quick-start/tiled-dataset.md
+++ b/docs/learn/get-started/quick-start/tiled-dataset.md
@@ -147,14 +147,14 @@ class SlideDataset(ConcatDataset[TileDataset]):
         run_ends = pc.run_end_encode(slide_ids)
 
         values = run_ends.field("values")
-        counts = run_ends.field("run_ends")
+        ends = run_ends.field("run_ends")
 
         index_map = {}
         current_offset = 0
 
-        for sid, count in zip(values.to_pylist(), counts.to_pylist()):
-            index_map[sid] = range(current_offset, current_offset + count)
-            current_offset += count
+        for sid, end in zip(values.to_pylist(), ends.to_pylist()):
+            index_map[sid] = range(current_offset, end)
+            current_offset = end
 
         return index_map
 

From 15c757337e40e4d2137c7959eab92570ca5b18d5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Pek=C3=A1r?= <492788@mail.muni.cz>
Date: Tue, 24 Mar 2026 20:55:09 +0000
Subject: [PATCH 6/6] feat: fix bugs

---
 docs/learn/get-started/quick-start/tiled-dataset.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/learn/get-started/quick-start/tiled-dataset.md b/docs/learn/get-started/quick-start/tiled-dataset.md
index ad0b692..5e7a5f7 100644
--- a/docs/learn/get-started/quick-start/tiled-dataset.md
+++ b/docs/learn/get-started/quick-start/tiled-dataset.md
@@ -146,15 +146,16 @@ class SlideDataset(ConcatDataset[TileDataset]):
         # run-end encoding to find group boundaries efficiently.
         run_ends = pc.run_end_encode(slide_ids)
 
-        values = run_ends.field("values")
-        ends = run_ends.field("run_ends")
+        values = run_ends.values
+        ends = run_ends.run_ends
 
         index_map = {}
         current_offset = 0
 
-        for sid, end in zip(values.to_pylist(), ends.to_pylist()):
-            index_map[sid] = range(current_offset, end)
-            current_offset = end
+        for sid, end in zip(values, ends):
+            end_py = end.as_py()
+            index_map[sid.as_py()] = range(current_offset, end_py)
+            current_offset = end_py
 
         return index_map