
Commit d939c9f

Authored by swahtz, cursoragent, and Copilot
LangSplatV2: Integrate TensorBoard logging with evaluation images (#55)
## Summary

Integrates TensorBoard logging with evaluation images into the LangSplatV2 training pipeline, following the GARfVDB writer pattern (`GaussianSplatSegmentationWriter`) and using language-feature-specific visualizations (PCA projections, error heatmaps, feature coverage).

- **Create `LangSplatV2Writer` + `LangSplatV2WriterConfig`** in `langsplatv2/training/langsplatv2_writer.py` — duplicates the GARfVDB writer interface (CSV metrics, disk image saving, checkpoints, TensorBoard) without a cross-project dependency
- **Add `cosine_error_map()` visualization utility** to `langsplatv2/util.py` — computes per-pixel `1 - cosine_similarity` and maps it through the turbo colormap
- **Refactor `LangSplatV2Trainer` to use the writer** — replaces internal `_log_metric()` / `_save_checkpoint()` / file management with an injected `LangSplatV2Writer` instance
- **Add periodic training image logging** — PCA of predicted/GT features, feature coverage mask, cosine error heatmap (controlled by the new `log_test_images` config flag)
- **Enhance `eval()` with full image diagnostics** — beauty render, predicted-features PCA, GT-features PCA, cosine error heatmap, alpha map, and a side-by-side comparison composite
- **Update `train_langsplatv2.py`** — accepts an `io: LangSplatV2WriterConfig` parameter and instantiates the writer before training

Fixes #54

---------

Signed-off-by: Jonathan Swartz <jonathan@jswartz.info>
Co-authored-by: Cursor <cursoragent@cursor.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 545b213 commit d939c9f

12 files changed

Lines changed: 1066 additions & 199 deletions
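
The `cosine_error_map()` utility described in the summary is straightforward to picture. A minimal sketch, assuming `[H, W, C]` feature images and matplotlib's turbo colormap; the actual helper in `langsplatv2/util.py` may differ in signature and colormap plumbing:

```python
# Hedged sketch of a cosine_error_map()-style utility, not the repo's exact code.
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F


def cosine_error_map(pred: torch.Tensor, gt: torch.Tensor) -> torch.Tensor:
    """Per-pixel 1 - cosine_similarity between [H, W, C] feature images,
    mapped through the turbo colormap to an [H, W, 3] RGB image in [0, 1]."""
    error = 1.0 - F.cosine_similarity(pred, gt, dim=-1)  # [H, W], range [0, 2]
    error = (error / 2.0).clamp(0.0, 1.0)  # rescale into [0, 1] for the colormap
    rgba = plt.get_cmap("turbo")(error.detach().cpu().numpy())  # [H, W, 4]
    return torch.from_numpy(rgba[..., :3]).float()  # drop the alpha channel
```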


instance_segmentation/garfvdb/garfvdb/util.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -136,7 +136,7 @@ def pca_projection_fast(
     V = calculate_pca_projection(features_centered, n_components, center=False)
 
     # Project data onto principal components
-    projected = torch.mm(features_flat, V.to(features.device))
+    projected = torch.mm(features_centered, V.to(features.device))
 
     # Normalize to [0, 1] range
     mins = projected.min(dim=0, keepdim=True)[0]
```
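
The fix makes the projection consistent with the principal components, which are computed from the centered features. A self-contained sketch of the corrected flow; the function and helper names here are illustrative, and `pca_projection_fast()` / `calculate_pca_projection()` in the repo differ in detail:

```python
# Illustrative sketch of the corrected PCA projection flow, not the repo's exact API.
import torch


def pca_project(features: torch.Tensor, n_components: int = 3) -> torch.Tensor:
    """Project [N, C] features onto their top principal components, scaled to [0, 1]."""
    features_flat = features.reshape(-1, features.shape[-1])
    features_centered = features_flat - features_flat.mean(dim=0, keepdim=True)

    # Principal directions come from the *centered* data ...
    _, _, Vh = torch.linalg.svd(features_centered, full_matrices=False)
    V = Vh[:n_components].T  # [C, n_components]

    # ... so the projection must use the centered features too (the fix above).
    projected = torch.mm(features_centered, V)

    # Normalize each component to [0, 1] for visualization
    mins = projected.min(dim=0, keepdim=True)[0]
    maxs = projected.max(dim=0, keepdim=True)[0]
    return (projected - mins) / (maxs - mins + 1e-10)
```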

open_vocabulary_segmentation/langsplatv2/README.md

Lines changed: 85 additions & 29 deletions
````diff
@@ -1,55 +1,111 @@
-# LangSplatV2
+# LangSplatV2 (fVDB)
 
-This project implements LangSplatV2 (Li, et al. 2025) with fVDB for open-vocabulary 3D segmentation.
+LangSplatV2-style open-vocabulary 3D segmentation using [fVDB](https://github.com/openvdb/fvdb-core) and pre-trained Gaussian splat reconstructions. This implementation trains per-Gaussian sparse coefficient fields and shared CLIP-aligned codebooks on an existing reconstruction; it does not train the underlying Gaussians or colors.
 
-## Overview
+## What this implements
 
-The LangSplatV2 scene data transformation pipeline consists of two main steps:
+- **Preprocessing**: Multi-scale SAM2 masks and OpenCLIP feature encoding for each image (cached on disk).
+- **Training**: Residual VQ codebooks and per-splat sparse logits so that rendered language features match the CLIP embeddings from SAM masks. One feature level (scale) per run; train multiple levels separately and combine at inference.
+- **Compatibility**: Same feature pipeline and training setup (loss, LR, layer schedule) as the original LangSplatV2; uses fVDB for the 3D representation and rendering.
 
-1. **Multi-scale SAM2 Segmentation**: Uses SAM2 to generate segmentation masks at multiple scales (default, small, medium, large) for each image.
+## Prerequisites
 
-2. **CLIP Feature Encoding**: Encodes each segmented region using OpenCLIP to produce language-aligned features that can be used for open-vocabulary queries.
+- **SfM scene**: COLMAP, `simple_directory`, or E57 dataset (images + cameras + optional point cloud).
+- **Pre-trained Gaussian splat reconstruction**: A `.ply` or `.pt`/`.pth` checkpoint produced by e.g. [fvdb-reality-capture](https://github.com/openvdb/fvdb-reality-capture) or another fVDB-compatible pipeline. The script uses its normalization transform so the scene and Gaussians are aligned.
 
 ## Installation
 
+From this directory (`open_vocabulary_segmentation/langsplatv2/`), with the `fvdb` conda environment active:
+
 ```bash
-# Install from the fvdb-examples repository
+conda activate fvdb
 pip install -e .
+```
+
+Dependencies (see `pyproject.toml`) include `torch`, `open-clip-torch`, `fvdb-reality-capture`, `tyro`, and optionally TensorBoard for logging.
+
+## How to run
+
+Training loads the SfM scene, applies preprocessing (SAM2 + CLIP) with caching, then runs the language-feature training loop.
+
+**Minimal (COLMAP scene + PLY reconstruction):**
 
-# Or install dependencies manually
-pip install open-clip-torch fvdb-reality-capture
+```bash
+python train_langsplatv2.py \
+    --sfm-dataset-path /path/to/colmap/scene \
+    --reconstruction-path /path/to/point_cloud.ply
+```
+
+**With explicit feature level and log directory:**
+
+```bash
+python train_langsplatv2.py \
+    --sfm-dataset-path /path/to/colmap/scene \
+    --reconstruction-path /path/to/point_cloud.ply \
+    --config.feature-level 1 \
+    --log-path langsplatv2_logs
 ```
 
+**Train all three scale levels (as in the paper):**
+
+```bash
+for level in 1 2 3; do
+    python train_langsplatv2.py \
+        --sfm-dataset-path /path/to/scene \
+        --reconstruction-path /path/to/gaussians.ply \
+        --config.feature-level $level \
+        --log-path langsplatv2_logs
+done
+```
+
+
+**Useful flags:**
+
+- `--config.feature-level` — 0=default, 1=small, 2=medium, 3=large (default: 1).
+- `--config.max-steps` — Training steps (default derived from max epochs if not set).
+- `--preprocess.image-downsample-factor` — Downsample images before SAM2/CLIP (e.g. 2 for speed).
+- `--preprocess.sam2.checkpoint` — SAM2 size: `large`, `small`, `tiny`, `base_plus`.
+- `--log-path` — Directory for run subdirectories (checkpoints, metrics). Use `None` to disable saving.
+- `--io.use-tensorboard` — Log scalars (and optionally images) to TensorBoard.
+- `--use-every-n-as-val` — Hold out every N-th image for validation (e.g. 5); -1 = no validation.
+
+## Outputs
+
+With `--log-path` set (e.g. `langsplatv2_logs`), each run writes:
+
+- `log_path/run_<timestamp>/` (or `log_path/<run_name>/` if `--run-name` is set)
+  - `checkpoints/<step>/langsplatv2_ckpt.pt` — Model state and config (when `io.save_checkpoints` is True).
+  - `metrics_log.csv` — Step, loss, and optional validation metrics.
+  - `tensorboard/` — If `io.use_tensorboard` is True.
+  - `images/` — If `io.save_images` is True (e.g. feature visualizations at save steps).
+
+Preprocessing caches (SAM2 masks, CLIP features) are stored under the scene's cache directory and reused across runs.
 
-## Scene Transform Outputs
+## Preprocessing pipeline and cache format
 
-### SAM2 Masks
+The pipeline (see `LangSplatV2PreprocessConfig` in `config.py`) runs in order: optional scene normalization, point filtering, image downsampling, filtering images by visible points, **ComputeMultiScaleSAM2Masks**, **ComputeCLIPFeatures**, and optional cropping.
 
-For each image, the SAM2 transform produces:
+### SAM2 masks (per image)
 
-- `{scale}_segmentations`: Binary masks, shape `[N, H, W]`
-- `{scale}_bboxes`: Bounding boxes in XYWH format, shape `[N, 4]`
-- `{scale}_areas`: Mask areas in pixels, shape `[N]`
-- `{scale}_predicted_ious`: SAM2's IoU predictions, shape `[N]`
-- `{scale}_stability_scores`: Mask stability scores, shape `[N]`
+- `{scale}_segmentations`: `[N, H, W]` binary masks
+- `{scale}_bboxes`: `[N, 4]` XYWH
+- `{scale}_areas`, `{scale}_predicted_ious`, `{scale}_stability_scores`
 
-where `{scale}` is one of: `default`, `s` (small), `m` (medium), `l` (large).
+Scales: `default`, `s` (small, <1% area), `m` (1–10%), `l` (≥10%).
 
-Masks are categorized by area ratio:
-- **Large (l)**: area >= 10% of image
-- **Medium (m)**: 1% <= area < 10%
-- **Small (s)**: area < 1%
-- **Default**: all masks
+### CLIP features (per image)
 
-### CLIP Features
+- `features`: `[N_total, 512]` L2-normalized OpenCLIP embeddings (one per mask, concatenated over scales).
+- `seg_maps`: `[4, H, W]` — pixel → feature index per scale (-1 = no mask).
+- `lengths`: `[4]` — number of masks per scale (default, s, m, l).
 
-For each image, the CLIP transform produces:
+Training uses a single `feature_level` (0–3) to choose which scale's seg map and features to use.
 
-- `features`: CLIP embeddings, shape `[N_total, 512]`
-- `seg_maps`: Segmentation maps, shape `[4, H, W]`
-- `lengths`: Number of masks per scale, shape `[4]`
+## Training details and comparison with original LangSplatV2
 
-The `seg_maps` tensor maps each pixel to a feature index (or -1 for unmasked pixels).
+- **Feature generation**: Same as the original — crop mask region → pad to square → resize to 224 → OpenCLIP encode → L2-normalize. Scale order and seg-map indexing (default → s → m → l, cumulative) match.
+- **Optimization**: Same language-feature LR (0.0025), layer schedule (every 10k steps), and cosine loss over valid pixels with gradient scaling via the mask fraction. The scalar `train/loss` is the (mask-fraction-scaled) total loss used for backprop. For a smoother, more interpretable curve when mask coverage varies across images, use `train/cosine_loss_valid`, the mean cosine loss over valid pixels only (no mask-fraction scaling); this is the curve we log.
+- **Data sampling**: One random permutation of all training views per "epoch" (InfiniteSampler with shuffle), one view per step when `batch_size=1`, matching the original's viewpoint-stack behavior.
 
 ## References
 
````
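To make the cache format concrete, here is a hedged sketch of expanding one cached image's `seg_maps` and `features` into a dense ground-truth feature image for a chosen `feature_level`. Tensor shapes follow the README above; the repo's actual dataloading code may differ:

```python
import torch


def gt_feature_image(features: torch.Tensor, seg_maps: torch.Tensor, feature_level: int):
    """Return a dense [H, W, 512] GT feature image plus a validity mask for one scale.

    features: [N_total, 512] L2-normalized CLIP embeddings, all scales concatenated.
    seg_maps: [4, H, W] per-scale pixel -> feature index (-1 = no mask).
    """
    seg = seg_maps[feature_level]  # [H, W] indices into features, or -1
    valid = seg >= 0               # pixels covered by some SAM2 mask at this scale
    gt = torch.zeros(*seg.shape, features.shape[-1], device=features.device)
    gt[valid] = features[seg[valid].long()]  # gather one CLIP embedding per pixel
    return gt, valid
```

The `valid` mask is exactly the set of pixels the cosine loss is restricted to.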

open_vocabulary_segmentation/langsplatv2/langsplatv2/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -8,11 +8,13 @@
 )
 from .loss import calculate_langsplatv2_loss
 from .model import LangSplatV2Model
+from .training.langsplatv2_writer import LangSplatV2WriterConfig
 
 __all__ = [
     "LangSplatV2PreprocessConfig",
     "LangSplatV2TrainingConfig",
     "LangSplatV2ModelConfig",
+    "LangSplatV2WriterConfig",
     "LangSplatV2Model",
     "calculate_langsplatv2_loss",
 ]
```
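
With the config exported at the package root, configuring the writer from Python might look like the sketch below. The field names follow the README's `io.*` flags; the exact constructor signature is an assumption, not the repo's confirmed API:

```python
# Hedged sketch: field names follow the README's io.* flags; the exact
# LangSplatV2WriterConfig constructor is an assumption about the API.
from langsplatv2 import LangSplatV2WriterConfig

io_config = LangSplatV2WriterConfig(
    use_tensorboard=True,   # scalars (and optionally images) under tensorboard/
    save_images=True,       # feature visualizations under images/
    save_checkpoints=True,  # checkpoints/<step>/langsplatv2_ckpt.pt
)
```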

open_vocabulary_segmentation/langsplatv2/langsplatv2/config.py

Lines changed: 33 additions & 1 deletion
```diff
@@ -30,12 +30,29 @@ class SAM2Config:
     points_per_side: int = 32
     """Grid density for point prompts."""
 
+    points_per_batch: int = 64
+    """Points processed simultaneously by SAM2."""
+
     pred_iou_thresh: float = 0.7
     """Predicted IoU threshold for mask filtering."""
 
     stability_score_thresh: float = 0.85
     """Stability score threshold for mask filtering."""
 
+    crop_n_layers: int = 1
+    """Number of crop layers. 1 = also run SAM on image crops (matching
+    the original LangSplatV2, which uses ``crop_n_layers=1``)."""
+
+    crop_n_points_downscale_factor: int = 1
+    """Point grid downscale factor per crop layer."""
+
+    min_mask_region_area: int = 100
+    """Minimum mask region area for post-processing (matching the original
+    LangSplatV2, which uses ``min_mask_region_area=100``)."""
+
+    box_nms_thresh: float = 0.7
+    """Box NMS IoU threshold within each crop."""
+
     nms_iou_thr: float = 0.8
     """IoU threshold for mask NMS post-processing."""
 
@@ -172,8 +189,13 @@ def build_scene_transforms(
             ComputeMultiScaleSAM2Masks(
                 checkpoint=self.sam2.checkpoint,
                 points_per_side=self.sam2.points_per_side,
+                points_per_batch=self.sam2.points_per_batch,
                 pred_iou_thresh=self.sam2.pred_iou_thresh,
                 stability_score_thresh=self.sam2.stability_score_thresh,
+                crop_n_layers=self.sam2.crop_n_layers,
+                crop_n_points_downscale_factor=self.sam2.crop_n_points_downscale_factor,
+                min_mask_region_area=self.sam2.min_mask_region_area,
+                box_nms_thresh=self.sam2.box_nms_thresh,
                 nms_iou_thr=self.sam2.nms_iou_thr,
                 nms_score_thr=self.sam2.nms_score_thr,
                 nms_inner_thr=self.sam2.nms_inner_thr,
@@ -213,8 +235,13 @@ def build_sam2_transform(self):
         return ComputeMultiScaleSAM2Masks(
             checkpoint=self.sam2.checkpoint,
             points_per_side=self.sam2.points_per_side,
+            points_per_batch=self.sam2.points_per_batch,
             pred_iou_thresh=self.sam2.pred_iou_thresh,
             stability_score_thresh=self.sam2.stability_score_thresh,
+            crop_n_layers=self.sam2.crop_n_layers,
+            crop_n_points_downscale_factor=self.sam2.crop_n_points_downscale_factor,
+            min_mask_region_area=self.sam2.min_mask_region_area,
+            box_nms_thresh=self.sam2.box_nms_thresh,
             nms_iou_thr=self.sam2.nms_iou_thr,
             nms_score_thr=self.sam2.nms_score_thr,
             nms_inner_thr=self.sam2.nms_inner_thr,
@@ -294,7 +321,12 @@ class LangSplatV2TrainingConfig:
     model: LangSplatV2ModelConfig = field(default_factory=LangSplatV2ModelConfig)
     """Model architecture configuration."""
 
-    eval_at_percent: list[int] = field(default_factory=lambda: [25, 50, 75, 100])
+    log_test_images: bool = False
+    """Whether to log visualization images (PCA features, error heatmaps)
+    during training steps. Eval images are always logged when the writer
+    supports image output, regardless of this flag."""
+
+    eval_at_percent: list[int] = field(default_factory=lambda: [5, 10, 20, 30, 40, 50, 75, 100])
     """Percentages of total epochs at which to run evaluation."""
 
    save_at_percent: list[int] = field(default_factory=lambda: [50, 100])
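```

The new fields mirror the original LangSplatV2's SAM2 automatic-mask-generator settings. A hedged usage sketch, assuming `SAM2Config` and `LangSplatV2PreprocessConfig` are importable from `langsplatv2.config` as the README suggests:

```python
# Hedged sketch; import paths and construction are assumptions based on the diff.
from langsplatv2.config import LangSplatV2PreprocessConfig, SAM2Config

preprocess = LangSplatV2PreprocessConfig(
    sam2=SAM2Config(
        points_per_batch=64,               # SAM2 point-prompt batch size
        crop_n_layers=1,                   # also run SAM2 on image crops, as in the original
        crop_n_points_downscale_factor=1,  # keep full point density on crops
        min_mask_region_area=100,          # drop tiny mask regions in post-processing
        box_nms_thresh=0.7,                # per-crop box NMS IoU threshold
    )
)
sam2_transform = preprocess.build_sam2_transform()  # ComputeMultiScaleSAM2Masks
```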

open_vocabulary_segmentation/langsplatv2/langsplatv2/loss.py

Lines changed: 22 additions & 14 deletions
```diff
@@ -63,25 +63,14 @@ def calculate_langsplatv2_loss(
     """
     assert use_cosine_loss or use_l1_loss, "At least one loss type must be enabled"
 
-    # Apply mask: only compute loss on valid pixels
-    mask_expanded = mask.unsqueeze(-1).float()  # [B, H, W, 1]
-
     # Optionally normalize predicted features
     if normalize_features:
         predicted_features = predicted_features / (predicted_features.norm(dim=-1, keepdim=True) + 1e-10)
 
-    # Mask both prediction and target
-    pred_masked = predicted_features * mask_expanded
-    gt_masked = gt_features * mask_expanded
-
-    # Only compute on valid pixels to avoid diluting the loss
-    valid_pred = pred_masked[mask]  # [N_valid, clip_n_dims]
-    valid_gt = gt_masked[mask]  # [N_valid, clip_n_dims]
-
     loss_dict: dict[str, torch.Tensor] = {}
     total_loss = torch.tensor(0.0, device=predicted_features.device)
 
-    if valid_pred.shape[0] == 0:
+    if not mask.any():
         # No valid pixels - return zero loss
         loss_dict["total_loss"] = total_loss
         if use_cosine_loss:
@@ -90,13 +79,32 @@
         loss_dict["l1_loss"] = total_loss
         return loss_dict
 
+    # Gather only valid pixels (clean signal, no NaN risk from torch.empty).
+    valid_pred = predicted_features[mask]  # [N_valid, clip_n_dims]
+    valid_gt = gt_features[mask]  # [N_valid, clip_n_dims]
+
+    # The original LangSplatV2 computes .mean() over ALL H*W pixels, where
+    # masked-out pixels are zero-vectors that contribute ~0 to the sum but
+    # inflate the denominator. This implicitly scales gradients down by
+    # (N_valid / N_total). We replicate this by computing the loss on valid
+    # pixels only (clean, interpretable values) and multiplying by the mask
+    # coverage fraction so that gradient magnitudes match the original exactly:
+    #
+    #   grad_original = (1/N_total) * sum_valid(dL_i)
+    #   grad_ours     = (mask_fraction/N_valid) * sum_valid(dL_i)
+    #                 = (1/N_total) * sum_valid(dL_i)   [identical]
+    mask_fraction = mask.sum().float() / mask.numel()
+
     if use_cosine_loss:
-        cos_loss = cosine_loss(valid_pred, valid_gt)
+        cos_loss_raw = cosine_loss(valid_pred, valid_gt)
+        cos_loss = cos_loss_raw * mask_fraction
         loss_dict["cosine_loss"] = cos_loss
+        # Mean over valid pixels only (no mask_fraction); stable for logging when coverage varies
+        loss_dict["cosine_loss_valid"] = cos_loss_raw
         total_loss = total_loss + cos_loss
 
     if use_l1_loss:
-        l1 = l1_loss(valid_pred, valid_gt)
+        l1 = l1_loss(valid_pred, valid_gt) * mask_fraction
         loss_dict["l1_loss"] = l1
         total_loss = total_loss + l1
 
```
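The gradient-equivalence argument in the new comment is easy to check numerically. A standalone sketch using a plain cosine loss rather than the repo's `cosine_loss()` helper:

```python
# Standalone check of the mask-fraction gradient argument; plain cosine loss,
# not the repo's cosine_loss()/l1_loss() helpers.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
H, W, C = 8, 8, 16
gt = F.normalize(torch.randn(H, W, C), dim=-1)
mask = torch.rand(H, W) > 0.5

# Original LangSplatV2 style: zero out both tensors, mean over ALL H*W pixels.
pred_a = torch.randn(H, W, C, requires_grad=True)
m = mask.unsqueeze(-1).float()
(1 - F.cosine_similarity(pred_a * m, gt * m, dim=-1)).mean().backward()

# This implementation: mean over valid pixels only, scaled by the mask fraction.
pred_b = pred_a.detach().clone().requires_grad_(True)
mask_fraction = mask.sum().float() / mask.numel()
((1 - F.cosine_similarity(pred_b[mask], gt[mask], dim=-1)).mean() * mask_fraction).backward()

print(torch.allclose(pred_a.grad, pred_b.grad, atol=1e-6))  # True: identical gradients
```

Both variants zero the gradient on masked-out pixels and scale valid-pixel gradients by `1/N_total`, which is what the `allclose` check confirms.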
