Commit c86d18b

Add sam3-image model (#164)
1 parent 8d81ad3 commit c86d18b

12 files changed: +749 / -6 lines changed

Cargo.toml

Lines changed: 8 additions & 2 deletions
```diff
@@ -1,7 +1,7 @@
 [package]
 name = "usls"
 edition = "2021"
-version = "0.1.10"
+version = "0.1.11"
 rust-version = "1.85"
 description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
 repository = "https://github.com/jamjamjon/usls"
@@ -131,7 +131,8 @@ rfdetr = []
 rtdetr = []
 rtmo = []
 rtmpose = []
-sam = []
+sam = [] # SAM, SAM2
+sam3 = ["tokenizers"] # SAM3
 slanet = ["pipeline"]
 smolvlm = ["tokenizers"]
 sapiens = []
@@ -148,6 +149,7 @@ all-models = [
     "yolo",
     "yoloe",
     "sam",
+    "sam3",
     "clip",
     "apisr",
     "image-classifier",
@@ -330,6 +332,10 @@ required-features = ["sam"]
 name = "sam2"
 required-features = ["sam"]
 
+[[example]]
+name = "sam3"
+required-features = ["sam3"]
+
 [[example]]
 name = "sapiens"
 required-features = ["sapiens"]
```

README.md

Lines changed: 2 additions & 1 deletion
```diff
@@ -134,7 +134,8 @@ usls = { version = "latest-version", features = [ "cuda" ] }
 | [RTMW](https://arxiv.org/abs/2407.08634) | Keypoint Detection | `rtmpose` | [demo](examples/rtmw) |
 | [RTMO](https://github.com/open-mmlab/mmpose/tree/main/projects/rtmo) | Keypoint Detection | `rtmo` | [demo](examples/rtmo) |
 | [SAM](https://github.com/facebookresearch/segment-anything) | Segment Anything | `sam` | [demo](examples/sam) |
-| [SAM2](https://github.com/facebookresearch/segment-anything-2) | Segment Anything | `sam` | [demo](examples/sam) |
+| [SAM2](https://github.com/facebookresearch/segment-anything-2) | Segment Anything | `sam2` | [demo](examples/sam2) |
+| [SAM3](https://github.com/facebookresearch/segment-anything-3) | Segment Anything | `sam3` | [demo](examples/sam3) |
 | [MobileSAM](https://github.com/ChaoningZhang/MobileSAM) | Segment Anything | `sam` | [demo](examples/sam) |
 | [EdgeSAM](https://github.com/chongzhou96/EdgeSAM) | Segment Anything | `sam` | [demo](examples/sam) |
 | [SAM-HQ](https://github.com/SysCV/sam-hq) | Segment Anything | `sam` | [demo](examples/sam) |
```

assets/000000136466.jpg

98.2 KB

assets/sam3-demo.jpg

69 KB

examples/sam3/README.md

Lines changed: 40 additions & 0 deletions
New file:

````markdown
### Quick Start

```bash
# Text prompt
cargo run -r -F sam3 -F cuda --example sam3 -- --device cuda --dtype q4f16 --source ./assets/sam3-demo.jpg -p shoe
cargo run -r -F sam3 -F cuda --example sam3 -- --device cuda --dtype bnb4 --source ./assets/sam3-demo.jpg -p "person in red vest"
cargo run -r -F sam3 -F cuda --example sam3 -- --device cuda --dtype q8 --source ./assets/sam3-demo.jpg -p "boy in blue vest"

# Visual prompt: a single bbox
cargo run -r -F sam3 -F cuda --example sam3 -- --device cuda --source ./assets/sam3-demo.jpg -p "visual;pos:480,290,110,360"

# Visual prompt: multiple boxes (positive and negative)
cargo run -r -F sam3 -F cuda --example sam3 -- --device cuda --source ./assets/sam3-demo.jpg -p "visual;pos:480,290,110,360;neg:370,280,115,375"

# Text prompt, with and without a negative box
cargo run -r -F sam3 -F cuda --example sam3 -- --device cuda --dtype fp16 --source ./assets/000000136466.jpg -p "handle"
cargo run -r -F sam3 -F cuda --example sam3 -- --device cuda --dtype fp16 --source ./assets/000000136466.jpg -p "handle;neg:40,183,278,21"

# Multiple prompts (queries) over multiple sources
cargo run -r -F sam3 -F cuda --example sam3 -- --device cuda --dtype fp16 --source ./assets/sam3-demo.jpg --source ./assets/bus.jpg -p shoe -p face -p person
```

### Prompt Format

```
"text;pos:x,y,w,h;neg:x,y,w,h"
```

- `text`: Text description
- `pos:x,y,w,h`: Positive box (find similar regions)
- `neg:x,y,w,h`: Negative box (exclude region)

### Results

![](https://github.com/jamjamjon/assets/releases/download/sam3/demo.jpg)
![](https://github.com/jamjamjon/assets/releases/download/sam3/demo2.jpg)
````
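The prompt strings documented above are parsed into `Sam3Prompt` values via `FromStr`; that is exactly what `examples/sam3/main.rs` does with `s.parse()`. A minimal sketch of building prompts programmatically from that grammar — the concrete error type of the `FromStr` impl isn't shown in this commit, so it is stringified here the same way the example does:

```rust
use usls::models::Sam3Prompt;

fn build_prompts() -> anyhow::Result<Vec<Sam3Prompt>> {
    // One prompt of each documented flavor:
    // text-only, visual (positive + negative boxes), and text + negative box.
    [
        "shoe",
        "visual;pos:480,290,110,360;neg:370,280,115,375",
        "handle;neg:40,183,278,21",
    ]
    .iter()
    .map(|s| s.parse::<Sam3Prompt>())
    .collect::<Result<Vec<_>, _>>()
    // The FromStr error type isn't known here; surface it as a string, as main.rs does.
    .map_err(|e| anyhow::anyhow!("{}", e))
}
```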

examples/sam3/main.rs

Lines changed: 110 additions & 0 deletions
New file:

```rust
use anyhow::Result;
use usls::{
    models::{Sam3Prompt, SAM3},
    Annotator, Config, DataLoader,
};

#[derive(argh::FromArgs)]
/// SAM3 - Segment Anything Model 3
struct Args {
    /// device (cpu:0, cuda:0, etc.)
    #[argh(option, default = "String::from(\"cpu:0\")")]
    device: String,

    /// source image paths (can specify multiple)
    #[argh(
        option,
        default = "vec![
            String::from(\"./assets/sam3-demo.jpg\"),
            // String::from(\"./assets/bus.jpg\")
        ]"
    )]
    source: Vec<String>,

    /// prompts: "text;pos:x,y,w,h;neg:x,y,w,h" (can specify multiple)
    #[argh(option, short = 'p')]
    prompt: Vec<String>,

    /// confidence threshold (default: 0.5)
    #[argh(option, default = "0.5")]
    conf: f32,

    /// batch size min (default: 1)
    #[argh(option, default = "1")]
    batch_min: usize,

    /// batch size (default: 1)
    #[argh(option, default = "1")]
    batch: usize,

    /// batch size max (default: 4)
    #[argh(option, default = "4")]
    batch_max: usize,

    /// dtype
    #[argh(option, default = "String::from(\"q4f16\")")]
    dtype: String,

    /// show mask
    #[argh(switch)]
    show_mask: bool,
}

fn main() -> Result<()> {
    tracing_subscriber::fmt()
        .with_env_filter(tracing_subscriber::EnvFilter::from_default_env())
        .with_timer(tracing_subscriber::fmt::time::ChronoLocal::rfc_3339())
        .init();

    let args: Args = argh::from_env();

    // Parse prompts
    if args.prompt.is_empty() {
        anyhow::bail!("No prompt. Use -p \"text\" or -p \"visual;pos:x,y,w,h\"");
    }
    let prompts: Vec<Sam3Prompt> = args
        .prompt
        .iter()
        .map(|s| s.parse())
        .collect::<std::result::Result<Vec<_>, _>>()
        .map_err(|e| anyhow::anyhow!("{}", e))?;

    // Build model
    let config = Config::sam3_image_predictor()
        .with_batch_size_all_min_opt_max(args.batch_min, args.batch, args.batch_max)
        .with_device_all(args.device.parse()?)
        .with_dtype_all(args.dtype.parse()?)
        .with_class_confs(&[args.conf])
        .with_num_dry_run_all(1)
        .commit()?;
    let mut model = SAM3::new(config)?;

    // Annotator
    let annotator = Annotator::default().with_mask_style(
        usls::Style::mask()
            .with_draw_mask_polygon_largest(true)
            .with_visible(args.show_mask),
    );
    let output_dir = usls::Dir::Current.base_dir_with_subs(&["runs", model.spec()])?;

    // DataLoader with batch iteration
    let dataloader = DataLoader::from_paths(&args.source)?
        .with_batch(args.batch)
        .with_progress_bar(true)
        .build()?;

    // Process in batches
    for batch in dataloader {
        let ys = model.forward(&batch, &prompts)?;
        println!("ys: {:?}", ys);

        for (img, y) in batch.iter().zip(ys.iter()) {
            annotator
                .annotate(img, y)?
                .save(output_dir.join(format!("{}.jpg", usls::timestamp(None))))?;
        }
    }

    usls::perf(false);
    Ok(())
}
```

src/core/dataloader.rs

Lines changed: 19 additions & 0 deletions
```diff
@@ -96,6 +96,25 @@ impl FromStr for DataLoader {
 }
 
 impl DataLoader {
+    /// Create DataLoader from multiple paths
+    pub fn from_paths<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
+        let paths: VecDeque<PathBuf> = paths.iter().map(|p| p.as_ref().to_path_buf()).collect();
+        let nf = paths.len() as u64;
+
+        if paths.is_empty() {
+            anyhow::bail!("No paths provided");
+        }
+
+        info!("Found {:?} x{}", MediaType::Image(Location::Local), nf);
+
+        Ok(Self {
+            paths: Some(paths),
+            media_type: MediaType::Image(Location::Local),
+            nf,
+            ..Default::default()
+        })
+    }
+
     pub fn new(source: &str) -> Result<Self> {
         // paths & media_type
         let (paths, media_type) = Self::try_load_all(source)?;
```
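For context, a short usage sketch of the new constructor as the sam3 example drives it; the image paths are placeholders, and `with_batch`, `with_progress_bar`, and `build` come from the existing builder:

```rust
use usls::DataLoader;

fn iterate_images() -> anyhow::Result<()> {
    // Placeholder paths; from_paths() treats every entry as a local image,
    // unlike DataLoader::new(), which probes the source string for its media type.
    let paths = ["./assets/sam3-demo.jpg", "./assets/bus.jpg"];

    let dataloader = DataLoader::from_paths(&paths)?
        .with_batch(2)
        .with_progress_bar(true)
        .build()?;

    for batch in dataloader {
        println!("batch of {} image(s)", batch.iter().count());
    }
    Ok(())
}
```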

src/models/mod.rs

Lines changed: 9 additions & 3 deletions
```diff
@@ -136,6 +136,9 @@ mod yoloe;
 #[cfg(feature = "sam")]
 mod sam2;
 
+#[cfg(feature = "sam3")]
+mod sam3;
+
 #[cfg(feature = "rtdetr")]
 mod d_fine;
 
@@ -160,6 +163,12 @@ pub use yolo::*;
 #[cfg(feature = "sam")]
 pub use sam::*;
 
+#[cfg(feature = "sam")]
+pub use sam2::*;
+
+#[cfg(feature = "sam3")]
+pub use sam3::*;
+
 #[cfg(feature = "clip")]
 pub use clip::*;
 
@@ -244,9 +253,6 @@ pub use yolop::*;
 #[cfg(feature = "yoloe")]
 pub use yoloe::*;
 
-#[cfg(feature = "sam")]
-pub use sam2::*;
-
 #[cfg(feature = "rtdetr")]
 pub use d_fine::*;
 
```

src/models/sam3/README.md

Lines changed: 11 additions & 0 deletions
New file:

```markdown
# SAM3: Segment Anything with Concepts

A powerful multimodal segmentation model supporting text, bounding box, and combined prompts.

## References

- Official: [facebookresearch/sam3](https://github.com/facebookresearch/sam3)

## Example

See [examples/sam3](../../../examples/sam3)
```

src/models/sam3/config.rs

Lines changed: 40 additions & 0 deletions
New file:

```rust
use crate::Config;

/// Model configuration for `SAM3`
impl Config {
    /// SAM3 base configuration
    ///
    /// - Input size: 1008x1008 (FitExact, no aspect ratio preserved)
    /// - Normalization: mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]
    /// - Tokenizer: CLIP BPE (max_length=32)
    /// - Confidence threshold: 0.5
    pub fn sam3() -> Self {
        Self::default()
            .with_name("sam3")
            .with_batch_size_all_min_opt_max(1, 1, 4)
            .with_visual_encoder_ixx(0, 1, 3.into())
            .with_visual_encoder_ixx(0, 2, 1008.into())
            .with_visual_encoder_ixx(0, 3, 1008.into())
            .with_textual_encoder_ixx(0, 1, 32.into())
            .with_resize_mode(crate::ResizeMode::FitExact)
            .with_resize_filter("Bilinear")
            .with_image_mean(&[0.5, 0.5, 0.5])
            .with_image_std(&[0.5, 0.5, 0.5])
            .with_normalize(true)
            .with_find_contours(true)
            .with_class_confs(&[0.5])
            .with_model_max_length(32) // CLIP max length, enables auto padding/truncation
            .with_tokenizer_file("sam3/tokenizer.json")
            .with_tokenizer_config_file("sam3/tokenizer_config.json")
            .with_special_tokens_map_file("sam3/special_tokens_map.json")
            .with_config_file("sam3/config.json")
    }

    pub fn sam3_image_predictor() -> Self {
        Self::sam3()
            .with_visual_encoder_file("vision-encoder.onnx")
            .with_textual_encoder_file("text-encoder.onnx")
            .with_encoder_file("geometry-encoder.onnx")
            .with_decoder_file("decoder.onnx")
    }
}
```
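These defaults are meant to be overridden per run before `commit()`. A condensed sketch of how the image-predictor config is consumed, mirroring `examples/sam3/main.rs`; the CUDA device, `q4f16` dtype, and the 0.4 threshold are illustrative choices, not requirements:

```rust
use usls::{models::SAM3, Config};

fn build_sam3() -> anyhow::Result<SAM3> {
    // Start from the sam3 defaults above, then override device/dtype/threshold.
    let config = Config::sam3_image_predictor()
        .with_device_all("cuda:0".parse()?) // or "cpu:0"
        .with_dtype_all("q4f16".parse()?)   // or "fp16", "q8", "bnb4"
        .with_class_confs(&[0.4])           // override the 0.5 default
        .commit()?;
    Ok(SAM3::new(config)?)
}
```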
