feat: support cache for ovis-image (#530)

DefTruth · web-flow · commit 6c21efcd519f · 2025-12-05T10:41:12.000+08:00
* support cache for ovis-image

* support cache for ovis-image

* support cache for ovis-image

* support cache for ovis-image
diff --git a/README.md b/README.md
@@ -114,9 +114,9 @@ The comparison between **cache-dit** and other algorithms shows that within a sp
 
 | 📚Model | Cache  | CP | TP | 📚Model | Cache  | CP | TP |
 |:---|:---|:---|:---|:---|:---|:---|:---|
+| **🔥[Z-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✔️🔥 | ✔️🔥 | **🔥[Ovis-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✖️ | ✖️ |
 | **🔥[FLUX.2: 56B](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✔️🔥 | ✔️🔥 | **🔥[HunyuanVideo 1.5](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✖️ | ✖️ |
-| **🔥[Z-Image-Turbo](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✔️🔥 | ✔️🔥 | **🎉[FLUX.1 `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
-| **🎉[FLUX.1](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[FLUX.1-Fill `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
+| **🎉[FLUX.1](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[FLUX.1 `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
 | **🎉[FLUX.1-Fill](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[Qwen-Image `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
 | **🎉[Qwen-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[Qwen...Edit `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
 | **🎉[Qwen...Edit](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[Qwen...E...Plus `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
diff --git a/docs/User_Guide.md b/docs/User_Guide.md
@@ -81,9 +81,9 @@ Currently, **cache-dit** library supports almost **Any** Diffusion Transformers
 
 | 📚Model | Cache  | CP | TP | 📚Model | Cache  | CP | TP |
 |:---|:---|:---|:---|:---|:---|:---|:---|
+| **🔥[Z-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✔️🔥 | ✔️🔥 | **🔥[Ovis-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✖️ | ✖️ |
 | **🔥[FLUX.2: 56B](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✔️🔥 | ✔️🔥 | **🔥[HunyuanVideo 1.5](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✖️ | ✖️ |
-| **🔥[Z-Image-Turbo](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️🔥 | ✔️🔥 | ✔️🔥 | **🎉[FLUX.1 `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
-| **🎉[FLUX.1](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[FLUX.1-Fill `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
+| **🎉[FLUX.1](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[FLUX.1 `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
 | **🎉[FLUX.1-Fill](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[Qwen-Image `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
 | **🎉[Qwen-Image](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[Qwen...Edit `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
 | **🎉[Qwen...Edit](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✔️ | **🎉[Qwen...E...Plus `Q`](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline)** | ✔️ | ✔️ | ✖️ |
diff --git a/examples/pipeline/run_ovis_image.py b/examples/pipeline/run_ovis_image.py
@@ -0,0 +1,109 @@
+import os
+import sys
+
+sys.path.append("..")
+
+import time
+import torch
+from diffusers import OvisImagePipeline, OvisImageTransformer2DModel
+from utils import get_args, strify, cachify, MemoryTracker, create_profiler_from_args
+import cache_dit
+
+args = get_args()
+print(args)
+
+pipe = OvisImagePipeline.from_pretrained(
+    (
+        args.model_path
+        if args.model_path is not None
+        else os.environ.get(
+            "OVIS_IMAGE_DIR",
+            "AIDC-AI/Ovis-Image-7B",
+        )
+    ),
+    torch_dtype=torch.bfloat16,
+)
+
+if args.cache:
+    cachify(args, pipe)
+
+assert isinstance(pipe.transformer, OvisImageTransformer2DModel)
+if args.quantize:
+    pipe.transformer = cache_dit.quantize(
+        pipe.transformer,
+        quant_type=args.quantize_type,
+        exclude_layers=[
+            "embedder",
+            "embed",
+        ],
+    )
+    pipe.text_encoder = cache_dit.quantize(
+        pipe.text_encoder,
+        quant_type=args.quantize_type,
+    )
+
+pipe.to("cuda")
+
+if args.attn is not None:
+    if hasattr(pipe.transformer, "set_attention_backend"):
+        pipe.transformer.set_attention_backend(args.attn)
+        print(f"Set attention backend to {args.attn}")
+
+
+if args.compile:
+    cache_dit.set_compile_configs()
+    pipe.transformer = torch.compile(pipe.transformer)
+    pipe.text_encoder = torch.compile(pipe.text_encoder)
+    pipe.vae = torch.compile(pipe.vae)
+
+
+prompt = 'A creative 3D artistic render where the text "OVIS-IMAGE" is written in a bold, expressive handwritten brush style using thick, wet oil paint. The paint is a mix of vibrant rainbow colors (red, blue, yellow) swirling together like toothpaste or impasto art. You can see the ridges of the brush bristles and the glossy, wet texture of the paint. The background is a clean artist\'s canvas. Dynamic lighting creates soft shadows behind the floating paint strokes. Colorful, expressive, tactile texture, 4k detail.'
+if args.prompt is not None:
+    prompt = args.prompt
+
+
+def run_pipe():
+    steps = args.steps if args.steps is not None else 28
+    if args.profile and args.steps is None:
+        steps = 3
+    image = pipe(
+        prompt,
+        negative_prompt="",
+        height=1024 if args.height is None else args.height,
+        width=1024 if args.width is None else args.width,
+        num_inference_steps=steps,
+        guidance_scale=5.0,  # has separate cfg for ovis image
+        generator=torch.Generator("cpu").manual_seed(0),
+    ).images[0]
+    return image
+
+
+# warmup
+_ = run_pipe()
+
+memory_tracker = MemoryTracker() if args.track_memory else None
+
+if memory_tracker:
+    memory_tracker.__enter__()
+
+start = time.time()
+if args.profile:
+    profiler = create_profiler_from_args(args, profile_name="ovis_image_inference")
+    with profiler:
+        image = run_pipe()
+    print(f"Profiler traces saved to: {profiler.output_dir}/{profiler.trace_path.name}")
+else:
+    image = run_pipe()
+end = time.time()
+
+if memory_tracker:
+    memory_tracker.__exit__(None, None, None)
+    memory_tracker.report()
+
+cache_dit.summary(pipe)
+
+time_cost = end - start
+save_path = f"ovis_image.{strify(args, pipe)}.png"
+print(f"Time cost: {time_cost:.2f}s")
+print(f"Saving image to {save_path}")
+image.save(save_path)
diff --git a/src/cache_dit/caching/block_adapters/__init__.py b/src/cache_dit/caching/block_adapters/__init__.py
@@ -833,3 +833,31 @@ def zimage_adapter(pipe, **kwargs) -> BlockAdapter:
             "ZImageTransformer2DModel is not available in the current diffusers version. "
             "Please upgrade diffusers>=0.36.dev0 to use this adapter."
         )
+
+
+@BlockAdapterRegister.register("OvisImage")
+def ovis_image_adapter(pipe, **kwargs) -> BlockAdapter:
+    try:
+        from diffusers import OvisImageTransformer2DModel
+
+        _relaxed_assert_transformer(pipe.transformer, OvisImageTransformer2DModel)
+        return BlockAdapter(
+            pipe=pipe,
+            transformer=pipe.transformer,
+            blocks=[
+                pipe.transformer.transformer_blocks,
+                pipe.transformer.single_transformer_blocks,
+            ],
+            forward_pattern=[
+                ForwardPattern.Pattern_1,
+                ForwardPattern.Pattern_1,
+            ],
+            check_forward_pattern=True,
+            has_separate_cfg=True,
+            **kwargs,
+        )
+    except ImportError:
+        raise ImportError(
+            "OvisImageTransformer2DModel is not available in the current diffusers version. "
+            "Please upgrade diffusers>=0.36.dev0 to use this adapter."
+        )
diff --git a/src/cache_dit/caching/block_adapters/block_registers.py b/src/cache_dit/caching/block_adapters/block_registers.py
@@ -25,6 +25,7 @@ class BlockAdapterRegister:
         "Kandinsky5",
         "ChronoEdit",
         "HunyuanVideo15",
+        "OvisImage",
     ]
 
     @classmethod

| Original file line number | Diff line number | Diff line change |
|:---|:---|:---|
| | | `@@ -25,6 +25,7 @@ class BlockAdapterRegister:` |
| `25` | `25` | `"Kandinsky5",` |
| `26` | `26` | `"ChronoEdit",` |
| `27` | `27` | `"HunyuanVideo15",` |
| | `28` | `+ "OvisImage",` |
| `28` | `29` | `]` |
| `29` | `30` | |
| `30` | `31` | `@classmethod` |