|
| 1 | +import os |
| 2 | +import sys |
| 3 | + |
| 4 | +sys.path.append("..") |
| 5 | + |
| 6 | +import time |
| 7 | +import torch |
| 8 | +from diffusers import OvisImagePipeline, OvisImageTransformer2DModel |
| 9 | +from utils import get_args, strify, cachify, MemoryTracker, create_profiler_from_args |
| 10 | +import cache_dit |
| 11 | + |
# Parse the example's command-line options and print them.
args = get_args()
print(args)

# Checkpoint to load: an explicit --model-path wins; otherwise fall back to
# the OVIS_IMAGE_DIR environment variable, and finally to the hub model id.
model_id = args.model_path
if model_id is None:
    model_id = os.environ.get("OVIS_IMAGE_DIR", "AIDC-AI/Ovis-Image-7B")

# Load the pipeline with bfloat16 weights.
pipe = OvisImagePipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
| 26 | + |
# Optionally enable caching on the pipeline via the shared example helper.
if args.cache:
    cachify(args, pipe)

assert isinstance(pipe.transformer, OvisImageTransformer2DModel)

# Optionally quantize the transformer and the text encoder with the same
# quantization type. Only the transformer call passes an exclusion list.
if args.quantize:
    quant_type = args.quantize_type
    transformer_excludes = ["embedder", "embed"]
    pipe.transformer = cache_dit.quantize(
        pipe.transformer,
        quant_type=quant_type,
        exclude_layers=transformer_excludes,
    )
    pipe.text_encoder = cache_dit.quantize(pipe.text_encoder, quant_type=quant_type)

# Move the pipeline to the GPU after the optional cache/quantize steps.
pipe.to("cuda")
| 46 | + |
# Switch the attention backend only when one was requested and the
# transformer exposes the setter; otherwise this is silently skipped.
if args.attn is not None and hasattr(pipe.transformer, "set_attention_backend"):
    pipe.transformer.set_attention_backend(args.attn)
    print(f"Set attention backend to {args.attn}")


# Optionally wrap the transformer, text encoder and VAE with torch.compile,
# in that order, after applying cache_dit's compile configuration.
if args.compile:
    cache_dit.set_compile_configs()
    for component in ("transformer", "text_encoder", "vae"):
        setattr(pipe, component, torch.compile(getattr(pipe, component)))
| 58 | + |
| 59 | + |
# Built-in demo prompt, used unless the command line supplies its own.
DEFAULT_PROMPT = 'A creative 3D artistic render where the text "OVIS-IMAGE" is written in a bold, expressive handwritten brush style using thick, wet oil paint. The paint is a mix of vibrant rainbow colors (red, blue, yellow) swirling together like toothpaste or impasto art. You can see the ridges of the brush bristles and the glossy, wet texture of the paint. The background is a clean artist\'s canvas. Dynamic lighting creates soft shadows behind the floating paint strokes. Colorful, expressive, tactile texture, 4k detail.'
prompt = DEFAULT_PROMPT if args.prompt is None else args.prompt
| 63 | + |
| 64 | + |
def run_pipe():
    """Run the pipeline once on the module-level ``prompt`` and return the first image.

    The step count is ``args.steps`` when given; otherwise 3 when profiling
    is enabled and 28 in the normal case. Height and width default to 1024.
    The generator is a CPU generator re-seeded with 0 on every call.
    """
    if args.steps is not None:
        num_steps = args.steps
    elif args.profile:
        # No explicit step count while profiling: use a short run.
        num_steps = 3
    else:
        num_steps = 28

    out_height = args.height if args.height is not None else 1024
    out_width = args.width if args.width is not None else 1024
    seeded_generator = torch.Generator("cpu").manual_seed(0)

    result = pipe(
        prompt,
        negative_prompt="",
        height=out_height,
        width=out_width,
        num_inference_steps=num_steps,
        guidance_scale=5.0,  # has separate cfg for ovis image
        generator=seeded_generator,
    )
    return result.images[0]
| 79 | + |
| 80 | + |
# Local import keeps this block self-contained; nullcontext stands in for the
# memory tracker when tracking is disabled.
from contextlib import nullcontext

# Warmup: one untimed run before the measured one.
_ = run_pipe()

memory_tracker = MemoryTracker() if args.track_memory else None

# Entering the tracker through `with` guarantees its __exit__ runs even when
# the measured run raises, instead of leaving the tracker entered.
with memory_tracker if memory_tracker else nullcontext():
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    if args.profile:
        profiler = create_profiler_from_args(args, profile_name="ovis_image_inference")
        with profiler:
            image = run_pipe()
        print(f"Profiler traces saved to: {profiler.output_dir}/{profiler.trace_path.name}")
    else:
        image = run_pipe()
    end = time.perf_counter()

# Report only after the tracker has been exited, as before.
if memory_tracker:
    memory_tracker.report()

cache_dit.summary(pipe)

# The output file name encodes the run configuration via strify().
time_cost = end - start
save_path = f"ovis_image.{strify(args, pipe)}.png"
print(f"Time cost: {time_cost:.2f}s")
print(f"Saving image to {save_path}")
image.save(save_path)
0 commit comments