
Commit 4b7ea5d

Add support for different models and different shapes
1 parent 04f39ef commit 4b7ea5d


10 files changed: +436 -112 lines changed


benchmarks/microbenchmarks/README.md

Lines changed: 61 additions & 1 deletion
@@ -63,14 +63,74 @@ Currently, quantization string is in same format as the one being passed in llam
 
 ### Model Types
 - `linear`: Simple linear layer
-- `ln_linear_sigmoid`: LayerNorm + Linear + Sigmoid
+- `ln_linear_<activation>`: LayerNorm + Linear + Activation, where activation can be:
+  - `ln_linear_sigmoid`: LayerNorm + Linear + Sigmoid
+  - `ln_linear_relu`: LayerNorm + Linear + ReLU
+  - `ln_linear_leakyrelu`: LayerNorm + Linear + LeakyReLU
+  - `ln_linear_relu6`: LayerNorm + Linear + ReLU6
+  - `ln_linear_gelu`: LayerNorm + Linear + GELU
+  - `ln_linear_silu`: LayerNorm + Linear + SiLU
+  - `ln_linear_hardswish`: LayerNorm + Linear + Hardswish
+- `transformer_block`: Transformer block with self-attention and MLP
 
 ### Device Options
 - `cuda`: NVIDIA GPU
 - `xpu`: Intel GPU
 - `mps`: Apple Silicon GPU
 - `cpu`: CPU fallback
 
+### Shape Generation Options
+- `custom`: Manually specify shapes as a list of [m, k, n] dimensions
+  ```yaml
+  matrix_shapes:
+    - name: "custom"
+      shapes: [
+        [1024, 1024, 1024], # [m, k, n]
+        [2048, 4096, 1024]
+      ]
+  ```
+
+- `llama`: Use LLaMa 2 70B single-node weight shapes (assumes fused attn.wqkv and ffn.w13)
+  - Generates shapes for: "attn.wqkv", "attn.w0", "ffn.w13", "ffn.w2"
+  ```yaml
+  matrix_shapes:
+    - name: "llama"
+  ```
+
+- `pow2`: Generate shapes with dimensions that are powers of 2
+  - Parameters:
+    - `min_power`: Minimum power of 2 (default: 10, which is 1024)
+    - `max_power`: Maximum power of 2 (default: 14, which is 16,384)
+  ```yaml
+  matrix_shapes:
+    - name: "pow2"
+      min_power: 10 # 2^10 = 1024
+      max_power: 12 # 2^12 = 4096
+  ```
+
+- `pow2_extended`: Generate shapes with dimensions that are powers of 2 and powers of 2 + half
+  - Parameters:
+    - `min_power`: Minimum power of 2 (default: 10, which is 1024)
+    - `max_power`: Maximum power of 2 (default: 14, which is 16,384)
+  ```yaml
+  matrix_shapes:
+    - name: "pow2_extended"
+      min_power: 10 # Generates: 1024, 1536, 2048, 3072, etc.
+      max_power: 11
+  ```
+
+- `sweep`: Generate a sweep of shapes with different powers of 2 for M, K, N dimensions
+  - Parameters:
+    - `min_power`: Minimum power of 2 (default: 8, which is 256)
+    - `max_power`: Maximum power of 2 (default: 15, which is 32,768)
+  - Note: This generates all combinations of M, K, N dimensions, which can be a large number of shapes
+  ```yaml
+  matrix_shapes:
+    - name: "sweep"
+      min_power: 8 # 2^8 = 256
+      max_power: 9 # 2^9 = 512
+  ```
+
 ## Output
 
 Results are saved to a CSV file in the specified output directory
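The number of shapes each generator produces follows directly from the option descriptions above. A minimal sketch of those counts, assuming the default `min_power`/`max_power` values listed (this snippet is illustrative and not part of the committed files):

```python
# Rough shape counts per generator, derived from the README descriptions above.

def pow2_count(min_power: int = 10, max_power: int = 14) -> int:
    # One square [val, val, val] shape per power of 2 in the range.
    return max_power - min_power + 1

def pow2_extended_count(min_power: int = 10, max_power: int = 14) -> int:
    # Two shapes per power of 2: the power itself and the power plus its half.
    return 2 * (max_power - min_power + 1)

def sweep_count(min_power: int = 8, max_power: int = 15) -> int:
    # All (M, K, N) combinations, so the per-dimension count cubed.
    return (max_power - min_power + 1) ** 3

print(pow2_count())           # 5 shapes for powers 10..14
print(pow2_extended_count())  # 10 shapes for powers 10..14
print(sweep_count())          # 512 shapes for powers 8..15, hence the note above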

benchmarks/microbenchmarks/benchmark_inference.py

Lines changed: 10 additions & 12 deletions
@@ -19,13 +19,15 @@
     BenchmarkConfig,
     BenchmarkResult,
     clean_caches,
-    create_model_and_input,
     generate_model_profile,
     model_inference_time_in_ms,
     string_to_config,
 )
 from torchao.quantization import quantize_
 from torchao.sparsity.sparse_api import sparsify_
+from torchao.testing.model_architectures import (
+    create_model_and_input_data,
+)
 
 
 def run(config: BenchmarkConfig) -> BenchmarkResult:
@@ -36,7 +38,7 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
     # Create output directory if it doesn't exist
     Path(config.output_dir).mkdir(parents=True, exist_ok=True)
 
-    base_model, input_data = create_model_and_input(
+    base_model, input_data = create_model_and_input_data(
         config.model_type,
         config.m,
         config.k,
@@ -94,16 +96,12 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
         if config.enable_profiler:
             print("Running profiler...")
             try:
-                result.profiler_json_path, result.perfetto_url = generate_model_profile(
+                result.profiler_json_path = generate_model_profile(
                     m_copy, input_data, config.profiler_file_name
                 )
-            except Exception as e:
-                print(f"Error running profiler: {e}")
-
+            except Exception:
+                print(f"Error running profiler for {config.name}")
         return result
-    except Exception as e:
-        print(f"Error in benchmark run: {e}")
-        import traceback
-
-        print(traceback.format_exc())
-        return None
+    except Exception:
+        print(f"Error in benchmark run: {config.name}")
+        return
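For reference, the model/input helper now lives in `torchao.testing.model_architectures`. A minimal standalone sketch of calling it the way `run()` does above; the arguments past `config.k` are not visible in this hunk, so the `n`, dtype, and device parameters shown here are assumptions inferred from the config fields used elsewhere in this commit:

```python
import torch

from torchao.testing.model_architectures import create_model_and_input_data

# Hypothetical standalone call mirroring run() above; keyword names beyond
# (model_type, m, k) are assumed, not confirmed by this diff.
base_model, input_data = create_model_and_input_data(
    "linear",  # model_type, e.g. "linear" or "ln_linear_sigmoid"
    1024,      # m: leading dimension of the input
    1024,      # k: input feature dimension
    1024,      # n: output feature dimension
    high_precision_dtype=torch.bfloat16,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

with torch.no_grad():
    out = base_model(input_data)
print(out.shape)
```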

benchmarks/microbenchmarks/benchmark_runner.py

Lines changed: 42 additions & 4 deletions
@@ -48,9 +48,50 @@ def get_shapes_for_config(
     name = shape_config["name"]
     if name == "custom":
         shapes.extend([(name, shape) for shape in shape_config["shapes"]])
+    elif name == "llama":
+        # LLaMa 2 70B single-node weight shapes
+        # assumes fused attn.wqkv and ffn.w13
+        bsz, seq_len = 4, 4096
+        M = bsz * seq_len
+        llama_shapes = {
+            "attn.wqkv": (M, 8192, 1280),
+            "attn.w0": (M, 1024, 8192),
+            "ffn.w13": (M, 8192, 7168),
+            "ffn.w2": (M, 3584, 8192),
+        }
+        shapes.extend([(f"{name}_{k}", v) for k, v in llama_shapes.items()])
+    elif name == "pow2":
+        # Generate shapes with dimensions that are powers of 2
+        min_power_of_2 = shape_config.get("min_power", 10)  # 1024
+        max_power_of_2 = shape_config.get("max_power", 14)  # 16,384
+        for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)):
+            val = 2**power_of_2
+            shapes.append((f"{name}_{idx}", [val, val, val]))
+    elif name == "pow2_extended":
+        # Generate shapes with dimensions that are powers of 2 and powers of 2 + half
+        min_power_of_2 = shape_config.get("min_power", 10)  # 1024
+        max_power_of_2 = shape_config.get("max_power", 14)  # 16,384
+        for idx, power_of_2 in enumerate(range(min_power_of_2, max_power_of_2 + 1)):
+            val1 = 2**power_of_2
+            val2 = 2**power_of_2 + 2 ** (power_of_2 - 1)
+            shapes.append((f"{name}_{idx*2}", [val1, val1, val1]))
+            shapes.append((f"{name}_{idx*2+1}", [val2, val2, val2]))
+    elif name == "sweep":
+        # Generate a sweep of shapes with different powers of 2 for M, K, N
+        min_p2 = shape_config.get("min_power", 8)  # 256
+        max_p2 = shape_config.get("max_power", 15)  # 32,768
+        counter = 0
+        for M_p2 in range(min_p2, max_p2 + 1):
+            M = 2**M_p2
+            for K_p2 in range(min_p2, max_p2 + 1):
+                K = 2**K_p2
+                for N_p2 in range(min_p2, max_p2 + 1):
+                    N = 2**N_p2
+                    shapes.append((f"{name}_{counter}", [M, K, N]))
+                    counter += 1
     else:
         raise NotImplementedError(
-            f"Shape config {name} not supported. Currently only supports custom shapes."
+            f"Shape config {name} not supported. Supported options: custom, llama, pow2, pow2_extended, sweep."
         )
     return shapes
5697

@@ -167,10 +208,7 @@ def run_inference_benchmarks_from_config(configs: List[BenchmarkConfig]) -> None
             if result is not None:  # Only add successful results
                 results.append(result)
         except Exception as e:
-            import traceback
-
             print(f"Error running benchmark {config.name} with error: {e}")
-            print(traceback.format_exc())
             continue
 
     # Add results to csv if there are any
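A quick usage sketch of the new generator, mirroring the assertions added in `test_benchmark_runner.py` later in this commit (run from a torchao checkout; the import path matches the test's usage of this module):

```python
from benchmarks.microbenchmarks.benchmark_runner import get_shapes_for_config

# Expand a pow2_extended entry exactly as the new branch above does.
shapes = get_shapes_for_config(
    [{"name": "pow2_extended", "min_power": 10, "max_power": 11}]
)
for name, (m, k, n) in shapes:
    print(name, m, k, n)
# Expected, per the logic above:
#   pow2_extended_0 1024 1024 1024
#   pow2_extended_1 1536 1536 1536
#   pow2_extended_2 2048 2048 2048
#   pow2_extended_3 3072 3072 3072
```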

benchmarks/microbenchmarks/test/benchmark_config.yml

Lines changed: 52 additions & 32 deletions
@@ -2,51 +2,71 @@
 benchmark_mode: "inference"
 quantization_config_recipe_names:
   # Will run a baseline inference for model by default, without quantization for comparison
-  # - "int4wo-32"
-  # - "marlin"
   - "int8wo"
-# sparsity_config_recipe_names:
+  - "int8dq"
+  - "float8dq"
+sparsity_config_recipe_names:
   # Will run a baseline inference for model by default, without sparsity for comparison
-  # - "semi-sparse"
-  # - "block"
+  - "semi-sparse"
+  - "block"
 output_dir: "benchmarks/microbenchmarks/results"
 model_params:
-  # - name: "small_bf16_linear"
-  #   matrix_shapes:
-  #     - name: "custom"
-  #       shapes: [
-  #         [1024, 1024, 1024], # [m, k, n]
-  #       ]
-  #   high_precision_dtype: "torch.bfloat16"
-  #   use_torch_compile: true
-  #   torch_compile_mode: "max-autotune"
-  #   device: "cuda"
-  #   model_type: "linear"
-  #   enable_profiler: true # Enable profiling for this model
-
-  - name: "large_bf16_ln_linear"
+  - name: "small_bf16_linear"
     matrix_shapes:
       - name: "custom"
         shapes: [
+          [1024, 1024, 1024], # [m, k, n]
           [2048, 4096, 1024],
-          # [4096, 4096, 1024]
+          [4096, 4096, 1024]
         ]
     high_precision_dtype: "torch.bfloat16"
     use_torch_compile: true
     torch_compile_mode: "max-autotune"
     device: "cuda"
     model_type: "linear"
     enable_profiler: true # Enable profiling for this model
-    enable_memory_profile: true # Enable memory profiling for this model
 
-  # - name: "cpu_fp32_linear"
-  #   matrix_shapes:
-  #     - name: "custom"
-  #       shapes: [
-  #         [4096, 4096, 1024]
-  #       ]
-  #   high_precision_dtype: "torch.float32"
-  #   use_torch_compile: false
-  #   device: "cpu"
-  #   model_type: "linear"
-  #   enable_profiler: true # Enable profiling for this model
+  - name: "ln_linear_sigmoid_cuda"
+    matrix_shapes:
+      - name: "custom"
+        shapes: [
+          [2048, 4096, 1024],
+        ]
+    high_precision_dtype: "torch.bfloat16"
+    use_torch_compile: true
+    torch_compile_mode: "max-autotune"
+    device: "cuda"
+    model_type: "ln_linear_sigmoid"
+    enable_profiler: true
+
+  - name: "bf16_transformer_block"
+    matrix_shapes:
+      - name: "custom"
+        shapes: [
+          [2048, 4096, 1024], # For transformer_block, k is the hidden dimension
+        ]
+    high_precision_dtype: "torch.bfloat16"
+    use_torch_compile: true
+    torch_compile_mode: "max-autotune"
+    device: "cuda"
+    model_type: "transformer_block" # TODO: Add a custom model (Figure out how to do this, maybe pass a .py file with model definition)
+    enable_profiler: true
+
+  - name: "large_bf16_ln_linear"
+    matrix_shapes:
+      - name: "llama" # Example of using LLaMa shapes
+      - name: "pow2" # Example of using power of 2 shapes
+        min_power: 10 # 1024
+        max_power: 12 # 4096
+      - name: "pow2_extended" # Example of using extended power of 2 shapes
+        min_power: 10 # 1024
+        max_power: 11 # 2048
+      - name: "sweep" # Example of using sweep shapes (commented out as it generates many shapes)
+        min_power: 8 # 256
+        max_power: 9 # 512
+    high_precision_dtype: "torch.bfloat16"
+    use_torch_compile: true
+    torch_compile_mode: "max-autotune"
+    device: "cuda"
+    model_type: "linear"
+    enable_profiler: true # Enable profiling for this model
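The new `large_bf16_ln_linear` entry mixes all four generators in one `matrix_shapes` list. A small sketch of how many shapes that single entry expands to, based on the generation logic added to `benchmark_runner.py` above (again, illustrative rather than part of the committed files):

```python
from benchmarks.microbenchmarks.benchmark_runner import get_shapes_for_config

# The matrix_shapes list from the large_bf16_ln_linear entry above.
matrix_shapes = [
    {"name": "llama"},
    {"name": "pow2", "min_power": 10, "max_power": 12},
    {"name": "pow2_extended", "min_power": 10, "max_power": 11},
    {"name": "sweep", "min_power": 8, "max_power": 9},
]

shapes = get_shapes_for_config(matrix_shapes)
# 4 llama shapes + 3 pow2 + 4 pow2_extended + 8 sweep = 19 shapes in total.
print(len(shapes))
```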

benchmarks/microbenchmarks/test/test_benchmark_profiler.py

Lines changed: 1 addition & 1 deletion
@@ -12,9 +12,9 @@
 
 from benchmarks.microbenchmarks.utils import (
     BenchmarkConfig,
-    ToyLinearModel,
     generate_model_profile,
 )
+from torchao.testing.model_architectures import ToyLinearModel
 
 
 class TestBenchmarkProfiler(unittest.TestCase):

benchmarks/microbenchmarks/test/test_benchmark_runner.py

Lines changed: 60 additions & 0 deletions
@@ -57,12 +57,72 @@ def tearDown(self):
         shutil.rmtree(self.temp_dir)
 
     def test_get_shapes_for_config(self):
+        # Test custom shapes
         shapes = get_shapes_for_config(
             self.test_config["model_params"][0]["matrix_shapes"]
         )
         self.assertEqual(len(shapes), 1)
         self.assertEqual(shapes[0], ("custom", [1024, 1024, 1024]))
 
+        # Test llama shapes
+        llama_shapes = get_shapes_for_config([{"name": "llama"}])
+        self.assertEqual(len(llama_shapes), 4)  # 4 LLaMa shapes
+        self.assertTrue(
+            any(name.startswith("llama_attn.wqkv") for name, _ in llama_shapes)
+        )
+        self.assertTrue(
+            any(name.startswith("llama_attn.w0") for name, _ in llama_shapes)
+        )
+        self.assertTrue(
+            any(name.startswith("llama_ffn.w13") for name, _ in llama_shapes)
+        )
+        self.assertTrue(
+            any(name.startswith("llama_ffn.w2") for name, _ in llama_shapes)
+        )
+
+        # Test pow2 shapes
+        pow2_shapes = get_shapes_for_config(
+            [{"name": "pow2", "min_power": 10, "max_power": 12}]
+        )
+        self.assertEqual(len(pow2_shapes), 3)  # 3 powers of 2 (10, 11, 12)
+        self.assertEqual(pow2_shapes[0], ("pow2_0", [1024, 1024, 1024]))  # 2^10
+        self.assertEqual(pow2_shapes[1], ("pow2_1", [2048, 2048, 2048]))  # 2^11
+        self.assertEqual(pow2_shapes[2], ("pow2_2", [4096, 4096, 4096]))  # 2^12
+
+        # Test pow2_extended shapes
+        pow2_extended_shapes = get_shapes_for_config(
+            [{"name": "pow2_extended", "min_power": 10, "max_power": 11}]
+        )
+        self.assertEqual(
+            len(pow2_extended_shapes), 4
+        )  # 2 powers of 2, each with 2 variants
+        self.assertEqual(
+            pow2_extended_shapes[0], ("pow2_extended_0", [1024, 1024, 1024])
+        )  # 2^10
+        self.assertEqual(
+            pow2_extended_shapes[1], ("pow2_extended_1", [1536, 1536, 1536])
+        )  # 2^10 + 2^9
+        self.assertEqual(
+            pow2_extended_shapes[2], ("pow2_extended_2", [2048, 2048, 2048])
+        )  # 2^11
+        self.assertEqual(
+            pow2_extended_shapes[3], ("pow2_extended_3", [3072, 3072, 3072])
+        )  # 2^11 + 2^10
+
+        # Test sweep shapes (limited to a small range for testing)
+        sweep_shapes = get_shapes_for_config(
+            [{"name": "sweep", "min_power": 8, "max_power": 9}]
+        )
+        # For min_power=8, max_power=9, we should have 8 shapes (2^3 = 8 combinations)
+        self.assertEqual(len(sweep_shapes), 8)
+        # Check that all shapes have the expected format
+        for name, shape in sweep_shapes:
+            self.assertTrue(name.startswith("sweep_"))
+            self.assertEqual(len(shape), 3)  # [M, K, N]
+            # Check that all dimensions are powers of 2 between 2^8 and 2^9
+            for dim in shape:
+                self.assertTrue(dim in [256, 512])  # 2^8, 2^9
+
     def test_get_param_combinations(self):
         model_param = self.test_config["model_params"][0]
         shapes, params = get_param_combinations(model_param)
