3 files changed: +12 −12 lines changed

@@ -25,17 +25,17 @@ def string_to_config(s):
         return None
     elif s == "float8_rowwise":
         return Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
-    elif s == "int4_weight_float8_rowwise_activation":
+    elif s == "int4_groupwise_weight_float8_rowwise_activation":
         return Float8DynamicActivationInt4WeightConfig()
-    elif s == "int4_weight_only_hqq":
+    elif s == "int4_groupwise_hqq_weight_only":
         return Int4WeightOnlyConfig(
             group_size=32,
             int4_packing_format="tile_packed_to_4d",
             int4_choose_qparams_algorithm="hqq",
         )
-    elif s == "int8_weight_only":
+    elif s == "int8_rowwise_weight_only":
         return Int8WeightOnlyConfig()
-    elif s == "int8":
+    elif s == "int8_rowwise":
         return Int8DynamicActivationInt8WeightConfig()
     else:
         raise AssertionError(f"unsupported {s}")
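
For orientation, a minimal sketch of how a config returned by `string_to_config` is applied, assuming a recent torchao build (the toy model and shapes are illustrative, not part of this PR):

```python
# Minimal sketch, assuming a recent torchao; the toy model is illustrative.
import torch
import torch.nn as nn
from torchao.quantization import Int8DynamicActivationInt8WeightConfig, quantize_

model = nn.Sequential(nn.Linear(4096, 4096)).to(torch.bfloat16)

# "int8_rowwise" maps to dynamic int8 activations + int8 weights
quantize_(model, Int8DynamicActivationInt8WeightConfig())
```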
@@ -21,9 +21,9 @@ time python -u benchmarks/quantization/eval_accuracy_for_readme.py $BASE_ARGS 2>
 
 # quantized recipes
 # note:
-# * `int4_weight_float8_rowwise_activation` doesn't work with dtype_map auto: https://gist.github.com/vkuzo/6b128681b628744d445c553cdeac8a85
-# * `int4_weight_only_hqq` only works on A100
-for quant_recipe in float8_rowwise int4_weight_float8_rowwise_activation int4_weight_only_hqq int8_weight_only int8; do
+# * `int4_groupwise_weight_float8_rowwise_activation` doesn't work with dtype_map auto: https://gist.github.com/vkuzo/6b128681b628744d445c553cdeac8a85
+# * `int4_groupwise_hqq_weight_only` only works on A100
+for quant_recipe in float8_rowwise int4_groupwise_weight_float8_rowwise_activation int4_groupwise_hqq_weight_only int8_rowwise_weight_only int8_rowwise; do
     time python -u benchmarks/quantization/eval_accuracy_for_readme.py $BASE_ARGS --quant_recipe_name $quant_recipe 2>&1 | tee -a "$LOG_FILE"
 done
 
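
For readers who prefer to drive the same sweep from Python, a hedged equivalent of the loop above (plumbing for `$BASE_ARGS` and `$LOG_FILE` is omitted; add those arguments as your setup requires):

```python
# Hedged Python equivalent of the shell loop above; BASE_ARGS/LOG_FILE omitted.
import subprocess

RECIPES = [
    "float8_rowwise",
    "int4_groupwise_weight_float8_rowwise_activation",
    "int4_groupwise_hqq_weight_only",  # A100 only, per the note above
    "int8_rowwise_weight_only",
    "int8_rowwise",
]

for recipe in RECIPES:
    subprocess.run(
        ["python", "-u", "benchmarks/quantization/eval_accuracy_for_readme.py",
         "--quant_recipe_name", recipe],
        check=True,
    )
```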
@@ -5,12 +5,12 @@ Typically quantization algorithms will have different schemes for how the activa
 
 All the following benchmarks are for `meta-llama/Llama-3.1-8B` using `lm-eval`, measured on an H100 GPU.
 
-| Technique | wikitext-perplexity | winogrande | checkpoint size (GB) |
-| --------- | ------------------- | ---------- | -------------------- |
+| weight | activation | wikitext-perplexity | winogrande | checkpoint size (GB) |
+| ------ | ---------- | ------------------- | ---------- | -------------------- |
-| baseline (bfloat16) | 7.3315 | 0.7380 | 16.1 |
-| float8_rowwise weight, float8_rowwise activation | 7.4197 | 0.7388 | 9.1 |
-| int8_weight_only | 7.3451 | 0.7340 | 9.1 |
-| int8 weight, int8 activation | 7.4535 | 0.7285 | 9.1 |
+| bfloat16 | bfloat16 | 7.3315 | 0.7380 | 16.1 |
+| float8_rowwise | float8_rowwise | 7.4197 | 0.7388 | 9.1 |
+| int8_rowwise | bfloat16 | 7.3451 | 0.7340 | 9.1 |
+| int8_rowwise | int8_rowwise | 7.4535 | 0.7285 | 9.1 |
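
The `int8_rowwise` entries use one quantization scale per output row of the weight. A first-principles sketch of that scheme (illustrative shapes; not torchao's actual kernels):

```python
# First-principles sketch of rowwise int8 weight quantization; illustrative only.
import torch

w = torch.randn(4096, 4096, dtype=torch.bfloat16)
# One scale per output row, sized so each row's max magnitude maps to 127.
scale = w.abs().amax(dim=1, keepdim=True).float() / 127.0
w_int8 = torch.clamp(torch.round(w.float() / scale), -128, 127).to(torch.int8)
w_deq = w_int8.float() * scale  # dequantized approximation of w
```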
 
 To reproduce, run the following command:
 