diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py
index 70da622c73..4710e08e5a 100644
--- a/test/integration/test_integration.py
+++ b/test/integration/test_integration.py
@@ -69,11 +69,12 @@ from torchao.quantization.utils import (
     compute_error as SQNR,
 )
 
-from torchao.testing.utils import skip_if_rocm
+from torchao.testing.utils import skip_if_rocm, skip_if_xpu
 from torchao.utils import (
     benchmark_model,
     check_cpu_version,
     check_xpu_version,
+    get_current_accelerator_device,
     is_fbcode,
     is_sm_at_least_89,
     is_sm_at_least_90,
@@ -93,8 +94,9 @@ torch.manual_seed(0)
 
 config.cache_size_limit = 100
 
+_DEVICE = get_current_accelerator_device()
 
-COMMON_DEVICES = ["cpu", "cuda"]
+COMMON_DEVICES = ["cpu", _DEVICE]
 
 COMMON_DTYPES = [torch.float32, torch.float16, torch.bfloat16]
 
@@ -191,10 +193,10 @@ def wrapper(*args, **kwargs):
         )
         device = args[2]
         dtype = kwargs["test_dtype"] if "test_dtype" in kwargs else args[3]
-        if device == "cuda" and not torch.cuda.is_available():
-            raise unittest.SkipTest("Need CUDA available.")
+        if device == _DEVICE and not torch.accelerator.is_available():
+            raise unittest.SkipTest("Need GPU available.")
         if (
-            device == "cuda"
+            device == _DEVICE
             and torch.cuda.is_available()
             and dtype == torch.bfloat16
             and torch.cuda.get_device_capability() < (8, 0)
@@ -317,9 +319,9 @@ def _test_smooth_linear_impl(self, x_shape, lin_shape, device):
     def test_smooth_linear_cpu(self):
         self._test_smooth_linear_impl((1, 5, 3), (3, 4), "cpu")
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_smooth_linear_cuda(self):
-        self._test_smooth_linear_impl((1, 32, 32), (32, 16), "cuda")
+        self._test_smooth_linear_impl((1, 32, 32), (32, 16), _DEVICE)
 
     def test_smooth_linear_edge_cases(self):
         orig_backend = torch.backends.quantized.engine
@@ -370,13 +372,13 @@ def test_swap(self):
         y = m_copy(x)
         assert torch.allclose(y_ref, y)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_weight_t_and_non_t_numerics_match(self):
         # verify that numerics match whether weight is stored
         # in transposed format (for cuBLAS) vs non-transposed format
         # (for torch.compile)
         dtype = torch.half
-        device = "cuda"
+        device = _DEVICE
         lin_ref = nn.Linear(32, 16, dtype=dtype, device=device)
         lin_eager_t = copy.deepcopy(lin_ref)
         lin_opt_t = copy.deepcopy(lin_eager_t)
@@ -522,12 +524,12 @@ def test_dynamic_quant_per_channel_numerics_cpu(self):
         for row in test_cases:
             self._test_dynamic_quant_per_channel_numerics_impl(*row)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @unittest.skip("AssertionError: Tensor-likes are not close!")
     def test_dynamic_quant_per_channel_numerics_cuda(self):
         test_cases = (
-            (-128, 127, torch.int8, torch.qint8, torch.float32, "cuda"),
-            (-128, 127, torch.int8, torch.qint8, torch.float16, "cuda"),
+            (-128, 127, torch.int8, torch.qint8, torch.float32, _DEVICE),
+            (-128, 127, torch.int8, torch.qint8, torch.float16, _DEVICE),
         )
         for row in test_cases:
             self._test_dynamic_quant_per_channel_numerics_impl(*row)
@@ -546,15 +548,10 @@ def test_quantize_per_token_cpu(self):
         for dtype in (torch.float32, torch.float16, torch.bfloat16):
             self._test_quantize_per_token_impl("cpu", dtype)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_quantize_per_token_cuda(self):
         for dtype in (torch.float32, torch.float16, torch.bfloat16):
-            self._test_quantize_per_token_impl("cuda", dtype)
-
-    @unittest.skipIf(not torch.xpu.is_available(), "XPU not available")
-    def test_quantize_per_token_xpu(self):
-        for dtype in (torch.float32, torch.float16, torch.bfloat16):
-            self._test_quantize_per_token_impl("xpu", dtype)
+            self._test_quantize_per_token_impl(_DEVICE, dtype)
 
     def _test_per_token_linear_impl(self, device, dtype):
         x = torch.randn(2, 16, 8, device=device, dtype=dtype)
@@ -574,20 +571,20 @@ def test_per_token_linear_cpu(self):
         for dtype in (torch.float32,):
             self._test_per_token_linear_impl("cpu", dtype)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @skip_if_rocm("ROCm enablement in progress")
     def test_per_token_linear_cuda(self):
         for dtype in (torch.float32, torch.float16, torch.bfloat16):
-            self._test_per_token_linear_impl("cuda", dtype)
+            self._test_per_token_linear_impl(_DEVICE, dtype)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test__int_mm(self):
         # TODO(future): figure out what here needs to move to PT core,
         # if it's not already tested there
         m, k, n = 32, 32, 16
-        x = torch.randint(-128, 127, (m, k), dtype=torch.int8, device="cuda")
-        w = torch.randint(-128, 127, (k, n), dtype=torch.int8, device="cuda")
+        x = torch.randint(-128, 127, (m, k), dtype=torch.int8, device=_DEVICE)
+        w = torch.randint(-128, 127, (k, n), dtype=torch.int8, device=_DEVICE)
 
         y_ref = torch.matmul(x.float(), w.float()).to(torch.int32)
         y_raw = safe_int_mm(x, w)
 
@@ -601,13 +598,13 @@ def test__int_mm(self):
         torch.testing.assert_close(y_ref, y_raw, atol=0, rtol=0)
         torch.testing.assert_close(y_ref, y_opt, atol=0, rtol=0)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test__int_mm_eager_and_torch_compile_numerics(self):
         def __int_mm_ref(x, w):
             x = x.cpu().to(torch.int32)
             w = w.cpu().to(torch.int32)
             y = torch.matmul(x, w)
-            return y.cuda()
+            return y.to(_DEVICE)
 
         shapes = (
             # minimal test shape
@@ -635,8 +632,8 @@ def wrap_torch_int_mm(x, w):
                 wrap_torch_int_mm, mode="max-autotune"
             )
 
-            x = torch.randint(-128, 127, x_shape, dtype=torch.int8, device="cuda")
-            w = torch.randint(-128, 127, w_shape, dtype=torch.int8, device="cuda")
+            x = torch.randint(-128, 127, x_shape, dtype=torch.int8, device=_DEVICE)
+            w = torch.randint(-128, 127, w_shape, dtype=torch.int8, device=_DEVICE)
             z_ref = __int_mm_ref(x, w)
             z_eager = wrap_torch_int_mm(x, w)
 
@@ -685,8 +682,8 @@ def _test_lin_weight_subclass_impl(
         test_dtype=torch.bfloat16,
         test_shape=(32, 64, 32),
     ):
-        if not "cuda" in test_device:
-            self.skipTest("test requires cuda")
+        if not torch.accelerator.is_available():
+            self.skipTest("test requires gpu")
         with torch.no_grad():
             m, k, n = test_shape
             x = torch.randn(m, k, device=test_device, dtype=test_dtype)
@@ -889,17 +886,19 @@ def test_int8_weight_only_quant_with_freeze(self, device, dtype):
         )
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_xpu("XPU enablement in progress")
     def test_int4_weight_only_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
         for test_shape in [(16, 1024, 16)] + (
-            [(1, 1024, 256)] if device == "cuda" else []
+            [(1, 1024, 256)] if device == _DEVICE else []
        ):
             self._test_lin_weight_subclass_api_impl(
                 _int4wo_api, device, 15, test_shape=test_shape, test_dtype=dtype
             )
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @skip_if_xpu("XPU enablement in progress")
     def test_int4_weight_only_hqq_quant_subclass_api(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -958,6 +957,7 @@ def test_gemlite_layout(self, device, dtype):
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @skip_if_rocm("ROCm enablement in progress")
+    @skip_if_xpu("XPU enablement in progress")
     def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
@@ -970,7 +970,7 @@ def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
         for inner_k_tiles in [4, 2]:
             layout_list.append(TensorCoreTiledLayout(inner_k_tiles=inner_k_tiles))
         for test_shape in [(256, 256, 16)] + (
-            [(256, 256, 8)] if device == "cuda" else []
+            [(256, 256, 8)] if device == _DEVICE else []
         ):
             for groupsize in [64, 32]:
                 for layout in layout_list:
@@ -1049,14 +1049,18 @@ def test_weight_only_groupwise_embedding_quant(self):
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @torch.no_grad()
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_weight_only_quant_force_mixed_mm(self, device, dtype):
         undo_recommended_configs()
-        if device != "cuda":
+        if device != _DEVICE:
             self.skipTest(
                 f"weight_only_quant_force_mixed_mm can't be constructed on {device}"
             )
-        if dtype == torch.bfloat16 and torch.cuda.get_device_capability() < (8, 0):
+        if (
+            torch.cuda.is_available()
+            and dtype == torch.bfloat16
+            and torch.cuda.get_device_capability() < (8, 0)
+        ):
             self.skipTest("test requires SM capability of at least (8, 0).")
         from torch._inductor import config
 
@@ -1081,14 +1085,18 @@ def test_weight_only_quant_force_mixed_mm(self, device, dtype):
         self.assertGreaterEqual(sqnr, 38)
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_weight_only_quant_use_mixed_mm(self, device, dtype):
         undo_recommended_configs()
-        if device != "cuda":
+        if device != _DEVICE:
             self.skipTest(
                 f"weight_only_quant_force_mixed_mm can't be constructed on {device}"
             )
-        if dtype == torch.bfloat16 and torch.cuda.get_device_capability() < (8, 0):
+        if (
+            torch.cuda.is_available()
+            and dtype == torch.bfloat16
+            and torch.cuda.get_device_capability() < (8, 0)
+        ):
             self.skipTest("test requires SM capability of at least (8, 0).")
         torch.manual_seed(0)
         from torch._inductor import config
@@ -1207,14 +1215,14 @@ def test_save_load_int4woqtensors(self, device, dtype):
 
 
 class TorchCompileUnitTest(unittest.TestCase):
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_fullgraph(self):
-        lin_fp16 = nn.Linear(32, 16, device="cuda", dtype=torch.float16)
+        lin_fp16 = nn.Linear(32, 16, device=_DEVICE, dtype=torch.float16)
         lin_smooth = SmoothFakeDynamicallyQuantizedLinear.from_float(
             lin_fp16, alpha=0.25
         )
 
-        x0 = torch.randn(17, 1, 32, device="cuda", dtype=torch.float16)
+        x0 = torch.randn(17, 1, 32, device=_DEVICE, dtype=torch.float16)
         # calibrate
         _ = lin_smooth(x0)
 
@@ -1255,7 +1263,7 @@ def test_shape_logger(self):
 
 class SmoothquantIntegrationTest(unittest.TestCase):
     @torch.no_grad()
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @unittest.skip("Seg fault?")
     def test_non_dynamically_quantizable_linear(self):
         if torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 0):
@@ -1265,10 +1273,10 @@ def test_non_dynamically_quantizable_linear(self):
                 torch.nn.modules.linear.NonDynamicallyQuantizableLinear(32, 32),
                 torch.nn.ReLU(),
             )
-            .to("cuda")
+            .to(_DEVICE)
             .to(torch.bfloat16)
         )
-        example_input = torch.randn(32, 32, device="cuda", dtype=torch.bfloat16)
+        example_input = torch.randn(32, 32, device=_DEVICE, dtype=torch.bfloat16)
         ref = model(example_input)
         swap_linear_with_smooth_fq_linear(model)
         model(ref)
@@ -1354,7 +1362,7 @@ class TestAutoQuant(unittest.TestCase):
     def test_autoquant_one_input(self, device, dtype, m, k, n):
         undo_recommended_configs()
         print("(m, k, n): ", (m, k, n))
-        if device != "cuda" or not torch.cuda.is_available():
+        if device != _DEVICE or not torch.accelerator.is_available():
             self.skipTest(f"autoquant currently does not support {device}")
         if torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 0):
             if dtype == torch.bfloat16:
@@ -1395,8 +1403,8 @@ def test_autoquant_one_input(self, device, dtype, m, k, n):
     def test_autoquant_compile(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()
 
-        is_supported_device = device == "cuda" and (
-            torch.cuda.is_available() or torch.version.hip is not None
+        is_supported_device = device == _DEVICE and (
+            torch.accelerator.is_available() or torch.version.hip is not None
         )
 
         if not is_supported_device:
@@ -1406,12 +1414,13 @@ def test_autoquant_compile(self, device, dtype, m1, m2, k, n):
         if (
             is_supported_device and torch.version.hip is None
         ):  # Only apply to CUDA, not ROCm
-            device_capability = torch.cuda.get_device_capability()
-            if device_capability < (8, 0):
-                if dtype == torch.bfloat16:
-                    self.skipTest("bfloat16 requires sm80+")
-                if m1 == 1 or m2 == 1:
-                    self.skipTest(f"Shape {(m1, m2, k, n)} requires sm80+")
+            if torch.cuda.is_available():
+                device_capability = torch.cuda.get_device_capability()
+                if device_capability < (8, 0):
+                    if dtype == torch.bfloat16:
+                        self.skipTest("bfloat16 requires sm80+")
+                    if m1 == 1 or m2 == 1:
+                        self.skipTest(f"Shape {(m1, m2, k, n)} requires sm80+")
 
         # TODO remove this once https://github.com/pytorch/pytorch/issues/155838 is resolved
         if m1 == 1 or m2 == 1:
@@ -1442,7 +1451,7 @@ def test_autoquant_compile(self, device, dtype, m1, m2, k, n):
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     def test_autoquant_mha(self, device, dtype):
-        if device != "cuda" or not torch.cuda.is_available():
+        if device != _DEVICE or not torch.accelerator.is_available():
             self.skipTest(f"autoquant currently does not support {device}")
 
         class MHAModel(torch.nn.Module):
@@ -1470,7 +1479,7 @@ def forward(self, x):
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     def test_autoquant_manual(self, device, dtype):
         undo_recommended_configs()
-        if device != "cuda" or not torch.cuda.is_available():
+        if device != _DEVICE or not torch.accelerator.is_available():
             self.skipTest(f"autoquant currently does not support {device}")
         if torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 0):
             if dtype == torch.bfloat16:
@@ -1519,7 +1528,7 @@ def test_autoquant_manual(self, device, dtype):
     )
     def test_autoquant_kwargs(self, device, dtype, m1, m2, k, n):
         undo_recommended_configs()
-        if device != "cuda" or not torch.cuda.is_available():
+        if device != _DEVICE or not torch.accelerator.is_available():
             self.skipTest(f"autoquant currently does not support {device}")
         if torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 0):
             if dtype == torch.bfloat16:
@@ -1583,7 +1592,7 @@ def forward(self, x, y):
     )
     def test_autoquant_double_access(self, device, dtype, m, k, n):
         undo_recommended_configs()
-        if device != "cuda" or not torch.cuda.is_available():
+        if device != _DEVICE or not torch.accelerator.is_available():
             self.skipTest(f"autoquant currently does not support {device}")
         if torch.cuda.is_available() and torch.cuda.get_device_capability() < (8, 0):
             if dtype == torch.bfloat16:
@@ -1610,8 +1619,8 @@ def forward(self, x):
         assert not isinstance(model.lin1.weight.weight, AutoQuantizableLinearWeight)
         model(x_in)
 
-    @parameterized.expand(list(itertools.product(["cuda"], COMMON_DTYPES)))
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @parameterized.expand(list(itertools.product([_DEVICE], COMMON_DTYPES)))
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_autoquant_min_sqnr(self, device, dtype):
         m, k, n = 128, 128, 128
         example_input = torch.randn(m, k, device=device, dtype=dtype)
@@ -1662,7 +1671,7 @@ def test_autoquant_hp_float(self):
             self.assertGreater(compute_error(out, ref), 40)
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     @unittest.skipIf(not has_gemlite, "gemlite not available")
     def test_autoquant_int4wo(self, device, dtype):
         if device == "cpu":
@@ -1697,7 +1706,10 @@ def test_autoquant_int4wo(self, device, dtype):
         self.assertGreater(compute_error(ref, out), 20)
 
     @parameterized.expand(COMMON_DEVICE_DTYPE)
-    @unittest.skipIf(not is_sm_at_least_90(), "Need cuda arch greater than SM90")
+    @unittest.skipIf(
+        torch.cuda.is_available() and not is_sm_at_least_90(),
+        "Need cuda arch greater than SM90",
+    )
     @unittest.skipIf(
         True, "Skipping for now, do to lowering bug in inductor"
     )  # TODO unblock when fixed
@@ -1737,7 +1749,7 @@ def test_autoquant_float8(self, device, dtype):
         self.assertGreater(compute_error(ref, out), 20)
 
-@unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+@unittest.skipIf(not torch.accelerator.is_available(), "requires gpu")
 @unittest.skip(
     "AOTI tests are failing right now, repro by commenting out the skip and run:"
     "python test/integration/test_integration.py -k TestAOTI.test_aoti_06"
 )
@@ -1794,7 +1806,7 @@ def forward(self, x):
     )
 
 
-@unittest.skipIf(not torch.cuda.is_available(), "requires cuda")
+@unittest.skipIf(not torch.accelerator.is_available(), "requires gpu")
 class TestExport(unittest.TestCase):
     @parameterized.expand(
         list(
@@ -1872,8 +1884,8 @@ def __init__(self):
             def forward(self, x):
                 return self.linear(x)
 
-        model = SimpleNetwork().eval().cuda()
-        inp = torch.randn(2, 32).cuda()
+        model = SimpleNetwork().eval().to(_DEVICE)
+        inp = torch.randn(2, 32).to(_DEVICE)
         config = Float8DynamicActivationFloat8WeightConfig()
         quantize_(model, config)
 
@@ -1893,7 +1905,7 @@ class TestUtils(unittest.TestCase):
     def test_get_model_size_aqt(self, api, test_device, test_dtype):
         if test_dtype != torch.bfloat16:
             self.skipTest(f"{api} in {test_dtype} is not supported yet")
-        if test_device != "cuda" or not torch.cuda.is_available():
+        if test_device != _DEVICE or not torch.accelerator.is_available():
             self.skipTest(f"{api} currently does not support {test_device}")
         k, n = 1024, 1024
         model = (
@@ -1940,9 +1952,9 @@ def run_benchmark_model(self, device):
         num_runs = 1
         return benchmark_model(m_bf16, num_runs, example_inputs)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+    @unittest.skipIf(not torch.accelerator.is_available(), "Need GPU available")
     def test_benchmark_model_cuda(self):
-        assert self.run_benchmark_model("cuda") is not None
+        assert self.run_benchmark_model(_DEVICE) is not None
 
     def test_benchmark_model_cpu(self):
         assert self.run_benchmark_model("cpu") is not None
diff --git a/torchao/utils.py b/torchao/utils.py
index 35471a1864..28aed7bb4f 100644
--- a/torchao/utils.py
+++ b/torchao/utils.py
@@ -113,6 +113,21 @@ def benchmark_model(model, num_runs, args=(), kwargs=None, device_type=None):
         average_time_per_run = (end_time - start_time) / num_runs
         return average_time_per_run
 
+    elif device_type == "xpu":
+        torch.xpu.synchronize()
+        start_event = torch.xpu.Event(enable_timing=True)
+        end_event = torch.xpu.Event(enable_timing=True)
+        start_event.record()
+
+        # benchmark
+        for _ in range(num_runs):
+            with torch.autograd.profiler.record_function("timed region"):
+                model(*args, **kwargs)
+
+        end_event.record()
+        torch.xpu.synchronize()
+        return start_event.elapsed_time(end_event) / num_runs
+
 
 def profiler_runner(path, fn, *args, **kwargs):
     with torch.profiler.profile(