update
@@ -128,9 +128,9 @@ class QuantizationTesterMixin:
         model_quantized = self._create_quantized_model(config_kwargs)
         num_params_quantized = model_quantized.num_parameters()
 
-        assert (
-            num_params == num_params_quantized
-        ), f"Parameter count mismatch: unquantized={num_params}, quantized={num_params_quantized}"
+        assert num_params == num_params_quantized, (
+            f"Parameter count mismatch: unquantized={num_params}, quantized={num_params_quantized}"
+        )
 
     def _test_quantization_memory_footprint(self, config_kwargs, expected_memory_reduction=1.2):
         model = self._load_unquantized_model()
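Note: the two assert layouts in this hunk (and throughout the diff) are semantically identical; only the message, not the condition, is now wrapped in parentheses so long messages can span lines. A minimal standalone sketch with illustrative values, including the one layout that must be avoided:

num_params, num_params_quantized = 100, 100  # illustrative values

# Old layout: condition in parentheses, message trailing the closing paren.
assert (
    num_params == num_params_quantized
), f"Parameter count mismatch: unquantized={num_params}, quantized={num_params_quantized}"

# New layout adopted by this diff: bare condition, parenthesized message.
assert num_params == num_params_quantized, (
    f"Parameter count mismatch: unquantized={num_params}, quantized={num_params_quantized}"
)

# Pitfall (not present here): parenthesizing condition AND message creates a
# non-empty tuple, which is always truthy, so the assert can never fail.
# assert (num_params == num_params_quantized, "message")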
@@ -140,9 +140,9 @@ class QuantizationTesterMixin:
         mem_quantized = model_quantized.get_memory_footprint()
 
         ratio = mem / mem_quantized
-        assert (
-            ratio >= expected_memory_reduction
-        ), f"Memory ratio {ratio:.2f} is less than expected ({expected_memory_reduction}x). unquantized={mem}, quantized={mem_quantized}"
+        assert ratio >= expected_memory_reduction, (
+            f"Memory ratio {ratio:.2f} is less than expected ({expected_memory_reduction}x). unquantized={mem}, quantized={mem_quantized}"
+        )
 
     def _test_quantization_inference(self, config_kwargs):
         model_quantized = self._create_quantized_model(config_kwargs)
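Note: the footprint check is plain arithmetic over get_memory_footprint() results; a sketch with made-up byte counts, assuming roughly fp16 weights compared against 4-bit weights:

# Illustrative byte counts only; real values come from model.get_memory_footprint().
mem = 240_000_000            # unquantized model, ~120M params at 2 bytes each (fp16)
mem_quantized = 70_000_000   # 4-bit weights plus quantization metadata/overhead

ratio = mem / mem_quantized                  # ~3.4x here
expected_memory_reduction = 1.2              # conservative default used by the mixin
assert ratio >= expected_memory_reduction, (
    f"Memory ratio {ratio:.2f} is less than expected ({expected_memory_reduction}x)"
)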
@@ -237,12 +237,12 @@ class QuantizationTesterMixin:
                 self._verify_if_layer_quantized(name, module, config_kwargs)
                 num_quantized_layers += 1
 
-        assert (
-            num_quantized_layers > 0
-        ), f"No quantized layers found in model (expected {expected_quantized_layers} linear layers, {num_fp32_modules} kept in FP32)"
-        assert (
-            num_quantized_layers == expected_quantized_layers
-        ), f"Quantized layer count mismatch: expected {expected_quantized_layers}, got {num_quantized_layers} (total linear layers: {num_linear_layers}, FP32 modules: {num_fp32_modules})"
+        assert num_quantized_layers > 0, (
+            f"No quantized layers found in model (expected {expected_quantized_layers} linear layers, {num_fp32_modules} kept in FP32)"
+        )
+        assert num_quantized_layers == expected_quantized_layers, (
+            f"Quantized layer count mismatch: expected {expected_quantized_layers}, got {num_quantized_layers} (total linear layers: {num_linear_layers}, FP32 modules: {num_fp32_modules})"
+        )
 
     def _test_quantization_modules_to_not_convert(self, config_kwargs, modules_to_not_convert):
         """
@@ -266,9 +266,9 @@ class QuantizationTesterMixin:
             if any(excluded in name for excluded in modules_to_not_convert):
                 found_excluded = True
                 # This module should NOT be quantized
-                assert not self._is_module_quantized(
-                    module
-                ), f"Module {name} should not be quantized but was found to be quantized"
+                assert not self._is_module_quantized(module), (
+                    f"Module {name} should not be quantized but was found to be quantized"
+                )
 
         assert found_excluded, f"No linear layers found in excluded modules: {modules_to_not_convert}"
 
@@ -290,9 +290,9 @@ class QuantizationTesterMixin:
         mem_with_exclusion = model_with_exclusion.get_memory_footprint()
         mem_fully_quantized = model_fully_quantized.get_memory_footprint()
 
-        assert (
-            mem_with_exclusion > mem_fully_quantized
-        ), f"Model with exclusions should be larger. With exclusion: {mem_with_exclusion}, fully quantized: {mem_fully_quantized}"
+        assert mem_with_exclusion > mem_fully_quantized, (
+            f"Model with exclusions should be larger. With exclusion: {mem_with_exclusion}, fully quantized: {mem_fully_quantized}"
+        )
 
     def _test_quantization_device_map(self, config_kwargs):
         """
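Note: how the exclusion list reaches the backend is not shown in this diff; with bitsandbytes it is usually carried by the quantization config. A hedged example using diffusers' BitsAndBytesConfig, where the model class, checkpoint, and excluded module name are placeholders:

import torch
from diffusers import BitsAndBytesConfig, SD3Transformer2DModel  # placeholder model class

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_skip_modules=["proj_out"],  # modules whose names match stay unquantized
)
model = SD3Transformer2DModel.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",  # placeholder checkpoint
    subfolder="transformer",
    quantization_config=quant_config,
)
# Linear layers whose names contain "proj_out" keep their original dtype; the rest are
# converted, so this model's footprint sits between fully quantized and unquantized,
# which is exactly what the hunk above asserts.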
@@ -399,40 +399,40 @@ class BitsAndBytesTesterMixin(QuantizationTesterMixin):
 
     def _verify_if_layer_quantized(self, name, module, config_kwargs):
         expected_weight_class = bnb.nn.Params4bit if config_kwargs.get("load_in_4bit") else bnb.nn.Int8Params
-        assert (
-            module.weight.__class__ == expected_weight_class
-        ), f"Layer {name} has weight type {module.weight.__class__}, expected {expected_weight_class}"
+        assert module.weight.__class__ == expected_weight_class, (
+            f"Layer {name} has weight type {module.weight.__class__}, expected {expected_weight_class}"
+        )
 
-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantization_num_parameters(self, config_name):
         self._test_quantization_num_parameters(self.BNB_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantization_memory_footprint(self, config_name):
         expected = self.BNB_EXPECTED_MEMORY_REDUCTIONS.get(config_name, 1.2)
         self._test_quantization_memory_footprint(self.BNB_CONFIGS[config_name], expected_memory_reduction=expected)
 
-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantization_inference(self, config_name):
         self._test_quantization_inference(self.BNB_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", ["4bit_nf4"])
+    @pytest.mark.parametrize("config_name", ["4bit_nf4"], ids=["4bit_nf4"])
     def test_bnb_quantization_dtype_assignment(self, config_name):
         self._test_quantization_dtype_assignment(self.BNB_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", ["4bit_nf4"])
+    @pytest.mark.parametrize("config_name", ["4bit_nf4"], ids=["4bit_nf4"])
     def test_bnb_quantization_lora_inference(self, config_name):
         self._test_quantization_lora_inference(self.BNB_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", ["4bit_nf4"])
+    @pytest.mark.parametrize("config_name", ["4bit_nf4"], ids=["4bit_nf4"])
     def test_bnb_quantization_serialization(self, config_name):
         self._test_quantization_serialization(self.BNB_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantized_layers(self, config_name):
         self._test_quantized_layers(self.BNB_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantization_config_serialization(self, config_name):
         model = self._create_quantized_model(self.BNB_CONFIGS[config_name])
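Note: the only change to these parametrizations is the explicit ids= argument. With plain string parameters pytest already derives the same IDs, so this mainly documents intent and keeps the IDs stable; a small illustration with a stand-in config dict:

import pytest

CONFIGS = {"4bit_nf4": {"load_in_4bit": True}, "8bit": {"load_in_8bit": True}}  # stand-in for BNB_CONFIGS

@pytest.mark.parametrize("config_name", list(CONFIGS.keys()), ids=list(CONFIGS.keys()))
def test_example(config_name):
    assert config_name in CONFIGS

# Collected as:
#   test_example[4bit_nf4]
#   test_example[8bit]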
@@ -469,13 +469,13 @@ class BitsAndBytesTesterMixin(QuantizationTesterMixin):
         for name, module in model.named_modules():
             if isinstance(module, torch.nn.Linear):
                 if any(fp32_name in name for fp32_name in model._keep_in_fp32_modules):
-                    assert (
-                        module.weight.dtype == torch.float32
-                    ), f"Module {name} should be FP32 but is {module.weight.dtype}"
+                    assert module.weight.dtype == torch.float32, (
+                        f"Module {name} should be FP32 but is {module.weight.dtype}"
+                    )
                 else:
-                    assert (
-                        module.weight.dtype == torch.uint8
-                    ), f"Module {name} should be uint8 but is {module.weight.dtype}"
+                    assert module.weight.dtype == torch.uint8, (
+                        f"Module {name} should be uint8 but is {module.weight.dtype}"
+                    )
 
         with torch.no_grad():
             inputs = self.get_dummy_inputs()
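Note: the uint8 expectation in this hunk comes from how bitsandbytes stores 4-bit weights: two 4-bit values are packed per byte, so the quantized weight tensor reports dtype torch.uint8. A tiny illustration of the packing arithmetic:

import torch

logical_weights = 16  # number of 4-bit values (illustrative)
packed = torch.zeros(logical_weights // 2, dtype=torch.uint8)  # two nibbles per byte

assert packed.dtype == torch.uint8
assert packed.numel() == logical_weights // 2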
@@ -492,9 +492,10 @@ class BitsAndBytesTesterMixin(QuantizationTesterMixin):
 
         self._test_quantization_modules_to_not_convert(self.BNB_CONFIGS["4bit_nf4"], modules_to_exclude)
 
-    def test_bnb_device_map(self):
+    @pytest.mark.parametrize("config_name", ["4bit_nf4", "8bit"], ids=["4bit_nf4", "8bit"])
+    def test_bnb_device_map(self, config_name):
         """Test that device_map='auto' works correctly with quantization."""
-        self._test_quantization_device_map(self.BNB_CONFIGS["4bit_nf4"])
+        self._test_quantization_device_map(self.BNB_CONFIGS[config_name])
 
     def test_bnb_dequantize(self):
         """Test that dequantize() works correctly."""
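Note: the device_map test now runs for both the 4-bit and 8-bit entries of BNB_CONFIGS. The diff only reveals that those entries carry load_in_4bit/load_in_8bit-style kwargs; a hypothetical sketch of what the dict might contain:

import torch

# Hypothetical contents; the real BNB_CONFIGS lives elsewhere in the test module and may differ.
BNB_CONFIGS = {
    "4bit_nf4": {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    },
    "8bit": {"load_in_8bit": True},
}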
@@ -548,30 +549,36 @@ class QuantoTesterMixin(QuantizationTesterMixin):
     def _verify_if_layer_quantized(self, name, module, config_kwargs):
         assert isinstance(module, QLinear), f"Layer {name} is not QLinear, got {type(module)}"
 
-    @pytest.mark.parametrize("weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()))
+    @pytest.mark.parametrize(
+        "weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()), ids=list(QUANTO_WEIGHT_TYPES.keys())
+    )
     def test_quanto_quantization_num_parameters(self, weight_type_name):
         self._test_quantization_num_parameters(self.QUANTO_WEIGHT_TYPES[weight_type_name])
 
-    @pytest.mark.parametrize("weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()))
+    @pytest.mark.parametrize(
+        "weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()), ids=list(QUANTO_WEIGHT_TYPES.keys())
+    )
     def test_quanto_quantization_memory_footprint(self, weight_type_name):
         expected = self.QUANTO_EXPECTED_MEMORY_REDUCTIONS.get(weight_type_name, 1.2)
         self._test_quantization_memory_footprint(
             self.QUANTO_WEIGHT_TYPES[weight_type_name], expected_memory_reduction=expected
         )
 
-    @pytest.mark.parametrize("weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()))
+    @pytest.mark.parametrize(
+        "weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()), ids=list(QUANTO_WEIGHT_TYPES.keys())
+    )
     def test_quanto_quantization_inference(self, weight_type_name):
         self._test_quantization_inference(self.QUANTO_WEIGHT_TYPES[weight_type_name])
 
-    @pytest.mark.parametrize("weight_type_name", ["int8"])
+    @pytest.mark.parametrize("weight_type_name", ["int8"], ids=["int8"])
     def test_quanto_quantized_layers(self, weight_type_name):
         self._test_quantized_layers(self.QUANTO_WEIGHT_TYPES[weight_type_name])
 
-    @pytest.mark.parametrize("weight_type_name", ["int8"])
+    @pytest.mark.parametrize("weight_type_name", ["int8"], ids=["int8"])
    def test_quanto_quantization_lora_inference(self, weight_type_name):
         self._test_quantization_lora_inference(self.QUANTO_WEIGHT_TYPES[weight_type_name])
 
-    @pytest.mark.parametrize("weight_type_name", ["int8"])
+    @pytest.mark.parametrize("weight_type_name", ["int8"], ids=["int8"])
     def test_quanto_quantization_serialization(self, weight_type_name):
         self._test_quantization_serialization(self.QUANTO_WEIGHT_TYPES[weight_type_name])
 
@@ -636,30 +643,30 @@ class TorchAoTesterMixin(QuantizationTesterMixin):
     def _verify_if_layer_quantized(self, name, module, config_kwargs):
         assert isinstance(module, torch.nn.Linear), f"Layer {name} is not Linear, got {type(module)}"
 
-    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()))
+    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()), ids=list(TORCHAO_QUANT_TYPES.keys()))
     def test_torchao_quantization_num_parameters(self, quant_type):
         self._test_quantization_num_parameters(self.TORCHAO_QUANT_TYPES[quant_type])
 
-    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()))
+    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()), ids=list(TORCHAO_QUANT_TYPES.keys()))
     def test_torchao_quantization_memory_footprint(self, quant_type):
         expected = self.TORCHAO_EXPECTED_MEMORY_REDUCTIONS.get(quant_type, 1.2)
         self._test_quantization_memory_footprint(
             self.TORCHAO_QUANT_TYPES[quant_type], expected_memory_reduction=expected
         )
 
-    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()))
+    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()), ids=list(TORCHAO_QUANT_TYPES.keys()))
     def test_torchao_quantization_inference(self, quant_type):
         self._test_quantization_inference(self.TORCHAO_QUANT_TYPES[quant_type])
 
-    @pytest.mark.parametrize("quant_type", ["int8wo"])
+    @pytest.mark.parametrize("quant_type", ["int8wo"], ids=["int8wo"])
     def test_torchao_quantized_layers(self, quant_type):
         self._test_quantized_layers(self.TORCHAO_QUANT_TYPES[quant_type])
 
-    @pytest.mark.parametrize("quant_type", ["int8wo"])
+    @pytest.mark.parametrize("quant_type", ["int8wo"], ids=["int8wo"])
     def test_torchao_quantization_lora_inference(self, quant_type):
         self._test_quantization_lora_inference(self.TORCHAO_QUANT_TYPES[quant_type])
 
-    @pytest.mark.parametrize("quant_type", ["int8wo"])
+    @pytest.mark.parametrize("quant_type", ["int8wo"], ids=["int8wo"])
     def test_torchao_quantization_serialization(self, quant_type):
         self._test_quantization_serialization(self.TORCHAO_QUANT_TYPES[quant_type])
 
@@ -801,34 +808,34 @@ class ModelOptTesterMixin(QuantizationTesterMixin):
     def _verify_if_layer_quantized(self, name, module, config_kwargs):
         assert mtq.utils.is_quantized(module), f"Layer {name} does not have weight_quantizer attribute (not quantized)"
 
-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantization_num_parameters(self, config_name):
         self._test_quantization_num_parameters(self.MODELOPT_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", list(MODELOPT_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(MODELOPT_CONFIGS.keys()), ids=list(MODELOPT_CONFIGS.keys()))
     def test_modelopt_quantization_memory_footprint(self, config_name):
         expected = self.MODELOPT_EXPECTED_MEMORY_REDUCTIONS.get(config_name, 1.2)
         self._test_quantization_memory_footprint(
             self.MODELOPT_CONFIGS[config_name], expected_memory_reduction=expected
         )
 
-    @pytest.mark.parametrize("config_name", list(MODELOPT_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(MODELOPT_CONFIGS.keys()), ids=list(MODELOPT_CONFIGS.keys()))
     def test_modelopt_quantization_inference(self, config_name):
         self._test_quantization_inference(self.MODELOPT_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantization_dtype_assignment(self, config_name):
         self._test_quantization_dtype_assignment(self.MODELOPT_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantization_lora_inference(self, config_name):
         self._test_quantization_lora_inference(self.MODELOPT_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantization_serialization(self, config_name):
         self._test_quantization_serialization(self.MODELOPT_CONFIGS[config_name])
 
-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantized_layers(self, config_name):
         self._test_quantized_layers(self.MODELOPT_CONFIGS[config_name])
 
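Note: all four mixins lean on the same hooks that appear throughout this diff (_load_unquantized_model, _create_quantized_model, get_dummy_inputs). A hypothetical sketch of a concrete test class wiring them up for bitsandbytes; the class name, checkpoint, dummy-input shapes, and the choice of which hooks a concrete class overrides are assumptions, not part of this diff:

import torch
from diffusers import BitsAndBytesConfig, FluxTransformer2DModel  # placeholder model class

class TestFluxBnBQuantization(BitsAndBytesTesterMixin):
    CKPT = "black-forest-labs/FLUX.1-dev"  # placeholder checkpoint

    def _load_unquantized_model(self):
        return FluxTransformer2DModel.from_pretrained(self.CKPT, subfolder="transformer")

    def _create_quantized_model(self, config_kwargs):
        quant_config = BitsAndBytesConfig(**config_kwargs)
        return FluxTransformer2DModel.from_pretrained(
            self.CKPT, subfolder="transformer", quantization_config=quant_config
        )

    def get_dummy_inputs(self):
        # Shapes are illustrative; a real test would match the model's expected inputs.
        return {"hidden_states": torch.randn(1, 16, 64)}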