mirror of https://github.com/huggingface/diffusers.git synced 2026-01-27 17:22:53 +03:00
commit dcd6026d17
parent eae7543712
Author: DN6
Date:   2025-12-15 16:12:15 +05:30


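The hunks below apply two mechanical changes throughout the quantization test mixins: long assert messages are moved into parentheses after the condition (the layout that newer ruff/Black formatter releases appear to produce when an assert message does not fit on one line), and `pytest.mark.parametrize` calls gain an explicit `ids=` argument so test node IDs use the config-name keys. A minimal sketch of the assert layout, using placeholder variables rather than code from the diff:

observed = expected = 4  # placeholder values so the snippet runs

# Old layout: the condition is parenthesized and split across lines.
assert (
    observed == expected
), f"Value mismatch: observed={observed}, expected={expected}"

# New layout: the condition stays on the assert line and only the message is wrapped,
# which keeps the condition readable and the failure message intact.
assert observed == expected, (
    f"Value mismatch: observed={observed}, expected={expected}"
)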
@@ -128,9 +128,9 @@ class QuantizationTesterMixin:
         model_quantized = self._create_quantized_model(config_kwargs)
         num_params_quantized = model_quantized.num_parameters()

-        assert (
-            num_params == num_params_quantized
-        ), f"Parameter count mismatch: unquantized={num_params}, quantized={num_params_quantized}"
+        assert num_params == num_params_quantized, (
+            f"Parameter count mismatch: unquantized={num_params}, quantized={num_params_quantized}"
+        )

     def _test_quantization_memory_footprint(self, config_kwargs, expected_memory_reduction=1.2):
         model = self._load_unquantized_model()
@@ -140,9 +140,9 @@ class QuantizationTesterMixin:
         mem_quantized = model_quantized.get_memory_footprint()
         ratio = mem / mem_quantized

-        assert (
-            ratio >= expected_memory_reduction
-        ), f"Memory ratio {ratio:.2f} is less than expected ({expected_memory_reduction}x). unquantized={mem}, quantized={mem_quantized}"
+        assert ratio >= expected_memory_reduction, (
+            f"Memory ratio {ratio:.2f} is less than expected ({expected_memory_reduction}x). unquantized={mem}, quantized={mem_quantized}"
+        )

     def _test_quantization_inference(self, config_kwargs):
         model_quantized = self._create_quantized_model(config_kwargs)
@@ -237,12 +237,12 @@ class QuantizationTesterMixin:
                 self._verify_if_layer_quantized(name, module, config_kwargs)
                 num_quantized_layers += 1

-        assert (
-            num_quantized_layers > 0
-        ), f"No quantized layers found in model (expected {expected_quantized_layers} linear layers, {num_fp32_modules} kept in FP32)"
-        assert (
-            num_quantized_layers == expected_quantized_layers
-        ), f"Quantized layer count mismatch: expected {expected_quantized_layers}, got {num_quantized_layers} (total linear layers: {num_linear_layers}, FP32 modules: {num_fp32_modules})"
+        assert num_quantized_layers > 0, (
+            f"No quantized layers found in model (expected {expected_quantized_layers} linear layers, {num_fp32_modules} kept in FP32)"
+        )
+        assert num_quantized_layers == expected_quantized_layers, (
+            f"Quantized layer count mismatch: expected {expected_quantized_layers}, got {num_quantized_layers} (total linear layers: {num_linear_layers}, FP32 modules: {num_fp32_modules})"
+        )

     def _test_quantization_modules_to_not_convert(self, config_kwargs, modules_to_not_convert):
         """
@@ -266,9 +266,9 @@ class QuantizationTesterMixin:
                 if any(excluded in name for excluded in modules_to_not_convert):
                     found_excluded = True
                     # This module should NOT be quantized
-                    assert not self._is_module_quantized(
-                        module
-                    ), f"Module {name} should not be quantized but was found to be quantized"
+                    assert not self._is_module_quantized(module), (
+                        f"Module {name} should not be quantized but was found to be quantized"
+                    )

         assert found_excluded, f"No linear layers found in excluded modules: {modules_to_not_convert}"
@@ -290,9 +290,9 @@ class QuantizationTesterMixin:
         mem_with_exclusion = model_with_exclusion.get_memory_footprint()
         mem_fully_quantized = model_fully_quantized.get_memory_footprint()

-        assert (
-            mem_with_exclusion > mem_fully_quantized
-        ), f"Model with exclusions should be larger. With exclusion: {mem_with_exclusion}, fully quantized: {mem_fully_quantized}"
+        assert mem_with_exclusion > mem_fully_quantized, (
+            f"Model with exclusions should be larger. With exclusion: {mem_with_exclusion}, fully quantized: {mem_fully_quantized}"
+        )

     def _test_quantization_device_map(self, config_kwargs):
         """
@@ -399,40 +399,40 @@ class BitsAndBytesTesterMixin(QuantizationTesterMixin):
     def _verify_if_layer_quantized(self, name, module, config_kwargs):
         expected_weight_class = bnb.nn.Params4bit if config_kwargs.get("load_in_4bit") else bnb.nn.Int8Params
-        assert (
-            module.weight.__class__ == expected_weight_class
-        ), f"Layer {name} has weight type {module.weight.__class__}, expected {expected_weight_class}"
+        assert module.weight.__class__ == expected_weight_class, (
+            f"Layer {name} has weight type {module.weight.__class__}, expected {expected_weight_class}"
+        )

-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantization_num_parameters(self, config_name):
         self._test_quantization_num_parameters(self.BNB_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantization_memory_footprint(self, config_name):
         expected = self.BNB_EXPECTED_MEMORY_REDUCTIONS.get(config_name, 1.2)
         self._test_quantization_memory_footprint(self.BNB_CONFIGS[config_name], expected_memory_reduction=expected)

-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantization_inference(self, config_name):
         self._test_quantization_inference(self.BNB_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", ["4bit_nf4"])
+    @pytest.mark.parametrize("config_name", ["4bit_nf4"], ids=["4bit_nf4"])
     def test_bnb_quantization_dtype_assignment(self, config_name):
         self._test_quantization_dtype_assignment(self.BNB_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", ["4bit_nf4"])
+    @pytest.mark.parametrize("config_name", ["4bit_nf4"], ids=["4bit_nf4"])
     def test_bnb_quantization_lora_inference(self, config_name):
         self._test_quantization_lora_inference(self.BNB_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", ["4bit_nf4"])
+    @pytest.mark.parametrize("config_name", ["4bit_nf4"], ids=["4bit_nf4"])
     def test_bnb_quantization_serialization(self, config_name):
         self._test_quantization_serialization(self.BNB_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantized_layers(self, config_name):
         self._test_quantized_layers(self.BNB_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(BNB_CONFIGS.keys()), ids=list(BNB_CONFIGS.keys()))
     def test_bnb_quantization_config_serialization(self, config_name):
         model = self._create_quantized_model(self.BNB_CONFIGS[config_name])
@@ -469,13 +469,13 @@ class BitsAndBytesTesterMixin(QuantizationTesterMixin):
         for name, module in model.named_modules():
             if isinstance(module, torch.nn.Linear):
                 if any(fp32_name in name for fp32_name in model._keep_in_fp32_modules):
-                    assert (
-                        module.weight.dtype == torch.float32
-                    ), f"Module {name} should be FP32 but is {module.weight.dtype}"
+                    assert module.weight.dtype == torch.float32, (
+                        f"Module {name} should be FP32 but is {module.weight.dtype}"
+                    )
                 else:
-                    assert (
-                        module.weight.dtype == torch.uint8
-                    ), f"Module {name} should be uint8 but is {module.weight.dtype}"
+                    assert module.weight.dtype == torch.uint8, (
+                        f"Module {name} should be uint8 but is {module.weight.dtype}"
+                    )

         with torch.no_grad():
             inputs = self.get_dummy_inputs()
@@ -492,9 +492,10 @@ class BitsAndBytesTesterMixin(QuantizationTesterMixin):
         self._test_quantization_modules_to_not_convert(self.BNB_CONFIGS["4bit_nf4"], modules_to_exclude)

-    def test_bnb_device_map(self):
+    @pytest.mark.parametrize("config_name", ["4bit_nf4", "8bit"], ids=["4bit_nf4", "8bit"])
+    def test_bnb_device_map(self, config_name):
         """Test that device_map='auto' works correctly with quantization."""
-        self._test_quantization_device_map(self.BNB_CONFIGS["4bit_nf4"])
+        self._test_quantization_device_map(self.BNB_CONFIGS[config_name])

     def test_bnb_dequantize(self):
         """Test that dequantize() works correctly."""
@@ -548,30 +549,36 @@ class QuantoTesterMixin(QuantizationTesterMixin):
     def _verify_if_layer_quantized(self, name, module, config_kwargs):
         assert isinstance(module, QLinear), f"Layer {name} is not QLinear, got {type(module)}"

-    @pytest.mark.parametrize("weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()))
+    @pytest.mark.parametrize(
+        "weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()), ids=list(QUANTO_WEIGHT_TYPES.keys())
+    )
     def test_quanto_quantization_num_parameters(self, weight_type_name):
         self._test_quantization_num_parameters(self.QUANTO_WEIGHT_TYPES[weight_type_name])

-    @pytest.mark.parametrize("weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()))
+    @pytest.mark.parametrize(
+        "weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()), ids=list(QUANTO_WEIGHT_TYPES.keys())
+    )
     def test_quanto_quantization_memory_footprint(self, weight_type_name):
         expected = self.QUANTO_EXPECTED_MEMORY_REDUCTIONS.get(weight_type_name, 1.2)
         self._test_quantization_memory_footprint(
             self.QUANTO_WEIGHT_TYPES[weight_type_name], expected_memory_reduction=expected
         )

-    @pytest.mark.parametrize("weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()))
+    @pytest.mark.parametrize(
+        "weight_type_name", list(QUANTO_WEIGHT_TYPES.keys()), ids=list(QUANTO_WEIGHT_TYPES.keys())
+    )
     def test_quanto_quantization_inference(self, weight_type_name):
         self._test_quantization_inference(self.QUANTO_WEIGHT_TYPES[weight_type_name])

-    @pytest.mark.parametrize("weight_type_name", ["int8"])
+    @pytest.mark.parametrize("weight_type_name", ["int8"], ids=["int8"])
     def test_quanto_quantized_layers(self, weight_type_name):
         self._test_quantized_layers(self.QUANTO_WEIGHT_TYPES[weight_type_name])

-    @pytest.mark.parametrize("weight_type_name", ["int8"])
+    @pytest.mark.parametrize("weight_type_name", ["int8"], ids=["int8"])
     def test_quanto_quantization_lora_inference(self, weight_type_name):
         self._test_quantization_lora_inference(self.QUANTO_WEIGHT_TYPES[weight_type_name])

-    @pytest.mark.parametrize("weight_type_name", ["int8"])
+    @pytest.mark.parametrize("weight_type_name", ["int8"], ids=["int8"])
     def test_quanto_quantization_serialization(self, weight_type_name):
         self._test_quantization_serialization(self.QUANTO_WEIGHT_TYPES[weight_type_name])
@@ -636,30 +643,30 @@ class TorchAoTesterMixin(QuantizationTesterMixin):
     def _verify_if_layer_quantized(self, name, module, config_kwargs):
         assert isinstance(module, torch.nn.Linear), f"Layer {name} is not Linear, got {type(module)}"

-    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()))
+    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()), ids=list(TORCHAO_QUANT_TYPES.keys()))
     def test_torchao_quantization_num_parameters(self, quant_type):
         self._test_quantization_num_parameters(self.TORCHAO_QUANT_TYPES[quant_type])

-    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()))
+    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()), ids=list(TORCHAO_QUANT_TYPES.keys()))
     def test_torchao_quantization_memory_footprint(self, quant_type):
         expected = self.TORCHAO_EXPECTED_MEMORY_REDUCTIONS.get(quant_type, 1.2)
         self._test_quantization_memory_footprint(
             self.TORCHAO_QUANT_TYPES[quant_type], expected_memory_reduction=expected
         )

-    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()))
+    @pytest.mark.parametrize("quant_type", list(TORCHAO_QUANT_TYPES.keys()), ids=list(TORCHAO_QUANT_TYPES.keys()))
     def test_torchao_quantization_inference(self, quant_type):
         self._test_quantization_inference(self.TORCHAO_QUANT_TYPES[quant_type])

-    @pytest.mark.parametrize("quant_type", ["int8wo"])
+    @pytest.mark.parametrize("quant_type", ["int8wo"], ids=["int8wo"])
     def test_torchao_quantized_layers(self, quant_type):
         self._test_quantized_layers(self.TORCHAO_QUANT_TYPES[quant_type])

-    @pytest.mark.parametrize("quant_type", ["int8wo"])
+    @pytest.mark.parametrize("quant_type", ["int8wo"], ids=["int8wo"])
     def test_torchao_quantization_lora_inference(self, quant_type):
         self._test_quantization_lora_inference(self.TORCHAO_QUANT_TYPES[quant_type])

-    @pytest.mark.parametrize("quant_type", ["int8wo"])
+    @pytest.mark.parametrize("quant_type", ["int8wo"], ids=["int8wo"])
     def test_torchao_quantization_serialization(self, quant_type):
         self._test_quantization_serialization(self.TORCHAO_QUANT_TYPES[quant_type])
@@ -801,34 +808,34 @@ class ModelOptTesterMixin(QuantizationTesterMixin):
     def _verify_if_layer_quantized(self, name, module, config_kwargs):
         assert mtq.utils.is_quantized(module), f"Layer {name} does not have weight_quantizer attribute (not quantized)"

-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantization_num_parameters(self, config_name):
         self._test_quantization_num_parameters(self.MODELOPT_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", list(MODELOPT_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(MODELOPT_CONFIGS.keys()), ids=list(MODELOPT_CONFIGS.keys()))
     def test_modelopt_quantization_memory_footprint(self, config_name):
         expected = self.MODELOPT_EXPECTED_MEMORY_REDUCTIONS.get(config_name, 1.2)
         self._test_quantization_memory_footprint(
             self.MODELOPT_CONFIGS[config_name], expected_memory_reduction=expected
         )

-    @pytest.mark.parametrize("config_name", list(MODELOPT_CONFIGS.keys()))
+    @pytest.mark.parametrize("config_name", list(MODELOPT_CONFIGS.keys()), ids=list(MODELOPT_CONFIGS.keys()))
     def test_modelopt_quantization_inference(self, config_name):
         self._test_quantization_inference(self.MODELOPT_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantization_dtype_assignment(self, config_name):
         self._test_quantization_dtype_assignment(self.MODELOPT_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantization_lora_inference(self, config_name):
         self._test_quantization_lora_inference(self.MODELOPT_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantization_serialization(self, config_name):
         self._test_quantization_serialization(self.MODELOPT_CONFIGS[config_name])

-    @pytest.mark.parametrize("config_name", ["fp8"])
+    @pytest.mark.parametrize("config_name", ["fp8"], ids=["fp8"])
     def test_modelopt_quantized_layers(self, config_name):
         self._test_quantized_layers(self.MODELOPT_CONFIGS[config_name])
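A usage note on the `ids=` additions: pytest builds each test's node ID from the supplied ids, so every config variant appears as its own readable test case and can be selected directly with `-k`. A small self-contained sketch with a hypothetical `FAKE_CONFIGS` table (not taken from the diff) illustrating the resulting IDs:

import pytest

# Hypothetical stand-in for tables like BNB_CONFIGS or TORCHAO_QUANT_TYPES.
FAKE_CONFIGS = {
    "4bit_nf4": {"load_in_4bit": True, "bnb_4bit_quant_type": "nf4"},
    "8bit": {"load_in_8bit": True},
}


@pytest.mark.parametrize("config_name", list(FAKE_CONFIGS.keys()), ids=list(FAKE_CONFIGS.keys()))
def test_config_lookup(config_name):
    # Collected node IDs read test_config_lookup[4bit_nf4] and test_config_lookup[8bit],
    # so a single variant can be run with: pytest -k "4bit_nf4"
    assert config_name in FAKE_CONFIGS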