From db6fd95351eb402676daa948fd39ff60b3f8993f Mon Sep 17 00:00:00 2001
From: Disty0
Date: Thu, 25 Jan 2024 20:22:57 +0300
Subject: [PATCH] OpenVINO Quantization support with NNCF

---
 CHANGELOG.md                       |   3 +
 installer.py                       |   4 +-
 modules/intel/openvino/__init__.py | 111 ++++++++++++++++++-----------
 modules/shared.py                  |   4 +-
 4 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 64f27a16d..574faee76 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -209,6 +209,9 @@ As of this release, default backend is set to **diffusers** as its more feature
   - disable 1024x1024 workaround if the GPU supports 64 bit
   - fix lock-ups at very high resolutions
 - **OpenVINO**, thanks @disty0
+  - **quantization support with NNCF**
+    run 8-bit directly on your GPU without autocast
+    enable *OpenVINO Quantize Models with NNCF* from *Compute Settings*
   - **4-bit support with NNCF**
     enable *Compress Model weights with NNCF* from *Compute Settings* and set a 4-bit NNCF mode
     select both CPU and GPU from the device selection if you want to use 4-bit or 8-bit modes on GPU
diff --git a/installer.py b/installer.py
index 7a0d84379..10c8213c7 100644
--- a/installer.py
+++ b/installer.py
@@ -463,7 +463,7 @@ def check_torch():
         torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cpu')
         install(os.environ.get('OPENVINO_PACKAGE', 'openvino==2023.3.0'), 'openvino')
         install('onnxruntime-openvino', 'onnxruntime-openvino', ignore=True) # TODO openvino: numpy version conflicts with tensorflow and doesn't support Python 3.11
-        install('nncf==2.7.0', 'nncf')
+        install('nncf==2.8.0', 'nncf')
         os.environ.setdefault('PYTORCH_TRACING_MODE', 'TORCHFX')
         os.environ.setdefault('NEOReadDebugKeys', '1')
         os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100')
@@ -533,7 +533,7 @@ def check_torch():
             log.debug(f'Cannot install xformers package: {e}')
     if opts.get('cuda_compile_backend', '') == 'hidet':
         install('hidet', 'hidet')
-    if opts.get('nncf_compress_weights', False):
+    if opts.get('nncf_compress_weights', False) and not args.use_openvino:
         install('nncf==2.7.0', 'nncf')
     if args.profile:
         print_profile(pr, 'Torch')
diff --git a/modules/intel/openvino/__init__.py b/modules/intel/openvino/__init__.py
index fc9551ba1..8c3fc7a5c 100644
--- a/modules/intel/openvino/__init__.py
+++ b/modules/intel/openvino/__init__.py
@@ -20,8 +20,7 @@
 import functools
 
 from modules import shared, devices, sd_models
-NNCFNodeName = str
-def get_node_by_name(self, name: NNCFNodeName) -> nncf.common.graph.NNCFNode:
+def get_node_by_name(self, name: str) -> nncf.common.graph.NNCFNode:
     node_ids = self._node_name_to_node_id_map.get(name, None)
     if node_ids is None:
         raise RuntimeError("Could not find a node {} in NNCFGraph!".format(name))
@@ -179,13 +178,14 @@
         result = [torch.from_numpy(res[out]) for out in compiled_model.outputs]
     return result
 
-def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, file_name=""):
+def openvino_compile(gm: GraphModule, *example_inputs, model_hash_str: str = None, file_name=""):
     core = Core()
     device = get_device()
     cache_root = shared.opts.openvino_cache_path
     global dont_use_4bit_nncf
     global dont_use_nncf
+    global dont_use_quant
 
     if file_name is not None and os.path.isfile(file_name + ".xml") and os.path.isfile(file_name + ".bin"):
         om = core.read_model(file_name + ".xml")
@@ -195,7 +195,7 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, file_na
     input_shapes = []
     input_types = []
-    for input_data in args:
+    for input_data in example_inputs:
         if isinstance(input_data, torch.SymInt):
             input_types.append(torch.SymInt)
             input_shapes.append(1)
@@ -213,7 +213,7 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, file_na
             serialize(om, file_name + ".xml", file_name + ".bin")
             if (shared.compiled_model_state.cn_model != []):
                 f = open(file_name + ".txt", "w")
-                for input_data in args:
+                for input_data in example_inputs:
                     f.write(str(input_data.size()))
                     f.write("\n")
                 f.close()
@@ -229,42 +229,6 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, file_na
         torch.bool: Type.boolean
     }
-    for idx, input_data in enumerate(args):
-        om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype])
-        om.inputs[idx].get_node().set_partial_shape(PartialShape(list(input_data.shape)))
-    om.validate_nodes_and_infer_types()
-    if shared.opts.nncf_compress_weights and not dont_use_nncf:
-        if dont_use_4bit_nncf or shared.opts.nncf_compress_weights_mode == "INT8":
-            om = nncf.compress_weights(om)
-        else:
-            om = nncf.compress_weights(om, mode=getattr(nncf.CompressWeightsMode, shared.opts.nncf_compress_weights_mode), group_size=8, ratio=shared.opts.nncf_compress_weights_raito)
-
-    if model_hash_str is not None:
-        core.set_property({'CACHE_DIR': cache_root + '/blob'})
-    dont_use_nncf = False
-    dont_use_4bit_nncf = False
-
-    compiled_model = core.compile_model(om, device)
-    return compiled_model
-
-def openvino_compile_cached_model(cached_model_path, *example_inputs):
-    core = Core()
-    om = core.read_model(cached_model_path + ".xml")
-
-    global dont_use_4bit_nncf
-    global dont_use_nncf
-
-    dtype_mapping = {
-        torch.float32: Type.f32,
-        torch.float64: Type.f64,
-        torch.float16: Type.f16,
-        torch.int64: Type.i64,
-        torch.int32: Type.i32,
-        torch.uint8: Type.u8,
-        torch.int8: Type.i8,
-        torch.bool: Type.boolean
-    }
-
     for idx, input_data in enumerate(example_inputs):
         om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype])
         om.inputs[idx].get_node().set_partial_shape(PartialShape(list(input_data.shape)))
     om.validate_nodes_and_infer_types()
@@ -274,9 +238,70 @@ def openvino_compile_cached_model(cached_model_path, *example_inputs):
             om = nncf.compress_weights(om)
         else:
             om = nncf.compress_weights(om, mode=getattr(nncf.CompressWeightsMode, shared.opts.nncf_compress_weights_mode), group_size=8, ratio=shared.opts.nncf_compress_weights_raito)
+    if shared.opts.nncf_quantize and not dont_use_quant:
+        new_inputs = []
+        for idx, _ in enumerate(example_inputs):
+            new_inputs.append(example_inputs[idx].detach().cpu().numpy())
+        new_inputs = [new_inputs]
+        if shared.opts.nncf_quant_mode == "INT8":
+            om = nncf.quantize(om, nncf.Dataset(new_inputs))
+        else:
+            om = nncf.quantize(om, nncf.Dataset(new_inputs), mode=getattr(nncf.QuantizationMode, shared.opts.nncf_quant_mode),
+                advanced_parameters=nncf.quantization.advanced_parameters.AdvancedQuantizationParameters(
+                    overflow_fix=nncf.quantization.advanced_parameters.OverflowFix.DISABLE, backend_params=None))
+
+    if model_hash_str is not None:
+        core.set_property({'CACHE_DIR': cache_root + '/blob'})
+    dont_use_nncf = False
+    dont_use_quant = False
+    dont_use_4bit_nncf = False
+
+    compiled_model = core.compile_model(om, device)
+    return compiled_model
+
+def openvino_compile_cached_model(cached_model_path, *example_inputs):
+    core = Core()
+    om = core.read_model(cached_model_path + ".xml")
+
+    global dont_use_4bit_nncf
+    global dont_use_nncf
+    global dont_use_quant
+
+    dtype_mapping = {
+        torch.float32: Type.f32,
+        torch.float64: Type.f64,
+        torch.float16: Type.f16,
+        torch.int64: Type.i64,
+        torch.int32: Type.i32,
+        torch.uint8: Type.u8,
+        torch.int8: Type.i8,
+        torch.bool: Type.boolean
+    }
+
+    for idx, input_data in enumerate(example_inputs):
+        om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype])
+        om.inputs[idx].get_node().set_partial_shape(PartialShape(list(input_data.shape)))
+    om.validate_nodes_and_infer_types()
+    if shared.opts.nncf_compress_weights and not dont_use_nncf:
+        if dont_use_4bit_nncf or shared.opts.nncf_compress_weights_mode == "INT8":
+            om = nncf.compress_weights(om)
+        else:
+            om = nncf.compress_weights(om, mode=getattr(nncf.CompressWeightsMode, shared.opts.nncf_compress_weights_mode), group_size=8, ratio=shared.opts.nncf_compress_weights_raito)
+    if shared.opts.nncf_quantize and not dont_use_quant:
+        new_inputs = []
+        for idx, _ in enumerate(example_inputs):
+            new_inputs.append(example_inputs[idx].detach().cpu().numpy())
+        new_inputs = [new_inputs]
+        if shared.opts.nncf_quant_mode == "INT8":
+            om = nncf.quantize(om, nncf.Dataset(new_inputs))
+        else:
+            om = nncf.quantize(om, nncf.Dataset(new_inputs), mode=getattr(nncf.QuantizationMode, shared.opts.nncf_quant_mode),
+                advanced_parameters=nncf.quantization.advanced_parameters.AdvancedQuantizationParameters(
+                    overflow_fix=nncf.quantization.advanced_parameters.OverflowFix.DISABLE, backend_params=None))
 
     core.set_property({'CACHE_DIR': shared.opts.openvino_cache_path + '/blob'})
     dont_use_nncf = False
+    dont_use_quant = False
     dont_use_4bit_nncf = False
 
     compiled_model = core.compile_model(om, get_device())
@@ -366,10 +391,12 @@ def get_subgraph_type(tensor):
 def openvino_fx(subgraph, example_inputs):
     global dont_use_4bit_nncf
     global dont_use_nncf
+    global dont_use_quant
     global subgraph_type
 
     dont_use_4bit_nncf = False
     dont_use_nncf = False
+    dont_use_quant = False
     dont_use_faketensors = False
     executor_parameters = None
     inputs_reversed = False
@@ -386,6 +413,7 @@ def openvino_fx(subgraph, example_inputs):
             dont_use_4bit_nncf = True
 
         dont_use_nncf = bool("VAE" not in shared.opts.nncf_compress_weights)
+        dont_use_quant = bool("VAE" not in shared.opts.nncf_quantize)
 
     # SD 1.5 / SDXL Text Encoder
     elif (subgraph_type[0] is torch.nn.modules.sparse.Embedding and
@@ -395,6 +423,7 @@ def openvino_fx(subgraph, example_inputs):
         dont_use_faketensors = True
 
         dont_use_nncf = bool("Text Encoder" not in shared.opts.nncf_compress_weights)
+        dont_use_quant = bool("Text Encoder" not in shared.opts.nncf_quantize)
 
     if not shared.opts.openvino_disable_model_caching:
         os.environ.setdefault('OPENVINO_TORCH_MODEL_CACHING', "1")
diff --git a/modules/shared.py b/modules/shared.py
index 1e8e67049..1c9c4b5af 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -378,7 +378,9 @@
 options_templates.update(options_section(('cuda', "Compute Settings"), {
     "openvino_sep": OptionInfo("<h2>OpenVINO</h2>", "", gr.HTML, {"visible": cmd_opts.use_openvino}),
     "openvino_devices": OptionInfo([], "OpenVINO devices to use", gr.CheckboxGroup, {"choices": get_openvino_device_list() if cmd_opts.use_openvino else [], "visible": cmd_opts.use_openvino}),
-    "nncf_compress_weights_mode": OptionInfo("INT8", "OpenVINO compress mode for NNCF", gr.Radio, {"choices": ['INT8', 'INT4_SYM', 'INT4_ASYM', 'NF4'], "visible": cmd_opts.use_openvino}),
+    "nncf_quantize": OptionInfo([], "OpenVINO Quantize Models with NNCF", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder"], "visible": cmd_opts.use_openvino}),
+    "nncf_quant_mode": OptionInfo("INT8", "OpenVINO Quantization Mode with NNCF", gr.Radio, {"choices": ['INT8', 'FP8_E4M3', 'FP8_E5M2'], "visible": cmd_opts.use_openvino}),
+    "nncf_compress_weights_mode": OptionInfo("INT8", "OpenVINO compress mode for NNCF", gr.Radio, {"choices": ['INT8', 'INT8_SYM', 'INT4_ASYM', 'INT4_SYM', 'NF4'], "visible": cmd_opts.use_openvino}),
     "nncf_compress_weights_raito": OptionInfo(1.0, "OpenVINO compress ratio for NNCF", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01, "visible": cmd_opts.use_openvino}),
     "openvino_disable_model_caching": OptionInfo(False, "OpenVINO disable model caching", gr.Checkbox, {"visible": cmd_opts.use_openvino}),
 }))