From db6fd95351eb402676daa948fd39ff60b3f8993f Mon Sep 17 00:00:00 2001
From: Disty0
Date: Thu, 25 Jan 2024 20:22:57 +0300
Subject: [PATCH] OpenVINO Quantization support with NNCF

---
 CHANGELOG.md                       |   3 +
 installer.py                       |   4 +-
 modules/intel/openvino/__init__.py | 111 ++++++++++++++++++-----------
 modules/shared.py                  |   4 +-
 4 files changed, 78 insertions(+), 44 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 64f27a16d..574faee76 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -209,6 +209,9 @@ As of this release, default backend is set to **diffusers** as its more feature
   - disable 1024x1024 workaround if the GPU supports 64 bit
   - fix lock-ups at very high resolutions
 - **OpenVINO**, thanks @disty0
+  - **quantization support with NNCF**
+    run 8-bit directly on your GPU without autocast
+    enable *OpenVINO Quantize Models with NNCF* from *Compute Settings*
   - **4-bit support with NNCF**
     enable *Compress Model weights with NNCF* from *Compute Settings* and set a 4-bit NNCF mode
     select both CPU and GPU from the device selection if you want to use 4-bit or 8-bit modes on GPU
diff --git a/installer.py b/installer.py
index 7a0d84379..10c8213c7 100644
--- a/installer.py
+++ b/installer.py
@@ -463,7 +463,7 @@ def check_torch():
         torch_command = os.environ.get('TORCH_COMMAND', 'torch==2.1.2 torchvision==0.16.2 --index-url https://download.pytorch.org/whl/cpu')
         install(os.environ.get('OPENVINO_PACKAGE', 'openvino==2023.3.0'), 'openvino')
         install('onnxruntime-openvino', 'onnxruntime-openvino', ignore=True) # TODO openvino: numpy version conflicts with tensorflow and doesn't support Python 3.11
-        install('nncf==2.7.0', 'nncf')
+        install('nncf==2.8.0', 'nncf')
         os.environ.setdefault('PYTORCH_TRACING_MODE', 'TORCHFX')
         os.environ.setdefault('NEOReadDebugKeys', '1')
         os.environ.setdefault('ClDeviceGlobalMemSizeAvailablePercent', '100')
@@ -533,7 +533,7 @@ def check_torch():
             log.debug(f'Cannot install xformers package: {e}')
     if opts.get('cuda_compile_backend', '') == 'hidet':
         install('hidet', 'hidet')
-    if opts.get('nncf_compress_weights', False):
+    if opts.get('nncf_compress_weights', False) and not args.use_openvino:
         install('nncf==2.7.0', 'nncf')
     if args.profile:
         print_profile(pr, 'Torch')
diff --git a/modules/intel/openvino/__init__.py b/modules/intel/openvino/__init__.py
index fc9551ba1..8c3fc7a5c 100644
--- a/modules/intel/openvino/__init__.py
+++ b/modules/intel/openvino/__init__.py
@@ -20,8 +20,7 @@
 import functools
 
 from modules import shared, devices, sd_models
-NNCFNodeName = str
-def get_node_by_name(self, name: NNCFNodeName) -> nncf.common.graph.NNCFNode:
+def get_node_by_name(self, name: str) -> nncf.common.graph.NNCFNode:
     node_ids = self._node_name_to_node_id_map.get(name, None)
     if node_ids is None:
         raise RuntimeError("Could not find a node {} in NNCFGraph!".format(name))
@@ -179,13 +178,14 @@
         result = [torch.from_numpy(res[out]) for out in compiled_model.outputs]
     return result
 
-def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, file_name=""):
+def openvino_compile(gm: GraphModule, *example_inputs, model_hash_str: str = None, file_name=""):
     core = Core()
     device = get_device()
     cache_root = shared.opts.openvino_cache_path
     global dont_use_4bit_nncf
     global dont_use_nncf
+    global dont_use_quant
 
     if file_name is not None and os.path.isfile(file_name + ".xml") and os.path.isfile(file_name + ".bin"):
         om = core.read_model(file_name + ".xml")
@@ -195,7 +195,7 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, file_na
     input_shapes = []
     input_types = []
-    for input_data in args:
+    for input_data in example_inputs:
         if isinstance(input_data, torch.SymInt):
             input_types.append(torch.SymInt)
             input_shapes.append(1)
@@ -213,7 +213,7 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, file_na
             serialize(om, file_name + ".xml", file_name + ".bin")
             if (shared.compiled_model_state.cn_model != []):
                 f = open(file_name + ".txt", "w")
-                for input_data in args:
+                for input_data in example_inputs:
                     f.write(str(input_data.size()))
                     f.write("\n")
                 f.close()
@@ -229,42 +229,6 @@ def openvino_compile(gm: GraphModule, *args, model_hash_str: str = None, file_na
         torch.bool: Type.boolean
     }
-    for idx, input_data in enumerate(args):
-        om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype])
-        om.inputs[idx].get_node().set_partial_shape(PartialShape(list(input_data.shape)))
-    om.validate_nodes_and_infer_types()
-    if shared.opts.nncf_compress_weights and not dont_use_nncf:
-        if dont_use_4bit_nncf or shared.opts.nncf_compress_weights_mode == "INT8":
-            om = nncf.compress_weights(om)
-        else:
-            om = nncf.compress_weights(om, mode=getattr(nncf.CompressWeightsMode, shared.opts.nncf_compress_weights_mode), group_size=8, ratio=shared.opts.nncf_compress_weights_raito)
-
-    if model_hash_str is not None:
-        core.set_property({'CACHE_DIR': cache_root + '/blob'})
-    dont_use_nncf = False
-    dont_use_4bit_nncf = False
-
-    compiled_model = core.compile_model(om, device)
-    return compiled_model
-
-def openvino_compile_cached_model(cached_model_path, *example_inputs):
-    core = Core()
-    om = core.read_model(cached_model_path + ".xml")
-
-    global dont_use_4bit_nncf
-    global dont_use_nncf
-
-    dtype_mapping = {
-        torch.float32: Type.f32,
-        torch.float64: Type.f64,
-        torch.float16: Type.f16,
-        torch.int64: Type.i64,
-        torch.int32: Type.i32,
-        torch.uint8: Type.u8,
-        torch.int8: Type.i8,
-        torch.bool: Type.boolean
-    }
-
     for idx, input_data in enumerate(example_inputs):
         om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype])
         om.inputs[idx].get_node().set_partial_shape(PartialShape(list(input_data.shape)))
     om.validate_nodes_and_infer_types()
@@ -274,9 +238,70 @@ def openvino_compile_cached_model(cached_model_path, *example_inputs):
             om = nncf.compress_weights(om)
         else:
             om = nncf.compress_weights(om, mode=getattr(nncf.CompressWeightsMode, shared.opts.nncf_compress_weights_mode), group_size=8, ratio=shared.opts.nncf_compress_weights_raito)
+    if shared.opts.nncf_quantize and not dont_use_quant:
+        new_inputs = []
+        for idx, _ in enumerate(example_inputs):
+            new_inputs.append(example_inputs[idx].detach().cpu().numpy())
+        new_inputs = [new_inputs]
+        if shared.opts.nncf_quant_mode == "INT8":
+            om = nncf.quantize(om, nncf.Dataset(new_inputs))
+        else:
+            om = nncf.quantize(om, nncf.Dataset(new_inputs), mode=getattr(nncf.QuantizationMode, shared.opts.nncf_quant_mode),
+                advanced_parameters=nncf.quantization.advanced_parameters.AdvancedQuantizationParameters(
+                    overflow_fix=nncf.quantization.advanced_parameters.OverflowFix.DISABLE, backend_params=None))
+
+    if model_hash_str is not None:
+        core.set_property({'CACHE_DIR': cache_root + '/blob'})
+    dont_use_nncf = False
+    dont_use_quant = False
+    dont_use_4bit_nncf = False
+
+    compiled_model = core.compile_model(om, device)
+    return compiled_model
+
+def openvino_compile_cached_model(cached_model_path, *example_inputs):
+    core = Core()
+    om = core.read_model(cached_model_path + ".xml")
+
+    global dont_use_4bit_nncf
+    global dont_use_nncf
+    global dont_use_quant
+
+    dtype_mapping = {
+        torch.float32: Type.f32,
+        torch.float64: Type.f64,
+        torch.float16: Type.f16,
+        torch.int64: Type.i64,
+        torch.int32: Type.i32,
+        torch.uint8: Type.u8,
+        torch.int8: Type.i8,
+        torch.bool: Type.boolean
+    }
+
+    for idx, input_data in enumerate(example_inputs):
+        om.inputs[idx].get_node().set_element_type(dtype_mapping[input_data.dtype])
+        om.inputs[idx].get_node().set_partial_shape(PartialShape(list(input_data.shape)))
+    om.validate_nodes_and_infer_types()
+    if shared.opts.nncf_compress_weights and not dont_use_nncf:
+        if dont_use_4bit_nncf or shared.opts.nncf_compress_weights_mode == "INT8":
+            om = nncf.compress_weights(om)
+        else:
+            om = nncf.compress_weights(om, mode=getattr(nncf.CompressWeightsMode, shared.opts.nncf_compress_weights_mode), group_size=8, ratio=shared.opts.nncf_compress_weights_raito)
+    if shared.opts.nncf_quantize and not dont_use_quant:
+        new_inputs = []
+        for idx, _ in enumerate(example_inputs):
+            new_inputs.append(example_inputs[idx].detach().cpu().numpy())
+        new_inputs = [new_inputs]
+        if shared.opts.nncf_quant_mode == "INT8":
+            om = nncf.quantize(om, nncf.Dataset(new_inputs))
+        else:
+            om = nncf.quantize(om, nncf.Dataset(new_inputs), mode=getattr(nncf.QuantizationMode, shared.opts.nncf_quant_mode),
+                advanced_parameters=nncf.quantization.advanced_parameters.AdvancedQuantizationParameters(
+                    overflow_fix=nncf.quantization.advanced_parameters.OverflowFix.DISABLE, backend_params=None))
 
     core.set_property({'CACHE_DIR': shared.opts.openvino_cache_path + '/blob'})
     dont_use_nncf = False
+    dont_use_quant = False
     dont_use_4bit_nncf = False
 
     compiled_model = core.compile_model(om, get_device())
@@ -366,10 +391,12 @@ def get_subgraph_type(tensor):
 def openvino_fx(subgraph, example_inputs):
     global dont_use_4bit_nncf
     global dont_use_nncf
+    global dont_use_quant
     global subgraph_type
 
     dont_use_4bit_nncf = False
     dont_use_nncf = False
+    dont_use_quant = False
     dont_use_faketensors = False
     executor_parameters = None
     inputs_reversed = False
@@ -386,6 +413,7 @@ def openvino_fx(subgraph, example_inputs):
             dont_use_4bit_nncf = True
 
         dont_use_nncf = bool("VAE" not in shared.opts.nncf_compress_weights)
+        dont_use_quant = bool("VAE" not in shared.opts.nncf_quantize)
 
     # SD 1.5 / SDXL Text Encoder
     elif (subgraph_type[0] is torch.nn.modules.sparse.Embedding and
@@ -395,6 +423,7 @@ def openvino_fx(subgraph, example_inputs):
         dont_use_faketensors = True
 
         dont_use_nncf = bool("Text Encoder" not in shared.opts.nncf_compress_weights)
+        dont_use_quant = bool("Text Encoder" not in shared.opts.nncf_quantize)
 
     if not shared.opts.openvino_disable_model_caching:
         os.environ.setdefault('OPENVINO_TORCH_MODEL_CACHING', "1")
diff --git a/modules/shared.py b/modules/shared.py
index 1e8e67049..1c9c4b5af 100644
--- a/modules/shared.py
+++ b/modules/shared.py
@@ -378,7 +378,9 @@
 options_templates.update(options_section(('cuda', "Compute Settings"), {
     "openvino_sep": OptionInfo("<h2>OpenVINO</h2>", "", gr.HTML, {"visible": cmd_opts.use_openvino}),
     "openvino_devices": OptionInfo([], "OpenVINO devices to use", gr.CheckboxGroup, {"choices": get_openvino_device_list() if cmd_opts.use_openvino else [], "visible": cmd_opts.use_openvino}),
-    "nncf_compress_weights_mode": OptionInfo("INT8", "OpenVINO compress mode for NNCF", gr.Radio, {"choices": ['INT8', 'INT4_SYM', 'INT4_ASYM', 'NF4'], "visible": cmd_opts.use_openvino}),
+    "nncf_quantize": OptionInfo([], "OpenVINO Quantize Models with NNCF", gr.CheckboxGroup, {"choices": ["Model", "VAE", "Text Encoder"], "visible": cmd_opts.use_openvino}),
+    "nncf_quant_mode": OptionInfo("INT8", "OpenVINO Quantization Mode with NNCF", gr.Radio, {"choices": ['INT8', 'FP8_E4M3', 'FP8_E5M2'], "visible": cmd_opts.use_openvino}),
+    "nncf_compress_weights_mode": OptionInfo("INT8", "OpenVINO compress mode for NNCF", gr.Radio, {"choices": ['INT8', 'INT8_SYM', 'INT4_ASYM', 'INT4_SYM', 'NF4'], "visible": cmd_opts.use_openvino}),
     "nncf_compress_weights_raito": OptionInfo(1.0, "OpenVINO compress ratio for NNCF", gr.Slider, {"minimum": 0, "maximum": 1, "step": 0.01, "visible": cmd_opts.use_openvino}),
     "openvino_disable_model_caching": OptionInfo(False, "OpenVINO disable model caching", gr.Checkbox, {"visible": cmd_opts.use_openvino}),
 }))