From 66bd237bc5fddafa813f6564cf041a539f39429d Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 6 Jul 2025 01:00:01 +0800
Subject: [PATCH] fix

Signed-off-by: Isotr0py <2037008807@qq.com>
---
 src/diffusers/quantizers/gguf/utils.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/src/diffusers/quantizers/gguf/utils.py b/src/diffusers/quantizers/gguf/utils.py
index 2701040854..03521eadb2 100644
--- a/src/diffusers/quantizers/gguf/utils.py
+++ b/src/diffusers/quantizers/gguf/utils.py
@@ -78,17 +78,21 @@ def _fused_mul_mat_gguf(x: torch.Tensor, qweight: torch.Tensor,
     # there is no need to call any kernel for fp16/bf16
     if qweight_type in UNQUANTIZED_TYPES:
         return x @ qweight.T
-    # enable MMVQ in contiguous batching with batch_size=1
-    if qweight_type in MMVQ_QUANT_TYPES:
-        y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
-    # Use MMQ Kernel if it's available (standard + k-quants)
-    elif qweight_type in MMQ_QUANT_TYPES:
-        y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])
+
+    # TODO(Isotr0py): GGUF's MMQ and MMVQ implementations are designed for
+    # contiguous batching and are inefficient with diffusers' batching,
+    # so they are disabled for now.
+
+    # elif qweight_type in MMVQ_QUANT_TYPES:
+    #     y = ops.ggml_mul_mat_vec_a8(qweight, x, qweight_type, qweight.shape[0])
+    # elif qweight_type in MMQ_QUANT_TYPES:
+    #     y = ops.ggml_mul_mat_a8(qweight, x, qweight_type, qweight.shape[0])
     # If there is no available MMQ kernel, fallback to dequantize
+
     elif qweight_type in DEQUANT_TYPES:
         block_size, type_size = gguf.GGML_QUANT_SIZES[qweight_type]
         shape = (qweight.shape[0], qweight.shape[1] // type_size * block_size)
-        weight = ops.ggml_dequantize(qweight, qweight_type, *shape, x.dtype)
+        weight = ops.ggml_dequantize(qweight, qweight_type, *shape)
         y = x @ weight.T
     else:
         # Raise an error if the quantization type is not supported.
@@ -539,5 +543,10 @@ class GGUFLinear(nn.Linear):
 
     def forward_cuda(self, inputs):
         quant_type = self.weight.quant_type
-        return _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type)
+        orig_shape = inputs.shape
+        inputs = inputs.view(-1, orig_shape[-1])
+        output = _fused_mul_mat_gguf(inputs.to(self.compute_dtype), self.weight, quant_type)
+        if self.bias is not None:
+            output = output + self.bias.to(self.compute_dtype)
+        return output.view(*orig_shape[:-1], -1)
 
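
Note: a minimal sketch of the shape handling that the updated GGUFLinear.forward_cuda performs: leading batch dimensions are flattened so the fused matmul sees a 2D input, the bias is added in the compute dtype, and the original batch shape is restored afterwards. The helper linear_2d below is hypothetical and uses a plain dense weight in place of the GGUF kernels / dequantized tensor, purely to illustrate the reshape logic.

    import torch
    from typing import Optional

    def linear_2d(inputs: torch.Tensor, weight: torch.Tensor,
                  bias: Optional[torch.Tensor], compute_dtype: torch.dtype) -> torch.Tensor:
        # Flatten all leading (batch) dimensions so the matmul runs as a plain 2D GEMM,
        # mirroring the reshape added to forward_cuda in this patch.
        orig_shape = inputs.shape
        x = inputs.reshape(-1, orig_shape[-1]).to(compute_dtype)
        # Dense stand-in for the quantized/dequantized weight path.
        y = x @ weight.T  # (N, in_features) @ (in_features, out_features)
        if bias is not None:
            y = y + bias.to(compute_dtype)
        # Restore the original batch dimensions; the last dim becomes out_features.
        return y.view(*orig_shape[:-1], -1)

    # Usage: a (batch, seq, hidden) activation against a dense weight.
    x = torch.randn(2, 16, 64)
    w = torch.randn(128, 64)
    b = torch.randn(128)
    assert linear_2d(x, w, b, torch.float32).shape == (2, 16, 128)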