Skip to content

Commit 202bcf8

Browse files
digantdesai (via facebook-github-bot)
authored and committed
Add groupwise quant support (#2512)
Summary: Pull Request resolved: #2512 Reviewed By: kimishpatel, mcr229 Differential Revision: D55079666 Pulled By: digantdesai fbshipit-source-id: 63042d71dd46a75c443bdb186da2174ebb5c79cd
1 parent f9cad4e commit 202bcf8

14 files changed

Lines changed: 327 additions & 40 deletions

File tree

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
url = https://github.com/Maratyszcza/FXdiv.git
3131
[submodule "backends/xnnpack/third-party/XNNPACK"]
3232
path = backends/xnnpack/third-party/XNNPACK
33-
url = https://github.com/google/XNNPACK.git
33+
url = https://github.com/digantdesai/XNNPACK.git
3434
[submodule "backends/arm/third-party/serialization_lib"]
3535
path = backends/arm/third-party/serialization_lib
3636
url = https://review.mlplatform.org/tosa/serialization_lib

backends/xnnpack/operators/node_visitor.py

Lines changed: 68 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import sys
99

1010
from pathlib import Path
11-
from typing import cast, Dict, Optional, Tuple
11+
from typing import cast, Dict, List, Optional, Tuple
1212

1313
import torch
1414
from executorch.backends.transforms import get_shape
@@ -21,6 +21,7 @@
2121

2222
from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
2323
ConstantDataOffset,
24+
PerChannelGroupQuant,
2425
PerChannelQuant,
2526
PerTensorQuant,
2627
PerTokenDynamicQuant,
@@ -229,12 +230,20 @@ def get_per_channel_dtype(
229230
if quant_params.dtype == torch.int32:
230231
return XNNDatatype.xnn_datatype_qcint32
231232
elif quant_params.dtype == torch.int8:
232-
# 4/8-bit per channel quantized weights
233-
return (
234-
XNNDatatype.xnn_datatype_qcint4
235-
if quant_params.is_qc4w
236-
else XNNDatatype.xnn_datatype_qcint8
237-
)
233+
if quant_params.is_per_channel_group:
234+
# 4-bit per channel group quantized weights
235+
# No 8-bit support yet
236+
assert (
237+
quant_params.is_qc4w is True
238+
), "Only 4-bit per channel group quantization is supported"
239+
return XNNDatatype.xnn_datatype_qbint4
240+
else:
241+
# 4/8-bit per channel quantized weights
242+
return (
243+
XNNDatatype.xnn_datatype_qcint4
244+
if quant_params.is_qc4w
245+
else XNNDatatype.xnn_datatype_qcint8
246+
)
238247
else:
239248
raise RuntimeError(
240249
f"Unable to resolve static quantized tensor dtype using quant params dtype: {quant_params.dtype}, [qmin, qmax]: {quant_params.qmin}, {quant_params.qmax} for per channel quantization"
@@ -266,10 +275,17 @@ def get_per_channel_dtype(
266275
def get_quant_params(self, quant_params: QuantParams) -> XNNQuantParams:
267276
if quant_params.per_channel:
268277
scale = cast(torch.Tensor, quant_params.scale)
269-
return PerChannelQuant(
270-
scale=scale.tolist(),
271-
channel_dim=quant_params.axis,
272-
)
278+
if quant_params.is_per_channel_group:
279+
return PerChannelGroupQuant(
280+
scale=scale.flatten().tolist(),
281+
channel_dim=quant_params.axis,
282+
group_size=quant_params.group_size,
283+
)
284+
else: # per_channel quant
285+
return PerChannelQuant(
286+
scale=scale.tolist(),
287+
channel_dim=quant_params.axis,
288+
)
273289
elif quant_params.is_dynamic:
274290
# NB:
275291
# We use per_token quantization for per_tensor quantization
@@ -284,6 +300,42 @@ def get_quant_params(self, quant_params: QuantParams) -> XNNQuantParams:
284300
zero_point=cast(int, quant_params.zp),
285301
)
286302

303+
@staticmethod
304+
def _check_per_channel_group_params(
305+
quant_params: QuantParams, dims: List[int]
306+
) -> None:
307+
# Make sure things are lining up for per_channel_group quantization case
308+
# Has to be done this late because we don't have clean access to the actual tensor
309+
assert quant_params.is_per_channel_group, "Not per_channel_group quantization"
310+
# linear weights will be in [oc, ic]. And per_channel quantization must be on axis 0
311+
num_groups = cast(torch.Tensor, quant_params.scale).shape[1]
312+
assert (
313+
quant_params.axis == 0
314+
), f"For per_channel_group quant, axis must be 0, but got {quant_params.axis}"
315+
assert (
316+
len(dims) == 2
317+
), "For per_channel_group quant, expecting linear weights to be 2d, but got {len(dims)}"
318+
assert (
319+
num_groups > 0 and quant_params.group_size > 0
320+
), "For per_channel_group quant, num_groups and group_size must be > 0, but got num_groups: {num_groups}, group_size: {quant_params.group_size}"
321+
output_channels = dims[quant_params.axis]
322+
input_channels = dims[quant_params.axis ^ 1]
323+
assert (
324+
output_channels == cast(torch.Tensor, quant_params.scale).shape[0]
325+
), f"For per_channel_group quant, expecting output channels to match scale.shape[0], but got: {output_channels}, scale.shape[0]: {quant_params.scale.shape[0]}"
326+
assert (
327+
input_channels % num_groups == 0
328+
), "For per_channel_group quant, expecting input channels to be divisible by num_groups, but got ic: {input_channels}, num_groups: {num_groups}"
329+
assert (
330+
input_channels % quant_params.group_size == 0
331+
), "For per_channel_group quant, expecting input channels to be divisible by group_size, but got ic: {input_channels}, group_size: {quant_params.group_size}"
332+
assert (
333+
input_channels / quant_params.group_size == num_groups
334+
), "For per_channel_group quant, expecting input channels // group_size == num_groups, but got ic: {input_channels}, group_size: {quant_params.group_size}, num_groups: {num_groups}"
335+
336+
# For now group quantization is only supported for 4b weights
337+
assert quant_params.is_qc4w, "Only 4b group quantization is supported"
338+
287339
def define_tensor(
288340
self,
289341
tensor: torch.fx.Node,
@@ -331,6 +383,10 @@ def define_tensor(
331383
dims = get_shape(tensor)
332384
dims = [1] if len(dims) == 0 else dims
333385

386+
# check for per_channel_group quantization
387+
if quant_params and quant_params.per_channel_group:
388+
self._check_per_channel_group_params(quant_params, dims)
389+
334390
# constant values serialize data
335391
buffer_idx = self.get_serialized_buffer_index(
336392
tensor,
@@ -376,6 +432,7 @@ def define_tensor(
376432
else:
377433
assert f"Unsupported weight per channel quantization axis for depthwise conv2d: {quant_params.axis}, expecting 0."
378434

435+
# Serialize tensor value
379436
ser_val = (
380437
XValue(xvalue_union=tvalue)
381438
if quant_params is None

backends/xnnpack/operators/op_skip_ops.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,21 @@ class OpChooseQparamsToken(OpSkipOps):
104104
"""
105105

106106
target = "quantized_decomposed.choose_qparams_per_token_asymmetric.default"
107+
108+
109+
@register_node_visitor
110+
class OpQuantizePerChannelGroupDefault(OpSkipOps):
111+
"""
112+
do nothing if node is quantize_per_channel_group.default
113+
"""
114+
115+
target = "quantized_decomposed.quantize_per_channel_group.default"
116+
117+
118+
@register_node_visitor
119+
class OpDequantizePerChannelGroupDefault(OpSkipOps):
120+
"""
121+
do nothing if node is dequantize_per_channel_group.default
122+
"""
123+
124+
target = "quantized_decomposed.dequantize_per_channel_group.default"

backends/xnnpack/operators/quant_params.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ def __init__(
5656
is_input: bool,
5757
is_dynamic: bool = False,
5858
num_nonbatch_dims: int = 1,
59+
group_size: int = 0,
5960
) -> None:
6061
self.per_channel = per_channel
6162
self.q_input = q_input
@@ -77,12 +78,29 @@ def __init__(
7778
and self.dtype == torch.int8
7879
)
7980

81+
# Groupwise quantization for weight
82+
self.per_channel_group = False
83+
self.group_size = group_size
84+
if self.group_size > 0:
85+
assert (
86+
self.per_channel is True
87+
), "Only per channel quantization supports groupwise quantization"
88+
assert (
89+
cast(torch.Tensor, scale).ndim == 2
90+
), "Scale must be 2D for per channel groupwise quant"
91+
self.per_channel_group = True
92+
assert group_size > 0, "Group size must be greater than 0"
93+
self.is_per_channel_group = self.per_channel and self.group_size > 0
94+
8095
def quantize_tensor(self, tensor: torch.Tensor) -> torch.Tensor:
8196
# Do nothing if already quantized by the Quantizer
8297
if tensor.dtype == self.dtype:
8398
return tensor
8499

85100
if self.per_channel:
101+
assert (
102+
self.per_channel_group is False
103+
), f"Not expecting per channel group quantization, got q dtype: {self.dtype}, tensor.dtype {tensor.dtype}"
86104
assert (
87105
tensor.shape[self.axis] == cast(torch.Tensor, self.scale).shape[0]
88106
), f"Invalid size of per channel quantization scales, axis: {self.axis}, scale size: {self.scale.shape}, tensor shape: {tensor.shape}"
@@ -148,6 +166,16 @@ def from_q_dq_node(
148166
exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
149167
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
150168
]
169+
170+
_groupwise = False
171+
if quant_node.target in [
172+
exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default,
173+
exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default,
174+
]:
175+
# This is a sub-category of per channel quantization
176+
per_channel = True
177+
_groupwise = True
178+
151179
scale = quant_node.args[1]
152180
zp = quant_node.args[2]
153181
axis = 0
@@ -166,16 +194,34 @@ def _get_tensor(node):
166194
scale = _get_tensor(scale)
167195
zp = _get_tensor(zp)
168196
axis = cast(int, quant_node.args[3])
197+
198+
if _groupwise:
199+
scale_tensor = cast(torch.Tensor, scale)
200+
assert (
201+
scale_tensor.ndim == 2
202+
), f"Weight scale must be 2D for per_channel_group [de]quant node, got {scale_tensor.ndim}D"
203+
axis = 0 # axis is ignored for groupwise quantization
204+
169205
check_or_raise(
170206
bool(
171207
quant_node.args[-1] != torch.uint8
172208
or quant_node.args[-1] != torch.quint8
173209
),
174210
"XNNPACK does not support unsigned quantization",
175211
)
176-
dtype = cast(torch.dtype, quant_node.args[-1])
177-
qmax = cast(int, quant_node.args[-2])
178-
qmin = cast(int, quant_node.args[-3])
212+
213+
if _groupwise:
214+
_ = quant_node.args[-1] # output dtype - not used
215+
group_size = cast(int, quant_node.args[-2])
216+
dtype = cast(torch.dtype, quant_node.args[-3])
217+
qmax = cast(int, quant_node.args[-4])
218+
qmin = cast(int, quant_node.args[-5])
219+
else:
220+
group_size = 0
221+
dtype = cast(torch.dtype, quant_node.args[-1])
222+
qmax = cast(int, quant_node.args[-2])
223+
qmin = cast(int, quant_node.args[-3])
224+
179225
is_output = any(
180226
user_node.op == "output" for user_node in quant_node.users.keys()
181227
)
@@ -191,6 +237,7 @@ def _get_tensor(node):
191237
qmin,
192238
is_output,
193239
is_input,
240+
group_size=group_size,
194241
)
195242

196243
@classmethod

backends/xnnpack/partition/configs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,11 +137,13 @@
137137

138138
# Modules which support dynamic quantization
139139
# These already support dynamic shape.
140-
SUPPORTED_DYN_QUANT_MODULES = [
140+
SUPPORTED_DYN_QUANT_LINEAR_MODULES = [
141141
torch.nn.Linear,
142142
torch.nn.functional.linear,
143143
]
144144

145+
SUPPORTED_DYN_QUANT_MODULES = SUPPORTED_DYN_QUANT_LINEAR_MODULES
146+
145147
# TODO delete this once we catch up to 100% of the supported op with dynamic shape support.
146148
# This is tobe used only during the transition when we may not want to partition all the
147149
# nodes for a dynamic model.

backends/xnnpack/partition/xnnpack_partitioner.py

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from executorch.backends.xnnpack.partition.configs import (
1515
_SUPPORTED_MODULES_WITH_DYNAMIC_SHAPE,
1616
_SUPPORTED_OPS_WITH_DYNAMIC_SHAPE,
17+
SUPPORTED_DYN_QUANT_LINEAR_MODULES,
1718
SUPPORTED_DYN_QUANT_MODULES,
1819
SUPPORTED_MODULES,
1920
SUPPORTED_OPS,
@@ -26,7 +27,11 @@
2627
FuseBatchNormWithConvPass,
2728
)
2829
from executorch.backends.xnnpack.utils.quant_utils import is_dequant
29-
from executorch.backends.xnnpack.utils.utils import get_input_node, is_param_node
30+
from executorch.backends.xnnpack.utils.utils import (
31+
get_input_node,
32+
get_source_fn,
33+
is_param_node,
34+
)
3035
from executorch.backends.xnnpack.xnnpack_preprocess import XnnpackBackend
3136

3237
from executorch.exir.backend.canonical_partitioners.pattern_op_partitioner import (
@@ -333,10 +338,14 @@ def choose_qparams_tensor(cqp: torch.fx.Node, ep: ExportedProgram) -> bool: # n
333338
def dequant_per_token(dq: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa
334339
node = list(dq.users.keys())[0]
335340
assert isinstance(node, torch.fx.Node)
336-
return node.target in [
337-
exir_ops.edge.aten.mm.default,
338-
exir_ops.edge.aten.addmm.default,
339-
]
341+
return (
342+
node.target
343+
in [
344+
exir_ops.edge.aten.mm.default,
345+
exir_ops.edge.aten.addmm.default,
346+
]
347+
or get_source_fn(node) in SUPPORTED_DYN_QUANT_LINEAR_MODULES
348+
)
340349

341350
@_constraint(exir_ops.edge.quantized_decomposed.quantize_per_token.default)
342351
def quant_per_token(q: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa
@@ -363,6 +372,38 @@ def choose_qparams_per_token_asymmetric(
363372
and XnnpackOperatorSupport.check_constraint(q, ep)
364373
)
365374

375+
@_constraint(
376+
exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default
377+
)
378+
def dequant_per_channel_group_default(
379+
dq: torch.fx.Node, ep: ExportedProgram # noqa
380+
) -> bool:
381+
# Currently only supported by dqlinear weights
382+
permute_node = list(dq.users.keys())[0]
383+
assert isinstance(permute_node, torch.fx.Node)
384+
# We must have a transpose on [add]mm weights
385+
if permute_node.target != exir_ops.edge.aten.permute_copy.default:
386+
return False
387+
mm_node = list(permute_node.users.keys())[0]
388+
assert isinstance(mm_node, torch.fx.Node)
389+
return mm_node.target in [
390+
exir_ops.edge.aten.mm.default,
391+
exir_ops.edge.aten.addmm.default,
392+
]
393+
394+
@_constraint(exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default)
395+
def quant_per_channel_group_default(
396+
q: torch.fx.Node, ep: ExportedProgram # noqa
397+
) -> bool:
398+
# we shouldn't have this with folded quant weights but doesn't hurt to lower it
399+
dq = list(q.users.keys())[0]
400+
assert isinstance(dq, torch.fx.Node)
401+
return (
402+
dq.target
403+
== exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default
404+
and XnnpackOperatorSupport.dequant_per_channel_default(dq, ep)
405+
)
406+
366407
@_constraint(exir_ops.edge.aten.pow.Tensor_Scalar)
367408
def pow_tensor_scalar(node: torch.fx.Node, ep: ExportedProgram) -> bool: # noqa
368409
"""
@@ -612,13 +653,15 @@ class XnnpackQuantizedPartitioner(XnnpackFloatingPointPartitioner):
612653
_Q_OPS = [
613654
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
614655
exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
656+
exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default,
615657
exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
616658
exir_ops.edge.quantized_decomposed.quantize_per_token.default,
617659
]
618660

619661
_DQ_OPS = [
620662
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
621663
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
664+
exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default,
622665
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
623666
exir_ops.edge.quantized_decomposed.dequantize_per_token.default,
624667
]
@@ -763,13 +806,17 @@ class XnnpackPartitioner(Partitioner):
763806
_Q_OPS = [
764807
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
765808
exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
809+
exir_ops.edge.quantized_decomposed.quantize_per_channel_group.default,
766810
exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
811+
exir_ops.edge.quantized_decomposed.quantize_per_token.default,
767812
]
768813

769814
_DQ_OPS = [
770815
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
771816
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
817+
exir_ops.edge.quantized_decomposed.dequantize_per_channel_group.default,
772818
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
819+
exir_ops.edge.quantized_decomposed.dequantize_per_token.default,
773820
]
774821

775822
_QPARAM_OPS = [

0 commit comments

Comments
 (0)