Add option to disable symmetric INT4 quantization (#1053)
### Description

This PR adds `int4_is_symmetric` to the `--extra_options` to control
INT4 quantization. Here's how this extra option can be used:
`--extra_options int4_is_symmetric=<true/True/1 | false/False/0>`.
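
For example, a full builder invocation might look like the following. The model name, output path, and execution provider are placeholders, and the remaining flags follow the model builder's usual CLI, so adjust them to your setup:

```bash
# Build an INT4 model with asymmetric (uint4) weight quantization.
python3 builder.py \
    -m microsoft/Phi-3.5-mini-instruct \
    -o ./phi3.5-int4-asymmetric \
    -p int4 \
    -e cpu \
    --extra_options int4_is_symmetric=false
```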

### Motivation and Context

This PR fixes [this issue](#1051).
kunal-vaishnavi authored Nov 22, 2024
1 parent 02feea3 commit 17061e0
Showing 1 changed file with 17 additions and 6 deletions.
23 changes: 17 additions & 6 deletions src/python/py/models/builder.py
@@ -54,12 +54,11 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
self.nodes = []

# EP-specific variables
enable_cuda_graph = extra_options.get("enable_cuda_graph", "0")
self.ep = ep
self.ep_attrs = {
"cpu": {},
"cuda": {
"enable_cuda_graph": enable_cuda_graph, # "1" if the the model is able to enable cuda graph, "0" otherwise
"enable_cuda_graph": extra_options.get("enable_cuda_graph", "0"), # "1" if the model is able to enable cuda graph, "0" otherwise
},
"rocm": {
"tunable_op_enable": "1",
@@ -295,6 +294,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
"int4": {
"accuracy_level": int(extra_options.get("int4_accuracy_level", 0)), # Default is 0 for non-QDQ formats, default is 4 for QDQ formats
"block_size": int(extra_options.get("int4_block_size", 32)),
"is_symmetric": extra_options.get("int4_is_symmetric", True),
"op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )),
},
"use_qdq": False, # Use QDQ format
@@ -464,7 +464,7 @@ def to_int4(self, model):
quant = MatMul4BitsQuantizer(
model=model,
block_size=self.quant_attrs["int4"]["block_size"],
is_symmetric=True,
is_symmetric=self.quant_attrs["int4"]["is_symmetric"],
accuracy_level=self.quant_attrs["int4"]["accuracy_level"],
nodes_to_exclude=[],
quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
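
The `is_symmetric` flag passed here is what decides between signed int4 (zero point fixed at 0) and uint4 (with a per-block zero point). Below is a conceptual sketch of quantizing one weight block both ways; it is illustrative only, not what `MatMul4BitsQuantizer` does internally, and the variable names are mine:

```python
import numpy as np

w = np.random.randn(32).astype(np.float32)  # one weight block (block_size = 32)

# Symmetric: zero point is fixed at 0, values map to signed int4 in [-8, 7]
scale_sym = np.abs(w).max() / 7.0
q_sym = np.clip(np.round(w / scale_sym), -8, 7)
deq_sym = q_sym * scale_sym

# Asymmetric: a zero point shifts the range so uint4 [0, 15] covers [w.min(), w.max()]
scale_asym = (w.max() - w.min()) / 15.0
zero_point = np.clip(np.round(-w.min() / scale_asym), 0, 15)
q_asym = np.clip(np.round(w / scale_asym) + zero_point, 0, 15)
deq_asym = (q_asym - zero_point) * scale_asym

print("max abs error, symmetric: ", np.abs(w - deq_sym).max())
print("max abs error, asymmetric:", np.abs(w - deq_asym).max())
```
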
@@ -1562,7 +1562,6 @@ def make_attention(self, layer_id, attention, root_input, **kwargs):
self.layernorm_attrs["skip_input"] = f"{o_matmul_name if not o_bias_exists else o_add_name}/output_0"

def make_attention_unpacked(self, layer_id, attention, root_input, **kwargs):

qkv_proj = 'qkv_proj' if hasattr(attention, 'qkv_proj') else 'query_key_value'
qkv_linear = eval(f"attention.{qkv_proj}")

@@ -1699,6 +1698,7 @@ def make_mlp_unpacked_regular(self, layer_id, mlp, root_input):

mlp.up_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size)
mlp.up_proj.weight = torch.nn.Parameter(packed_proj.weight[self.intermediate_size :, :])

# Delete original packed weights
del packed_proj
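
As context for the hunk above, the unpacking pattern splits a single packed projection into separate `Linear` modules and then deletes the packed weights. A toy sketch with made-up sizes; the `gate_proj`/`up_proj` split is an assumption based on the visible lines, not copied from builder.py:

```python
import torch

hidden_size, intermediate_size = 8, 16  # toy sizes; the builder uses self.hidden_size / self.intermediate_size
packed_proj = torch.nn.Linear(hidden_size, 2 * intermediate_size, bias=False)

gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
gate_proj.weight = torch.nn.Parameter(packed_proj.weight[: intermediate_size, :])  # first half

up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
up_proj.weight = torch.nn.Parameter(packed_proj.weight[intermediate_size :, :])    # second half

del packed_proj  # drop the packed weights once they have been split
```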

@@ -1991,7 +1991,7 @@ def make_lm_head(self, lm_head):

def make_layer(self, layer_id, layer):
# Each LLM decoder layer is typically defined as:
# input_layernorm --> attention --> MLP --> output_layernorm
# input_layernorm --> attention --> output_layernorm --> MLP
self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input")
self.make_attention(layer_id, layer.self_attn, root_input=self.layernorm_attrs["output_0"])
self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention")
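
For readers unfamiliar with the layout the corrected comment describes, here is a minimal PyTorch sketch of that ordering; the module and argument names are assumed for illustration and are not taken from builder.py:

```python
import torch

class DecoderLayerSketch(torch.nn.Module):
    def __init__(self, hidden_size, self_attn, mlp):
        super().__init__()
        self.input_layernorm = torch.nn.LayerNorm(hidden_size)
        self.post_attention_layernorm = torch.nn.LayerNorm(hidden_size)
        self.self_attn = self_attn
        self.mlp = mlp

    def forward(self, hidden_states):
        # input_layernorm --> attention (+ residual)
        hidden_states = hidden_states + self.self_attn(self.input_layernorm(hidden_states))
        # output_layernorm --> MLP (+ residual)
        return hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))

# Stand-in attention/MLP modules, just to show the data flow end to end.
layer = DecoderLayerSketch(16, self_attn=torch.nn.Linear(16, 16), mlp=torch.nn.Linear(16, 16))
out = layer(torch.randn(2, 4, 16))
```
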
@@ -3041,6 +3041,15 @@ def make_layer(self, layer_id, layer):
super().make_layer(layer_id, layer)

def check_extra_options(kv_pairs):
"""
Check key-value pairs and set values correctly
"""
if "int4_is_symmetric" in kv_pairs:
if kv_pairs["int4_is_symmetric"] in {"false", "False", "0"}:
kv_pairs["int4_is_symmetric"] = False
elif kv_pairs["int4_is_symmetric"] in {"true", "True", "1"}:
kv_pairs["int4_is_symmetric"] = True

if "int4_op_types_to_quantize" in kv_pairs:
op_types_to_quantize = ()
for op_type in kv_pairs["int4_op_types_to_quantize"].split("/"):
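
To make the `int4_is_symmetric` coercion above concrete, here is a self-contained mirror of that logic with a couple of example inputs (the helper name is mine; any spelling other than the listed true/false values is left untouched as a string):

```python
def coerce_int4_is_symmetric(kv_pairs):
    # Same spellings as the builder accepts.
    if "int4_is_symmetric" in kv_pairs:
        if kv_pairs["int4_is_symmetric"] in {"false", "False", "0"}:
            kv_pairs["int4_is_symmetric"] = False
        elif kv_pairs["int4_is_symmetric"] in {"true", "True", "1"}:
            kv_pairs["int4_is_symmetric"] = True
    return kv_pairs

print(coerce_int4_is_symmetric({"int4_is_symmetric": "false"}))  # {'int4_is_symmetric': False}
print(coerce_int4_is_symmetric({"int4_is_symmetric": "1"}))      # {'int4_is_symmetric': True}
```
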
@@ -3056,7 +3065,7 @@ def check_extra_options(kv_pairs):

def parse_extra_options(kv_items):
"""
Parse key value pairs that are separated by '='
Parse key-value pairs that are separated by '='
"""
kv_pairs = {}
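
A minimal sketch of the `key=value` convention this function implements, assuming each item is split on the first `=`; the helper name is mine and the exact splitting details may differ from the builder's code:

```python
def parse_extra_options_sketch(kv_items):
    kv_pairs = {}
    for kv in kv_items or []:
        key, value = kv.split("=", 1)  # split on the first '=' only
        kv_pairs[key] = value
    return kv_pairs

print(parse_extra_options_sketch(["int4_is_symmetric=false", "int4_block_size=64"]))
# {'int4_is_symmetric': 'false', 'int4_block_size': '64'}
```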

@@ -3230,6 +3239,8 @@ def get_args():
2 is fp16.
1 is fp32.
int4_block_size = 16/32/64/128/256: Specify the block_size for int4 quantization.
int4_is_symmetric = Quantize the weights symmetrically. Default is true.
If true, quantization is done to int4. If false, quantization is done to uint4.
int4_op_types_to_quantize = MatMul/Gather: Specify op types to target for int4 quantization.
Use this option when you want to quantize specific ops.
Separate the op types with a '/' when passing them here (e.g. int4_op_types_to_quantize=MatMul/Gather)
