Add option to disable symmetric INT4 quantization (#1053)
### Description

This PR adds `int4_is_symmetric` to the `--extra_options` to control
INT4 quantization. Here's how this extra option can be used:
`--extra_options int4_is_symmetric=<true/True/1 | false/False/0>`.
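
For example, a full builder invocation might look like the following. The model name, output path, and execution provider are placeholders, and the remaining flags follow the model builder's usual CLI, so adjust them to your setup:

```bash
# Build an INT4 model with asymmetric (uint4) weight quantization.
python3 builder.py \
    -m microsoft/Phi-3.5-mini-instruct \
    -o ./phi3.5-int4-asymmetric \
    -p int4 \
    -e cpu \
    --extra_options int4_is_symmetric=false
```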

### Motivation and Context

This PR fixes [this issue](#1051).
kunal-vaishnavi authored Nov 22, 2024
1 parent 02feea3 commit 17061e0
Showing 1 changed file with 17 additions and 6 deletions.
23 changes: 17 additions & 6 deletions src/python/py/models/builder.py
@@ -54,12 +54,11 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
self.nodes = []

# EP-specific variables
enable_cuda_graph = extra_options.get("enable_cuda_graph", "0")
self.ep = ep
self.ep_attrs = {
"cpu": {},
"cuda": {
"enable_cuda_graph": enable_cuda_graph, # "1" if the the model is able to enable cuda graph, "0" otherwise
"enable_cuda_graph": extra_options.get("enable_cuda_graph", "0"), # "1" if the model is able to enable cuda graph, "0" otherwise
},
"rocm": {
"tunable_op_enable": "1",
@@ -295,6 +294,7 @@ def __init__(self, config, io_dtype, onnx_dtype, ep, cache_dir, extra_options):
"int4": {
"accuracy_level": int(extra_options.get("int4_accuracy_level", 0)), # Default is 0 for non-QDQ formats, default is 4 for QDQ formats
"block_size": int(extra_options.get("int4_block_size", 32)),
"is_symmetric": extra_options.get("int4_is_symmetric", True),
"op_types_to_quantize": extra_options.get("int4_op_types_to_quantize", ("MatMul", )),
},
"use_qdq": False, # Use QDQ format
@@ -464,7 +464,7 @@ def to_int4(self, model):
quant = MatMul4BitsQuantizer(
model=model,
block_size=self.quant_attrs["int4"]["block_size"],
is_symmetric=True,
is_symmetric=self.quant_attrs["int4"]["is_symmetric"],
accuracy_level=self.quant_attrs["int4"]["accuracy_level"],
nodes_to_exclude=[],
quant_format=QuantFormat.QDQ if self.quant_attrs["use_qdq"] else QuantFormat.QOperator,
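
The `is_symmetric` flag passed here is what decides between signed int4 (zero point fixed at 0) and uint4 (with a per-block zero point). Below is a conceptual sketch of quantizing one weight block both ways; it is illustrative only, not what `MatMul4BitsQuantizer` does internally, and the variable names are mine:

```python
import numpy as np

w = np.random.randn(32).astype(np.float32)  # one weight block (block_size = 32)

# Symmetric: zero point is fixed at 0, values map to signed int4 in [-8, 7]
scale_sym = np.abs(w).max() / 7.0
q_sym = np.clip(np.round(w / scale_sym), -8, 7)
deq_sym = q_sym * scale_sym

# Asymmetric: a zero point shifts the range so uint4 [0, 15] covers [w.min(), w.max()]
scale_asym = (w.max() - w.min()) / 15.0
zero_point = np.clip(np.round(-w.min() / scale_asym), 0, 15)
q_asym = np.clip(np.round(w / scale_asym) + zero_point, 0, 15)
deq_asym = (q_asym - zero_point) * scale_asym

print("max abs error, symmetric: ", np.abs(w - deq_sym).max())
print("max abs error, asymmetric:", np.abs(w - deq_asym).max())
```
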
@@ -1562,7 +1562,6 @@ def make_attention(self, layer_id, attention, root_input, **kwargs):
self.layernorm_attrs["skip_input"] = f"{o_matmul_name if not o_bias_exists else o_add_name}/output_0"

def make_attention_unpacked(self, layer_id, attention, root_input, **kwargs):

qkv_proj = 'qkv_proj' if hasattr(attention, 'qkv_proj') else 'query_key_value'
qkv_linear = eval(f"attention.{qkv_proj}")

@@ -1699,6 +1698,7 @@ def make_mlp_unpacked_regular(self, layer_id, mlp, root_input):

mlp.up_proj = torch.nn.Linear(in_features=self.hidden_size, out_features=self.intermediate_size)
mlp.up_proj.weight = torch.nn.Parameter(packed_proj.weight[self.intermediate_size :, :])

# Delete original packed weights
del packed_proj
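
As context for the hunk above, the unpacking pattern splits a single packed projection into separate `Linear` modules and then deletes the packed weights. A toy sketch with made-up sizes; the `gate_proj`/`up_proj` split is an assumption based on the visible lines, not copied from builder.py:

```python
import torch

hidden_size, intermediate_size = 8, 16  # toy sizes; the builder uses self.hidden_size / self.intermediate_size
packed_proj = torch.nn.Linear(hidden_size, 2 * intermediate_size, bias=False)

gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
gate_proj.weight = torch.nn.Parameter(packed_proj.weight[: intermediate_size, :])  # first half

up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
up_proj.weight = torch.nn.Parameter(packed_proj.weight[intermediate_size :, :])    # second half

del packed_proj  # drop the packed weights once they have been split
```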

@@ -1991,7 +1991,7 @@ def make_lm_head(self, lm_head):

def make_layer(self, layer_id, layer):
# Each LLM decoder layer is typically defined as:
# input_layernorm --> attention --> MLP --> output_layernorm
# input_layernorm --> attention --> output_layernorm --> MLP
self.make_layernorm(layer_id, layer.input_layernorm, skip=not self.layernorm_attrs["first_layernorm"], simple=self.layernorm_attrs["simple"], location="input")
self.make_attention(layer_id, layer.self_attn, root_input=self.layernorm_attrs["output_0"])
self.make_layernorm(layer_id, layer.post_attention_layernorm, skip=True, simple=self.layernorm_attrs["simple"], location="post_attention")
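
For readers unfamiliar with the layout the corrected comment describes, here is a minimal PyTorch sketch of that ordering; the module and argument names are assumed for illustration and are not taken from builder.py:

```python
import torch

class DecoderLayerSketch(torch.nn.Module):
    def __init__(self, hidden_size, self_attn, mlp):
        super().__init__()
        self.input_layernorm = torch.nn.LayerNorm(hidden_size)
        self.post_attention_layernorm = torch.nn.LayerNorm(hidden_size)
        self.self_attn = self_attn
        self.mlp = mlp

    def forward(self, hidden_states):
        # input_layernorm --> attention (+ residual)
        hidden_states = hidden_states + self.self_attn(self.input_layernorm(hidden_states))
        # output_layernorm --> MLP (+ residual)
        return hidden_states + self.mlp(self.post_attention_layernorm(hidden_states))

# Stand-in attention/MLP modules, just to show the data flow end to end.
layer = DecoderLayerSketch(16, self_attn=torch.nn.Linear(16, 16), mlp=torch.nn.Linear(16, 16))
out = layer(torch.randn(2, 4, 16))
```
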
@@ -3041,6 +3041,15 @@ def make_layer(self, layer_id, layer):
super().make_layer(layer_id, layer)

def check_extra_options(kv_pairs):
"""
Check key-value pairs and set values correctly
"""
if "int4_is_symmetric" in kv_pairs:
if kv_pairs["int4_is_symmetric"] in {"false", "False", "0"}:
kv_pairs["int4_is_symmetric"] = False
elif kv_pairs["int4_is_symmetric"] in {"true", "True", "1"}:
kv_pairs["int4_is_symmetric"] = True

if "int4_op_types_to_quantize" in kv_pairs:
op_types_to_quantize = ()
for op_type in kv_pairs["int4_op_types_to_quantize"].split("/"):
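
To make the `int4_is_symmetric` coercion above concrete, here is a self-contained mirror of that logic with a couple of example inputs (the helper name is mine; any spelling other than the listed true/false values is left untouched as a string):

```python
def coerce_int4_is_symmetric(kv_pairs):
    # Same spellings as the builder accepts.
    if "int4_is_symmetric" in kv_pairs:
        if kv_pairs["int4_is_symmetric"] in {"false", "False", "0"}:
            kv_pairs["int4_is_symmetric"] = False
        elif kv_pairs["int4_is_symmetric"] in {"true", "True", "1"}:
            kv_pairs["int4_is_symmetric"] = True
    return kv_pairs

print(coerce_int4_is_symmetric({"int4_is_symmetric": "false"}))  # {'int4_is_symmetric': False}
print(coerce_int4_is_symmetric({"int4_is_symmetric": "1"}))      # {'int4_is_symmetric': True}
```
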
@@ -3056,7 +3065,7 @@ def check_extra_options(kv_pairs):

def parse_extra_options(kv_items):
"""
Parse key value pairs that are separated by '='
Parse key-value pairs that are separated by '='
"""
kv_pairs = {}
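
A minimal sketch of the `key=value` convention this function implements, assuming each item is split on the first `=`; the helper name is mine and the exact splitting details may differ from the builder's code:

```python
def parse_extra_options_sketch(kv_items):
    kv_pairs = {}
    for kv in kv_items or []:
        key, value = kv.split("=", 1)  # split on the first '=' only
        kv_pairs[key] = value
    return kv_pairs

print(parse_extra_options_sketch(["int4_is_symmetric=false", "int4_block_size=64"]))
# {'int4_is_symmetric': 'false', 'int4_block_size': '64'}
```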

@@ -3230,6 +3239,8 @@ def get_args():
2 is fp16.
1 is fp32.
int4_block_size = 16/32/64/128/256: Specify the block_size for int4 quantization.
int4_is_symmetric = Quantize the weights symmetrically. Default is true.
If true, quantization is done to int4. If false, quantization is done to uint4.
int4_op_types_to_quantize = MatMul/Gather: Specify op types to target for int4 quantization.
Use this option when you want to quantize specific ops.
Separate the op types with a '/' when passing them here (e.g. int4_op_types_to_quantize=MatMul/Gather)
