NVIDIA
diff --git a/tensorrt_llm/_torch/auto_deploy/config/default.yaml b/tensorrt_llm/_torch/auto_deploy/config/default.yaml
Lines changed: 2 additions & 0 deletions
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/int4.py.bak b/tensorrt_llm/_torch/auto_deploy/custom_ops/int4.py.bak
Lines changed: 330 additions & 0 deletions
@@ -45,6 +45,8 @@ transforms:
   # see https://github.com/NVIDIA/TensorRT-LLM/pull/3668#discussion_r2052714528
   optimize_rope:
     stage: pattern_matcher
+  quantize_int4_from_graph:
+    stage: pattern_matcher
   quantize_fp8_linear_from_config:
     stage: pattern_matcher
   quantize_nvfp4_linear_from_config:
 
@@ -0,0 +1,330 @@
+from typing import Callable, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers.activations import ACT2FN
+from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
+from transformers.models.qwen2.modeling_qwen2 import apply_rotary_pos_emb, eager_attention_forward
+
+
+def _int4_unpack_out_packed_uint8_to_int8(packed: torch.Tensor) -> torch.Tensor:
+    """
+    Unpack weights packed along OUT dimension.
+    Input:  packed shape (out//2, in), dtype=uint8
+    Output: unpacked int8 shape (out, in), values in [-8, 7]
+    Mapping follows quantize: byte = (hi<<4) | lo with offsets in [0..15] where real = v - 8.
+    """
+    assert packed.dtype == torch.uint8, "Expected packed INT4 weights in uint8."
+    hi = (packed >> 4).to(torch.int16) - 8  # [-8..7]
+    lo = (packed & 0x0F).to(torch.int16) - 8
+    # Interleave rows: even rows from hi, odd rows from lo
+    out_packed, in_features = packed.shape
+    out_full = out_packed * 2
+    out = torch.empty((out_full, in_features), dtype=torch.int16, device=packed.device)
+    out[0::2, :] = hi
+    out[1::2, :] = lo
+    return out.to(torch.int8)
+
+
+def _expand_scales_to_columns(
+    weight_scale: torch.Tensor, block_size: int, in_features: int
+) -> torch.Tensor:
+    """
+    Expand per-block scales to per-column scales.
+    weight_scale: (out_features, in_features // block_size)
+    returns:      (out_features, in_features)
+    """
+    assert weight_scale.dim() == 2, "weight_scale should be (out, in//block_size)"
+    assert in_features % block_size == 0, "in_features must be divisible by block_size"
+    return weight_scale.repeat_interleave(block_size, dim=1)
+
+
+def _int4_awq_linear_fallback_bak(
+    x: torch.Tensor,
+    packed_weight: torch.Tensor,  # (out//2, in), uint8
+    weight_scale: torch.Tensor,  # (out, in//bs), float
+    bias: Optional[torch.Tensor],  # (out,)
+    pre_quant_scale: Optional[torch.Tensor],  # (in,) or None
+    block_size: int = 128,
+) -> torch.Tensor:
+    """
+    Pure PyTorch fallback for INT4-AWQ fake-quant linear:
+      y = (x * pre_quant_scale) @ dequant(W).T + bias
+    where dequant(W) applies per-(out, input_block) scales.
+    """
+    x_dtype = x.dtype
+    if pre_quant_scale is not None:
+        x = x * pre_quant_scale.to(x_dtype)
+
+    # Unpack packed rows (out//2, in) -> (out, in) int8 in [-8..7]
+    W_i8 = _int4_unpack_out_packed_uint8_to_int8(packed_weight)  # (out, in)
+    out_features, in_features = W_i8.shape
+    # Expand scales to per-column
+    scales_cols = _expand_scales_to_columns(weight_scale, block_size, in_features)  # (out, in)
+    W = (W_i8.to(torch.float32) / scales_cols.to(torch.float32)).to(x_dtype)  # dequantized
+    # Linear: x @ W^T + b
+    y = F.linear(x, W, bias)
+    return y
+
+
+def _int4_awq_linear_fallback(
+    x: torch.Tensor,
+    packed_weight: torch.Tensor,  # (out//2, in), uint8
+    weight_scale: torch.Tensor,  # (out, in//bs), float
+    bias: Optional[torch.Tensor],  # (out,)
+    pre_quant_scale: Optional[torch.Tensor],  # (in,) or None
+    block_size: int = 128,
+) -> torch.Tensor:
+    x_dtype = x.dtype
+    out_features = packed_weight.shape[0] * 2
+    in_features = packed_weight.shape[1]
+
+    scale_quant_maxbound = 2 ** (4 - 1) - 1
+    first_half = (packed_weight >> 4).to(torch.long) - (scale_quant_maxbound + 1)
+    second_half = (packed_weight & 0x0F).to(torch.long) - (scale_quant_maxbound + 1)
+
+    # de-quantize tensor
+    first_half = first_half.view(-1, block_size // 2) / weight_scale.view(-1, 1)
+    second_half = second_half.view(-1, block_size // 2) / weight_scale.view(-1, 1)
+
+    # merge the interleaving elements
+    first_half = first_half.flatten().unsqueeze(-1).transpose(0, 1)
+    second_half = second_half.flatten().unsqueeze(-1).transpose(0, 1)
+
+    W = (
+        torch.stack([first_half, second_half], dim=-1)
+        .view(-1)[: (out_features * in_features)]
+        .reshape(out_features, in_features)
+        .to(x_dtype)
+    )
+
+    # return the *projected* activations
+    return F.linear(x, W, bias)
+
+
+class Int4LinearAWQ(nn.Module):
+    """
+    Linear layer that consumes AWQ INT4 checkpoint tensors.
+
+    Buffers/params created with exact names so load_state_dict can map:
+      - weight:        uint8  (out//2, in)   <-- packed int4 (two rows per byte)
+      - weight_scale:  float  (out, in//bs)  <-- per-block scale
+      - pre_quant_scale (optional): (in,)    <-- per-input scale
+      - bias:          (out,) if present
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool,
+        block_size: int = 128,
+        has_pqs: bool = False,
+        pqs_dtype: torch.dtype = torch.bfloat16,
+    ):
+        super().__init__()
+        self.in_features = int(in_features)
+        self.out_features = int(out_features)
+        self.block_size = int(block_size)
+
+        # Buffers get overwritten by load_state_dict:
+        # Use correctly-sized placeholders to avoid shape mismatch.
+        packed_shape = (self.out_features // 2, self.in_features)
+        scale_shape = (self.out_features, self.in_features // self.block_size)
+
+        self.register_buffer(
+            "weight", torch.empty(packed_shape, dtype=torch.uint8, device="cuda"), persistent=True
+        )
+        self.register_buffer(
+            "weight_scale",
+            torch.empty(scale_shape, dtype=torch.float32, device="cuda"),
+            persistent=True,
+        )
+        if has_pqs:
+            # allocate with CORRECT shape to satisfy the checkpoint
+            self.register_buffer(
+                "pre_quant_scale",
+                torch.empty(self.in_features, dtype=pqs_dtype, device="cuda"),
+                persistent=True,
+            )
+        else:
+            # truly optional: zero-length placeholder (no ckpt entry expected)
+            self.register_buffer(
+                "pre_quant_scale", torch.empty(0, dtype=pqs_dtype, device="cuda"), persistent=True
+            )
+
+        if bias:
+            self.bias = nn.Parameter(
+                torch.zeros(self.out_features, dtype=torch.bfloat16, device="cuda")
+            )
+        else:
+            self.register_parameter("bias", None)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # dequant in PyTorch
+        pqs = self.pre_quant_scale if self.pre_quant_scale.numel() != 0 else None
+        return _int4_awq_linear_fallback(
+            x, self.weight, self.weight_scale, self.bias, pqs, self.block_size
+        )
+
+
+class Qwen2MLP_INT4(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.act_fn = ACT2FN[config.hidden_act]
+
+        # gate/up: no bias in original MLP
+        self.gate_proj = Int4LinearAWQ(
+            self.hidden_size, self.intermediate_size, bias=False, block_size=128
+        )
+        self.up_proj = Int4LinearAWQ(
+            self.hidden_size, self.intermediate_size, bias=False, block_size=128
+        )
+        # down_proj has a pre_quant_scale in your checkpoint
+        self.down_proj = Int4LinearAWQ(
+            self.intermediate_size, self.hidden_size, bias=False, block_size=128, has_pqs=True
+        )
+
+    def forward(self, x):
+        # (x * up) ⊙ act(x * gate) -> down
+        up = self.up_proj(x)
+        gate = self.gate_proj(x)
+        y = self.act_fn(gate) * up
+        return self.down_proj(y)
+
+
+class Qwen2Attention_INT4(nn.Module):
+    """Patched attention using INT4 AWQ linear ops; preserves original shapes/logic."""
+
+    def __init__(self, config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.sliding_window = (
+            config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None
+        )
+
+        # q/k/v with bias in your checkpoint
+        self.q_proj = Int4LinearAWQ(
+            config.hidden_size,
+            config.num_attention_heads * self.head_dim,
+            bias=True,
+            block_size=128,
+        )
+        self.k_proj = Int4LinearAWQ(
+            config.hidden_size,
+            config.num_key_value_heads * self.head_dim,
+            bias=True,
+            block_size=128,
+        )
+        self.v_proj = Int4LinearAWQ(
+            config.hidden_size,
+            config.num_key_value_heads * self.head_dim,
+            bias=True,
+            block_size=128,
+        )
+        # o_proj without bias; has pre_quant_scale in your checkpoint
+        self.o_proj = Int4LinearAWQ(
+            config.num_attention_heads * self.head_dim,
+            config.hidden_size,
+            bias=False,
+            block_size=128,
+            has_pqs=True,
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        bsz, q_len, _ = hidden_states.size()
+        hidden_shape = (bsz, q_len, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=self.sliding_window,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+def add_int4_awq_patch(model: nn.Module, block_size: int = 128) -> nn.Module:
+    """
+    Replace Qwen2Attention / Qwen2MLP with INT4-AWQ variants that expose the exact
+    (weight / weight_scale / pre_quant_scale / bias) tensor names expected by ModelOpt INT4 checkpoint.
+
+    Args:
+        model: A transformers Qwen2 model (Qwen2Model or Qwen2ForCausalLM, etc.)
+        block_size: INT4 AWQ group size (128)
+    """
+    # Find the "layers" stack — typical HF layout: model.model.layers
+    # Adjust if your model wraps differently.
+    layers = None
+    if hasattr(model, "model") and hasattr(model.model, "layers"):
+        layers = model.model.layers
+    elif hasattr(model, "layers"):
+        layers = model.layers
+    else:
+        raise AttributeError(
+            "Cannot locate 'layers' in the provided model. Expected model.model.layers or model.layers"
+        )
+
+    # Patch each transformer block’s attention and MLP
+    for idx, layer in enumerate(layers):
+        # ATTENTION
+        if hasattr(layer, "self_attn"):
+            cfg = model.config
+            # Re-create with the same layer index to preserve rope/sliding-window selection logic
+            attn_int4 = Qwen2Attention_INT4(cfg, layer_idx=idx)
+            # carry over dropout/training flags if needed (no state to copy here)
+            layer.self_attn = attn_int4
+
+            # Update block size if user passes a different one
+            layer.self_attn.q_proj.block_size = block_size
+            layer.self_attn.k_proj.block_size = block_size
+            layer.self_attn.v_proj.block_size = block_size
+            layer.self_attn.o_proj.block_size = block_size
+
+        # MLP
+        if hasattr(layer, "mlp"):
+            mlp_int4 = Qwen2MLP_INT4(model.config)
+            layer.mlp = mlp_int4
+            layer.mlp.gate_proj.block_size = block_size
+            layer.mlp.up_proj.block_size = block_size
+            layer.mlp.down_proj.block_size = block_size
+
+    return model