diff --git a/tests/jax/test_custom_call_compute.py b/tests/jax/test_custom_call_compute.py
index 613aefc178..9fddbc435c 100644
--- a/tests/jax/test_custom_call_compute.py
+++ b/tests/jax/test_custom_call_compute.py
@@ -36,6 +36,7 @@
     ScaledTensor1x,
     ScaledTensor2x,
     GroupedScaledTensor1x,
+    GroupedNoScaleTensor,
     ScalingMode,
     QuantizerFactory,
     QuantizeLayout,
@@ -1787,13 +1788,18 @@ def test_grouped_gemm_fp16(self, dtype, input_shape, layout):
         ref_out = self._ref_grouped_dense(lhs, rhs, None, group_sizes, contracting_dims)
 
         # jitting grouped_gemm
+        lhs_tensor = GroupedNoScaleTensor(
+            data=lhs, first_dims=group_sizes, last_dims=None, group_axis=0, original_shape=lhs.shape
+        )
+        rhs_tensor = GroupedNoScaleTensor(
+            data=rhs, first_dims=None, last_dims=None, group_axis=0, original_shape=rhs.shape
+        )
         prim_out = jax.jit(
             tex.grouped_gemm, static_argnames=("contracting_dims", "use_async_d2h_group_sizes")
         )(
-            lhs,
-            rhs,
-            group_sizes,
-            contracting_dims,
+            lhs_tensor,
+            rhs_tensor,
+            contracting_dims=contracting_dims,
             use_async_d2h_group_sizes=True,
         )
 
@@ -1825,8 +1831,17 @@ def test_grouped_gemm_fp8(self, fwd_bwd_dtype, scaling_mode, input_shape, layout
         )
         ref_out = self._ref_grouped_dense(lhs, rhs, None, group_sizes, contracting_dims)
 
+        lhs_tensor = GroupedNoScaleTensor(
+            data=lhs, first_dims=group_sizes, last_dims=None, group_axis=0, original_shape=lhs.shape
+        )
+        rhs_tensor = GroupedNoScaleTensor(
+            data=rhs, first_dims=None, last_dims=None, group_axis=0, original_shape=rhs.shape
+        )
         prim_out = jax.jit(tex.grouped_gemm, static_argnames=("contracting_dims",))(
-            lhs, rhs, group_sizes, contracting_dims, quantizer_set=quantizer_set
+            lhs_tensor,
+            rhs_tensor,
+            contracting_dims=contracting_dims,
+            quantizer_set=quantizer_set,
         )
 
         allclose_dtype = jnp.float8_e4m3fn
diff --git a/transformer_engine/jax/cpp_extensions/gemm.py b/transformer_engine/jax/cpp_extensions/gemm.py
index 4506adf33b..c86cb1db55 100644
--- a/transformer_engine/jax/cpp_extensions/gemm.py
+++ b/transformer_engine/jax/cpp_extensions/gemm.py
@@ -37,6 +37,7 @@
     ScaledTensor1x,
     ScaledTensor2x,
     GroupedScaledTensor1x,
+    GroupedNoScaleTensor,
     ScalingMode,
     Quantizer,
     GroupedQuantizer,
@@ -1331,17 +1332,47 @@ def impl(
 register_primitive(GroupedGemmCopySizesPrimitive)
 
 
+def _assert_grouped_gemm_dims_shapes(
+    lhs_first_dims_aval,
+    lhs_last_dims_aval,
+    rhs_first_dims_aval,
+    rhs_last_dims_aval,
+    out_first_dims_aval,
+    out_last_dims_aval,
+    num_groups: int,
+) -> None:
+    """Assert that every non-empty ``*_dims`` aval has exactly ``num_groups`` elements.
+
+    Size-0 avals are skipped; any other size mismatch raises AssertionError naming the
+    offending array. Only the array length is checked, not the per-group sum: for the
+    ragged contracting K dimension (rhs_*_dims), K totals may leave buffer padding.
+    """
+    for name, aval in [
+        ("lhs_first_dims", lhs_first_dims_aval),
+        ("lhs_last_dims", lhs_last_dims_aval),
+        ("out_first_dims", out_first_dims_aval),
+        ("out_last_dims", out_last_dims_aval),
+        ("rhs_first_dims", rhs_first_dims_aval),
+        ("rhs_last_dims", rhs_last_dims_aval),
+    ]:
+        if aval.size > 0:  # size 0 is the "dimension not ragged" sentinel
+            assert (
+                aval.size == num_groups
+            ), f"grouped GEMM {name} has size {aval.size}, expected num_groups={num_groups}"
+
+
 class GroupedGemmPrimitive(BasePrimitive):
     """
     Primitive for grouped GEMM using nvte_multi_tensor_gemm (supports all scaling modes) or nvte_grouped_gemm (supporting BF16).
     """
 
-    # args = lhs_data, lhs_scale_inv, rhs_data, rhs_scale_inv, bias, group_sizes, group_offset, unused_placeholder
     name = "te_grouped_gemm_ffi"
-    # args = lhs_data, lhs_scale_inv, rhs_data, rhs_scale_inv, bias, group_sizes, alpha, beta
+    # args = lhs_data, lhs_scale_inv, rhs_data, rhs_scale_inv, bias,
+    #        lhs_first_dims, lhs_last_dims, rhs_first_dims, rhs_last_dims,
+    #        out_first_dims, out_last_dims, alpha, beta
     name_graph_safe = "te_grouped_gemm_v2_ffi"
     multiple_results = True
-    impl_static_args = (8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18)
+    impl_static_args = (13, 14, 15, 16, 17, 18, 19, 20, 21, 22)
     inner_primitive = None
     outer_primitive = None
 
@@ -1352,53 +1383,103 @@ def abstract(
         rhs_data_aval,
         rhs_scale_inv_aval,
         bias_aval,
-        group_sizes_aval,
+        lhs_first_dims_aval,
+        lhs_last_dims_aval,
+        rhs_first_dims_aval,
+        rhs_last_dims_aval,
+        out_first_dims_aval,
+        out_last_dims_aval,
         *additional_args,  # group_offset_aval, unused_placeholder OR alpha_aval, beta_aval
-        M,
-        N,
-        K,
         lhs_is_trans,
         rhs_is_trans,
         scaling_mode,
         out_dtype,
         has_bias,
-        is_grouped_dense_wgrad,
         use_async_d2h_group_sizes,
         use_v2_ffi,
+        lhs_axis_boundary,
+        rhs_axis_boundary,
+        rhs_group_axis,
     ):
         """
         Grouped GEMM operation.
 
         Args:
-            lhs_data: Left-hand side input matrix data, 1D flattened array
+            lhs_data: Left-hand side input matrix data, N-D array
             lhs_scale_inv: Left-hand side input scale_inv matrix, 1D flattened array
-            rhs_data: Right-hand side input matrix data, 1D flattened array
+            rhs_data: Right-hand side input matrix data, N-D array
             rhs_scale_inv: Right-hand side input scale_inv matrix, 1D flattened array
             bias: Bias matrix of shape (G, N)
-            group_sizes: 1D array containing the sizes of each group
+            lhs_first_dims, lhs_last_dims: (G,) int32 if that lhs dim is ragged, else (0,) sentinel
+            rhs_first_dims, rhs_last_dims: same for rhs; non-empty marks the ragged-K wgrad case
+            out_first_dims, out_last_dims: same for the output
             additional_args: Either
                 * group_offsets: 1D array containing offsets for each group (not yet implemented)
                 OR
                 * alpha: 1D array of shape (G,) containing alpha values for each group
                 * beta: 1D array of shape (G,) containing beta values for each group
-            M: Number of rows in the output matrix
-            N: Number of columns in the output matrix
-            K: Number of columns in the left-hand side matrix
             lhs_is_trans: Boolean indicating if the left-hand side matrix is transposed
             rhs_is_trans: Boolean indicating if the right-hand side matrix is transposed
             scaling_mode: Scaling mode for the GEMM operations
             out_dtype: Data type of the output tensors
             has_bias: Boolean indicating if bias tensors are provided
-            is_grouped_dense_wgrad: Boolean indicating if this is a grouped dense wgrad operation
-                                    where both lhs and rhs are 2D matrices and output is (G, M, N)
+            lhs_axis_boundary: Axis split point for lhs N-D → 2D flattening
+            rhs_axis_boundary: Axis split point for rhs N-D → 2D flattening
+            rhs_group_axis: Batch-group axis of rhs to exclude from output non-contracting dims
 
         Returns:
             A jnp.ndarray containing the result of the grouped GEMM operation
         """
-        del lhs_data_aval, rhs_data_aval, bias_aval
-        del K, lhs_is_trans, rhs_is_trans, has_bias, use_async_d2h_group_sizes
+        del bias_aval
+        del has_bias, use_async_d2h_group_sizes
+
+        num_groups = (
+            lhs_first_dims_aval.size
+            or lhs_last_dims_aval.size
+            or rhs_first_dims_aval.size
+            or rhs_last_dims_aval.size
+            or out_first_dims_aval.size
+            or out_last_dims_aval.size
+            or additional_args[0].size  # alpha (V2) has size G; group_offset (legacy) has size >= 1
+        )
+
+        _assert_grouped_gemm_dims_shapes(
+            lhs_first_dims_aval,
+            lhs_last_dims_aval,
+            rhs_first_dims_aval,
+            rhs_last_dims_aval,
+            out_first_dims_aval,
+            out_last_dims_aval,
+            num_groups,
+        )
+
+        # Derive output shape from N-D buffer shapes using axis_boundary.
+        lhs_shape = lhs_data_aval.shape
+        rhs_shape = rhs_data_aval.shape
+
+        # Non-contracting dims for lhs
+        if lhs_is_trans:
+            lhs_non_contracting = lhs_shape[lhs_axis_boundary:]
+        else:
+            lhs_non_contracting = lhs_shape[:lhs_axis_boundary]
+
+        # Non-contracting dims for rhs (excluding batch-group axis where applicable)
+        if rhs_is_trans:
+            rhs_non_contracting = tuple(
+                rhs_shape[d]
+                for d in range(rhs_axis_boundary)
+                if rhs_group_axis is None or d != rhs_group_axis
+            )
+        else:
+            rhs_non_contracting = rhs_shape[rhs_axis_boundary:]
 
-        num_groups = group_sizes_aval.size
+        # K validation is intentionally skipped: per-group K values may not fill the
+        # entire buffer (padding is allowed), so sum(rhs_*_dims) != buffer K is acceptable.
+        if rhs_first_dims_aval.size > 0 or rhs_last_dims_aval.size > 0:
+            # Wgrad case: rhs has ragged contracting K dimension → output gets G prefix.
+            out_shape = (num_groups, *lhs_non_contracting, *rhs_non_contracting)
+        else:
+            out_shape = (*lhs_non_contracting, *rhs_non_contracting)
 
         cublas_workspace_aval = jax.core.ShapedArray(
             shape=(
@@ -1409,9 +1490,6 @@ def abstract(
             dtype=jnp.uint8,
         )
 
-        out_shape = (M, N)
-        if is_grouped_dense_wgrad:
-            out_shape = (num_groups, M, N)
         out_aval = jax.core.ShapedArray(shape=out_shape, dtype=out_dtype)
 
         if use_v2_ffi:
@@ -1419,7 +1497,24 @@ def abstract(
                 shape=(get_grouped_gemm_setup_workspace_size(num_groups),), dtype=jnp.uint8
             )
             # Temporary buffer for int32 -> int64 conversion of group_sizes on device.
-            int64_workspace_size = num_groups * jnp.dtype(jnp.int64).itemsize
+            # Each non-empty *_dims buffer needs its own slot of num_groups int64 elements so that
+            # make_grouped_tensor can write to a distinct region per ragged dimension.  Allocate
+            # exactly as many slots as there are non-empty buffers (minimum 1 to avoid zero-size).
+            num_ragged_dim_buffers = sum(
+                1
+                for aval in [
+                    lhs_first_dims_aval,
+                    lhs_last_dims_aval,
+                    rhs_first_dims_aval,
+                    rhs_last_dims_aval,
+                    out_first_dims_aval,
+                    out_last_dims_aval,
+                ]
+                if aval.size > 0
+            )
+            int64_workspace_size = (
+                max(num_ragged_dim_buffers, 1) * num_groups * jnp.dtype(jnp.int64).itemsize
+            )
             int64_workspace_aval = jax.core.ShapedArray(
                 shape=(int64_workspace_size,), dtype=jnp.uint8
             )
@@ -1484,45 +1579,40 @@ def outer_abstract(*args, **kwargs):
     def lowering(
         ctx,
         *args,
-        M,
-        N,
-        K,
         lhs_is_trans,
         rhs_is_trans,
         scaling_mode,
         out_dtype,
         has_bias,
-        is_grouped_dense_wgrad,
         use_async_d2h_group_sizes,
         use_v2_ffi,
+        lhs_axis_boundary,
+        rhs_axis_boundary,
+        rhs_group_axis,
     ):
-        del out_dtype
+        del out_dtype, rhs_group_axis  # Python-only; not forwarded to C++
         if use_v2_ffi:
             ffi_name = GroupedGemmPrimitive.name_graph_safe
             return jax.ffi.ffi_lowering(ffi_name)(
                 ctx,
                 *args,
-                M=M,
-                N=N,
-                K=K,
                 lhs_is_trans=lhs_is_trans,
                 rhs_is_trans=rhs_is_trans,
                 scaling_mode=scaling_mode.value,
-                is_grouped_dense_wgrad=is_grouped_dense_wgrad,
+                lhs_axis_boundary=lhs_axis_boundary,
+                rhs_axis_boundary=rhs_axis_boundary,
             )
         ffi_name = GroupedGemmPrimitive.name
         return jax.ffi.ffi_lowering(ffi_name)(
             ctx,
             *args,
-            M=M,
-            N=N,
-            K=K,
             lhs_is_trans=lhs_is_trans,
             rhs_is_trans=rhs_is_trans,
             scaling_mode=scaling_mode.value,
             has_bias=has_bias,
-            is_grouped_dense_wgrad=is_grouped_dense_wgrad,
             use_async_d2h_group_sizes=use_async_d2h_group_sizes,
+            lhs_axis_boundary=lhs_axis_boundary,
+            rhs_axis_boundary=rhs_axis_boundary,
         )
 
     @staticmethod
@@ -1532,20 +1622,24 @@ def impl(
         rhs_data,
         rhs_scale_inv,
         bias,
-        group_sizes,
+        lhs_first_dims,
+        lhs_last_dims,
+        rhs_first_dims,
+        rhs_last_dims,
+        out_first_dims,
+        out_last_dims,
         additional_arg_0,  # group_offset (non-graph-safe) OR alpha (graph-safe)
         additional_arg_1,  # unused placeholder (non-graph-safe) OR beta (graph-safe)
-        M,
-        N,
-        K,
         lhs_is_trans,
         rhs_is_trans,
         scaling_mode,
         out_dtype,
         has_bias,
-        is_grouped_dense_wgrad,
         use_async_d2h_group_sizes,
         use_v2_ffi,
+        lhs_axis_boundary,
+        rhs_axis_boundary,
+        rhs_group_axis,
     ):
         if GroupedGemmPrimitive.inner_primitive is None:
             raise RuntimeError("GroupedGemmPrimitive.inner_primitive has not been registered")
@@ -1559,19 +1653,23 @@ def impl(
             rhs_data,
             rhs_scale_inv,
             bias,
-            group_sizes,
+            lhs_first_dims,
+            lhs_last_dims,
+            rhs_first_dims,
+            rhs_last_dims,
+            out_first_dims,
+            out_last_dims,
             *additional_args,
-            M=M,
-            N=N,
-            K=K,
             lhs_is_trans=lhs_is_trans,
             rhs_is_trans=rhs_is_trans,
             scaling_mode=scaling_mode,
             out_dtype=out_dtype,
             has_bias=has_bias,
-            is_grouped_dense_wgrad=is_grouped_dense_wgrad,
             use_async_d2h_group_sizes=use_async_d2h_group_sizes,
             use_v2_ffi=use_v2_ffi,
+            lhs_axis_boundary=lhs_axis_boundary,
+            rhs_axis_boundary=rhs_axis_boundary,
+            rhs_group_axis=rhs_group_axis,
         )
         return (out,)
 
@@ -1875,13 +1973,17 @@ def _can_use_v2_grouped_gemm(
     if not _v2_grouped_gemm_available:
         return False
 
+    # nvte_grouped_gemm (the v2 kernel) requires SM100+ (Blackwell or newer).
+    # Fall back to the v1 path on SM90 (Hopper) and older architectures.
+    if get_device_compute_capability(0) < 100:
+        return False
+
     return scaling_mode == ScalingMode.NO_SCALING and dtype == jnp.bfloat16 and not has_bias
 
 
 def grouped_gemm(
-    lhs: Union[jnp.ndarray, GroupedScaledTensor1x],
-    rhs: Union[jnp.ndarray, GroupedScaledTensor1x],
-    group_sizes: jnp.ndarray,
+    lhs: Union[GroupedNoScaleTensor, GroupedScaledTensor1x],
+    rhs: Union[GroupedNoScaleTensor, GroupedScaledTensor1x],
     contracting_dims: Tuple[Sequence[int], Sequence[int]] = ((1,), (2,)),
     bias: jnp.ndarray = None,
     precision: jax.lax.Precision = jax.lax.Precision.DEFAULT,
@@ -1894,9 +1996,8 @@ def grouped_gemm(
     Grouped GEMM operation.
 
     Args:
-        lhs: Left-hand side input matrix, can be a jnp.ndarray or GroupedScaledTensor1x
-        rhs: Right-hand side input matrix, can be a jnp.ndarray or GroupedScaledTensor1x
-        group_sizes: 1D array containing the sizes of each group
+        lhs: Left-hand side input matrix, GroupedNoScaleTensor or GroupedScaledTensor1x
+        rhs: Right-hand side input matrix, GroupedNoScaleTensor or GroupedScaledTensor1x
         contracting_dims: Tuple of two sequences representing the contracting dimensions
         bias: Bias tensor of shape (G, N)
         precision: JAX precision for the GEMM operation
@@ -1906,49 +2007,74 @@ def grouped_gemm(
 
     Returns:
         A jnp.ndarray containing the result of the grouped GEMM operation
-
-    Note:
-        Tested shapes:
-        lhs: [M, K] or [K, N]
-        rhs: [G, N, K] or [G, K, N] or [G * K, N] or [N, G * K]
     """
 
     # TODO(Phuong): implement the precision
     del precision
 
-    if isinstance(lhs, jnp.ndarray):
-        if not isinstance(rhs, jnp.ndarray):
-            raise TypeError(
-                f"Expected rhs to be jnp.ndarray when lhs is jnp.ndarray, but got type={type(rhs)}"
-            )
-        out_dtype = lhs.dtype
-        lhs_shape = lhs.shape
-        rhs_shape = rhs.shape
-        lhs_data = lhs
-        rhs_data = rhs
-        lhs_scale_inv = rhs_scale_inv = jnp.empty((0,), jnp.float32)
+    empty_gs = jnp.empty((0,), jnp.int32)
+
+    # Extract data, dims, and metadata from tensor objects.
+    if isinstance(lhs, GroupedNoScaleTensor):
+        lhs_data = lhs.data
+        lhs_shape = lhs.original_shape
+        lhs_scale_inv = jnp.empty((0,), jnp.float32)
         scaling_mode = ScalingMode.NO_SCALING
+        out_dtype = lhs.data.dtype
+        lhs_first_dims = lhs.first_dims if lhs.first_dims is not None else empty_gs
+        lhs_last_dims = lhs.last_dims if lhs.last_dims is not None else empty_gs
+        rhs_group_axis = getattr(rhs, "group_axis", 0)
     elif isinstance(lhs, GroupedScaledTensor1x):
-        if not isinstance(rhs, GroupedScaledTensor1x):
-            raise TypeError(
-                "Expected rhs to be GroupedScaledTensor1x when lhs is GroupedScaledTensor1x, but"
-                f" got type={type(rhs)}"
-            )
-        out_dtype = lhs.dq_dtype
         lhs_shape = lhs.original_shape
-        rhs_shape = rhs.original_shape
-        lhs_data = lhs.data
-        rhs_data = rhs.data
+        lhs_data = lhs.data.reshape(lhs_shape)
         lhs_scale_inv = lhs.scale_inv
+        scaling_mode = lhs.scaling_mode
+        out_dtype = lhs.dq_dtype
+        lhs_first_dims = lhs.first_dims if lhs.first_dims is not None else empty_gs
+        lhs_last_dims = lhs.last_dims if lhs.last_dims is not None else empty_gs
+        rhs_group_axis = getattr(rhs, "group_axis", 0)
+    else:
+        raise TypeError(
+            f"lhs must be GroupedNoScaleTensor or GroupedScaledTensor1x, got type={type(lhs)}"
+        )
+
+    if isinstance(rhs, GroupedNoScaleTensor):
+        rhs_data = rhs.data
+        rhs_shape = rhs.original_shape
+        rhs_scale_inv = jnp.empty((0,), jnp.float32)
+        rhs_first_dims = rhs.first_dims if rhs.first_dims is not None else empty_gs
+        rhs_last_dims = rhs.last_dims if rhs.last_dims is not None else empty_gs
+    elif isinstance(rhs, GroupedScaledTensor1x):
+        rhs_shape = rhs.original_shape
+        rhs_data = rhs.data.reshape(rhs_shape)
         rhs_scale_inv = rhs.scale_inv
-        if lhs.scaling_mode != rhs.scaling_mode:
+        rhs_first_dims = rhs.first_dims if rhs.first_dims is not None else empty_gs
+        rhs_last_dims = rhs.last_dims if rhs.last_dims is not None else empty_gs
+        if isinstance(lhs, GroupedScaledTensor1x) and lhs.scaling_mode != rhs.scaling_mode:
             raise ValueError(
                 f"Mismatched scaling modes: lhs.scaling_mode={lhs.scaling_mode},"
                 f" rhs.scaling_mode={rhs.scaling_mode}"
             )
-        scaling_mode = lhs.scaling_mode
+        if isinstance(lhs, GroupedScaledTensor1x):
+            scaling_mode = lhs.scaling_mode
     else:
-        raise TypeError("Unsupported lhs type object!")
+        raise TypeError(
+            f"rhs must be GroupedNoScaleTensor or GroupedScaledTensor1x, got type={type(rhs)}"
+        )
+
+    # Infer output dims from which operand has the ragged non-contracting dim.
+    if rhs_first_dims.size > 0 or rhs_last_dims.size > 0:
+        # Wgrad: rhs contracting dim is ragged → output is uniform (G prefix from num_groups)
+        out_first_dims = empty_gs
+        out_last_dims = empty_gs
+    elif lhs_first_dims.size > 0:
+        out_first_dims = lhs_first_dims
+        out_last_dims = empty_gs
+    elif lhs_last_dims.size > 0:
+        out_first_dims = empty_gs
+        out_last_dims = lhs_last_dims
+    else:
+        out_first_dims = out_last_dims = empty_gs
 
     out_dtype = preferred_element_type or out_dtype
 
@@ -1957,26 +2083,10 @@ def grouped_gemm(
     lhs_is_trans = lhs_contract_dim[-1] != len(lhs_shape) - 1
     lhs_flatten_axis = len(lhs_contract_dim) * (1 if lhs_is_trans else -1)
 
-    # rhs_shape [G, K, N]
-    rhs_is_trans = rhs_contract_dim[0] != 1
+    # rhs_is_trans: K is the last dim of rhs (i.e., rhs is in "T" layout).
+    rhs_is_trans = rhs_contract_dim[-1] == len(rhs_shape) - 1
     rhs_flatten_axis = -len(rhs_contract_dim) if rhs_is_trans else 1 + len(rhs_contract_dim)
 
-    is_grouped_dense_wgrad = False
-    if len(rhs_shape) == 2:
-        rhs_is_trans = rhs_contract_dim[0] != 0
-        is_grouped_dense_wgrad = True
-
-    # TODO(Hua): thses are for fp16 dense wgrad, any better way to handle this?
-    if (
-        is_grouped_dense_wgrad
-        and not isinstance(lhs, ScaledTensor)
-        and not isinstance(rhs, ScaledTensor)
-    ):
-        lhs_is_trans = True
-        rhs_is_trans = False
-        lhs_flatten_axis = 1
-        rhs_flatten_axis = 1
-
     if (
         not isinstance(lhs, ScaledTensor)
         and not isinstance(rhs, ScaledTensor)
@@ -2007,12 +2117,24 @@ def grouped_gemm(
         quantizer_set.kernel.q_layout = (
             QuantizeLayout.ROWWISE if rhs_is_rowwise else QuantizeLayout.COLWISE
         )
-        lhs_q = grouped_quantize(lhs, quantizer_set.x, group_sizes, lhs_flatten_axis)
+        active_group_sizes = next(
+            (
+                gs
+                for gs in [lhs_first_dims, lhs_last_dims, rhs_first_dims, rhs_last_dims]
+                if gs.size > 0
+            ),
+            empty_gs,
+        )
+        lhs_input_data = lhs.data if isinstance(lhs, GroupedNoScaleTensor) else lhs_data
+        rhs_input_data = rhs.data if isinstance(rhs, GroupedNoScaleTensor) else rhs_data
+        lhs_q = grouped_quantize(
+            lhs_input_data, quantizer_set.x, active_group_sizes, lhs_flatten_axis
+        )
         rhs_q = grouped_quantize(
-            rhs, quantizer_set.kernel, group_sizes=None, flatten_axis=rhs_flatten_axis
+            rhs_input_data, quantizer_set.kernel, group_sizes=None, flatten_axis=rhs_flatten_axis
         )
-        lhs_data = lhs_q.data
-        rhs_data = rhs_q.data
+        lhs_data = lhs_q.data.reshape(lhs_q.original_shape)
+        rhs_data = rhs_q.data.reshape(rhs_q.original_shape)
         lhs_scale_inv = lhs_q.scale_inv
         rhs_scale_inv = rhs_q.scale_inv
         lhs_shape = lhs_q.original_shape
@@ -2044,38 +2166,48 @@ def grouped_gemm(
             lhs_contract_dim = tuple((lhs_ndim - 1 - i) % lhs_ndim for i in lhs_contract_dim)
         if rhs_layout_is_T:
             # For rhs [G, K, N], need to exclude the G dim from contract_dim
-            if group_sizes.size == rhs_shape[0]:
+            if (
+                lhs_first_dims.size > 0 or lhs_last_dims.size > 0
+            ):  # fwd/dgrad: rhs has G as first dim
                 rhs_contract_dim = tuple(
                     (rhs_ndim - 1 - i) % (rhs_ndim - 1) + 1 for i in rhs_contract_dim
                 )
             else:
                 rhs_contract_dim = tuple((rhs_ndim - 1 - i) % rhs_ndim for i in rhs_contract_dim)
 
-    # Calling GroupedGEMM Custom Call
-    K_lhs = math.prod(lhs_shape[i] for i in lhs_contract_dim)
-    K_rhs = math.prod(rhs_shape[i] for i in rhs_contract_dim)
-    if K_lhs != K_rhs:
+    # Compute N-D axis boundaries from final (post-adjustment) contracting dims.
+    lhs_axis_boundary = get_lhs_axis_boundary(lhs_contract_dim, lhs_is_trans)
+    rhs_axis_boundary = get_rhs_axis_boundary(rhs_contract_dim, rhs_is_trans)
+
+    num_gemms = (
+        lhs_first_dims.size
+        or lhs_last_dims.size
+        or rhs_first_dims.size
+        or rhs_last_dims.size
+        or out_first_dims.size
+        or out_last_dims.size
+    )
+    if num_gemms == 0:
         raise ValueError(
-            f"Mismatched contracting dimensions: K_lhs={K_lhs}, K_rhs={K_rhs} (from"
-            f" lhs_shape={lhs_shape}, rhs_shape={rhs_shape})"
+            "grouped_gemm requires at least one non-empty dimension array. "
+            "Ensure lhs or rhs tensor objects carry first_dims or last_dims."
         )
-    M = math.prod(_calculate_remaining_shape(lhs_shape, lhs_contract_dim))
-    N = math.prod(_calculate_remaining_shape(rhs_shape, rhs_contract_dim)[1:])  # Exclude G
-
-    if is_grouped_dense_wgrad:
-        N = math.prod(_calculate_remaining_shape(rhs_shape, rhs_contract_dim))
-    else:
-        if group_sizes.size != rhs_shape[0]:
-            raise ValueError(
-                "Expected group_sizes.size == rhs_shape[0], but got"
-                f" group_sizes.size={group_sizes.size}, rhs_shape[0]={rhs_shape[0]}"
-            )
 
     has_bias = bias is not None
-    if has_bias and bias.shape != (group_sizes.size, N):
-        raise ValueError(
-            f"Expected bias.shape=({group_sizes.size}, {N}), but got bias.shape={bias.shape}"
-        )
+    if has_bias:
+        # N = product of rhs non-contracting dims (group axis skipped when rhs_is_trans).
+        if rhs_is_trans:
+            N_dim = math.prod(
+                rhs_data.shape[d]
+                for d in range(rhs_axis_boundary)
+                if rhs_group_axis is None or d != rhs_group_axis
+            )
+        else:
+            N_dim = math.prod(rhs_data.shape[rhs_axis_boundary:])
+        if bias.shape != (num_gemms, N_dim):  # raise (not assert) so the check survives python -O
+            raise ValueError(
+                f"bias shape {bias.shape} does not match expected shape {(num_gemms, N_dim)}"
+            )
     bias = jnp.empty((), jnp.float32) if bias is None else bias
 
     if group_offset is not None:
@@ -2087,7 +2219,6 @@ def grouped_gemm(
 
     use_v2_ffi = _can_use_v2_grouped_gemm(scaling_mode, lhs_data.dtype, has_bias)
     if use_v2_ffi:
-        num_gemms = group_sizes.shape[0]
         additional_arg_0 = jnp.ones((num_gemms,), jnp.float32)  # alpha
         additional_arg_1 = jnp.zeros((num_gemms,), jnp.float32)  # beta
     else:
@@ -2100,19 +2231,23 @@ def grouped_gemm(
         rhs_data,
         rhs_scale_inv,
         bias,
-        group_sizes,
+        lhs_first_dims,
+        lhs_last_dims,
+        rhs_first_dims,
+        rhs_last_dims,
+        out_first_dims,
+        out_last_dims,
         additional_arg_0,
         additional_arg_1,
-        M=M,
-        N=N,
-        K=K_lhs,
         lhs_is_trans=lhs_is_trans,
         rhs_is_trans=rhs_is_trans,
         scaling_mode=scaling_mode.value,
         out_dtype=out_dtype,
         has_bias=has_bias,
-        is_grouped_dense_wgrad=is_grouped_dense_wgrad,
         use_async_d2h_group_sizes=use_async_d2h_group_sizes,
         use_v2_ffi=use_v2_ffi,
+        lhs_axis_boundary=lhs_axis_boundary,
+        rhs_axis_boundary=rhs_axis_boundary,
+        rhs_group_axis=rhs_group_axis,
     )
     return out
diff --git a/transformer_engine/jax/cpp_extensions/quantization.py b/transformer_engine/jax/cpp_extensions/quantization.py
index bf4e833c89..c8578d48b8 100644
--- a/transformer_engine/jax/cpp_extensions/quantization.py
+++ b/transformer_engine/jax/cpp_extensions/quantization.py
@@ -1203,6 +1203,7 @@ def grouped_quantize(
     ), f"Only flatten_axis = -1 is supported for now, got {flatten_axis}"
     group_axis = 0
 
+    ragged_first_dims = group_sizes  # None if no explicit group_sizes (kernel case)
     if group_sizes is None:
         group_sizes = jnp.ones(x.shape[group_axis], dtype=jnp.int32)
 
@@ -1280,7 +1281,7 @@ def grouped_quantize(
         q_layout=quantizer.q_layout,
         data_layout=quantizer.get_data_layout(),
         flatten_axis=flatten_axis,
-        group_sizes=group_sizes,
+        first_dims=ragged_first_dims,
         original_shape=original_shape,
         group_axis=group_axis,
     )
diff --git a/transformer_engine/jax/csrc/extensions.h b/transformer_engine/jax/csrc/extensions.h
index 0fe4e99239..616209709b 100644
--- a/transformer_engine/jax/csrc/extensions.h
+++ b/transformer_engine/jax/csrc/extensions.h
@@ -55,6 +55,24 @@ struct GemmConfig {
   bool use_split_accumulator;
 };
 
+struct GroupedGemmV2Config {  // attribute bundle for XLA FFI struct-attr decoding (see below)
+  bool lhs_is_trans;               // true if the lhs matrix is transposed
+  bool rhs_is_trans;               // true if the rhs matrix is transposed
+  JAXX_Scaling_Mode scaling_mode;  // scaling mode for the GEMM operations
+  int64_t lhs_axis_boundary;       // axis split point for lhs N-D -> 2D flattening
+  int64_t rhs_axis_boundary;       // axis split point for rhs N-D -> 2D flattening
+};
+
+struct GroupedGemmConfig {  // attribute bundle for XLA FFI struct-attr decoding (see below)
+  bool lhs_is_trans;               // true if the lhs matrix is transposed
+  bool rhs_is_trans;               // true if the rhs matrix is transposed
+  JAXX_Scaling_Mode scaling_mode;  // scaling mode for the GEMM operations
+  bool has_bias;                   // true if bias tensors are provided
+  bool use_async_d2h_group_sizes;  // NOTE(review): async D2H copy of group sizes? confirm
+  int64_t lhs_axis_boundary;       // axis split point for lhs N-D -> 2D flattening
+  int64_t rhs_axis_boundary;       // axis split point for rhs N-D -> 2D flattening
+};
+
 inline bool use_fp8(DType type) { return type == DType::kFloat8E4M3 || type == DType::kFloat8E5M2; }
 
 // Activation
@@ -192,6 +210,22 @@ XLA_FFI_REGISTER_STRUCT_ATTR_DECODING(
     ::xla::ffi::StructMember<bool>("rhs_transposed"),
     ::xla::ffi::StructMember<bool>("use_split_accumulator"));
 
+XLA_FFI_REGISTER_STRUCT_ATTR_DECODING(
+    transformer_engine::jax::GroupedGemmV2Config, ::xla::ffi::StructMember<bool>("lhs_is_trans"),
+    ::xla::ffi::StructMember<bool>("rhs_is_trans"),
+    ::xla::ffi::StructMember<transformer_engine::jax::JAXX_Scaling_Mode>("scaling_mode"),
+    ::xla::ffi::StructMember<int64_t>("lhs_axis_boundary"),
+    ::xla::ffi::StructMember<int64_t>("rhs_axis_boundary"));
+
+XLA_FFI_REGISTER_STRUCT_ATTR_DECODING(
+    transformer_engine::jax::GroupedGemmConfig, ::xla::ffi::StructMember<bool>("lhs_is_trans"),
+    ::xla::ffi::StructMember<bool>("rhs_is_trans"),
+    ::xla::ffi::StructMember<transformer_engine::jax::JAXX_Scaling_Mode>("scaling_mode"),
+    ::xla::ffi::StructMember<bool>("has_bias"),
+    ::xla::ffi::StructMember<bool>("use_async_d2h_group_sizes"),
+    ::xla::ffi::StructMember<int64_t>("lhs_axis_boundary"),
+    ::xla::ffi::StructMember<int64_t>("rhs_axis_boundary"));
+
 // ENUM_ATTR and DICT_ATTR recoding need to be registered in the global namespace
 XLA_FFI_REGISTER_ENUM_ATTR_DECODING(transformer_engine::jax::JAXX_Scaling_Mode);
 XLA_FFI_REGISTER_ENUM_ATTR_DECODING(transformer_engine::jax::JAXX_Score_Function);
diff --git a/transformer_engine/jax/csrc/extensions/gemm.cpp b/transformer_engine/jax/csrc/extensions/gemm.cpp
index 737dd65622..07adf55577 100644
--- a/transformer_engine/jax/csrc/extensions/gemm.cpp
+++ b/transformer_engine/jax/csrc/extensions/gemm.cpp
@@ -617,137 +617,98 @@ JAXX_GroupedTensorWrapper make_grouped_tensor(Buffer_Type const &data,
   return std::move(grouped_tensor_wrapper);
 }
 
-// This FFI is EXPERIMENTAL and subject to change without deprecation, intended for use in JAX's internal implementation of grouped GEMM.
-Error_Type GroupedGemmV2FFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv,
-                            Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias,
-                            Buffer_Type group_sizes, Buffer_Type alpha, Buffer_Type beta,
-                            Result_Type output, Result_Type cublas_workspace,
-                            Result_Type setup_workspace, Result_Type int64_workspace, size_t m,
-                            size_t n, size_t k, bool lhs_is_trans, bool rhs_is_trans,
-                            JAXX_Scaling_Mode scaling_mode, bool is_grouped_dense_wgrad) {
-  // Notes on matrix layouts and transpose:
-  // Jax uses row-major data_layout, on entering this function, each input matrix pair:
-  //   A: row-major [m, k] for N - [k, m] for T
-  //   B: row-major [k, n] for N - [n, k] for T
-  // on exiting this function, JAX expect:
-  //   C: row-major with size [m, n].
-  // cuBLAS uses column-major data_layout, in this view, each input matrix pair:
-  //   A: column-major with size [k, m] for T - [m, k] for N
-  //   B: column-major with size [n, k] for T - [k, n] for N
-  //
-  // If we call cuBLAS GEMM for A * B, the output will be:
-  //   C: column-major with size [m, n] --> row-major with size [n, m].
-  // To make the output compatible with JAX, we need to swap A and B in cuBLAS GEMM call.
+// V2 variant: derives the 2D NVTE shape from the XLA buffer itself and wires first_dims/last_dims.
+// Each non-empty *_dims buffer (int32, exactly num_gemms elements) is converted to int64 into its
+// own slot of int64_workspace.  int64_offset (in int64 elements) is advanced past every slot used,
+// so callers can thread it through successive calls without aliasing.  axis_boundary splits the
+// dims into rows/cols; a negative value keeps only the last dim as cols.  NO_SCALING only.
+JAXX_GroupedTensorWrapper make_grouped_tensor(
+    Buffer_Type const &data, Buffer_Type const &first_dims, Buffer_Type const &last_dims,
+    int64_t *int64_workspace_base, size_t int64_workspace_capacity, size_t &int64_offset,
+    size_t num_gemms, cudaStream_t stream, int64_t axis_boundary = -1) {
+  auto dims = data.dimensions();
+  NVTE_CHECK(dims.size() >= 2, "grouped GEMM data buffer must be at least 2D.");
+  // Flatten dims at axis_boundary to produce a 2D NVTE shape.
+  // axis_boundary=-1 (default) collapses dims[0..N-2] → rows and keeps dims[N-1] → cols,
+  // preserving the prior behaviour for output buffers (e.g. [G, K, N] for wgrad).
+  size_t ab = (axis_boundary < 0) ? dims.size() - 1 : static_cast<size_t>(axis_boundary);
+  // Reject a boundary past the rank before it is used as an index range below.
+  NVTE_CHECK(ab <= dims.size(), "axis_boundary ", axis_boundary, " exceeds buffer rank ",
+             dims.size(), ".");
+  NVTEShape dataShape{.data = {product(dims, 0, ab), product(dims, ab, dims.size())}, .ndim = 2};
+  JAXX_GroupedTensorWrapper wrapper(JAXX_Scaling_Mode::NO_SCALING, num_gemms, dataShape);
+  wrapper.set_rowwise(data, std::nullopt);
+  // Converts one int32 group-size buffer into the next free int64 slot and attaches it to wrapper.
+  auto attach_dims = [&](Buffer_Type const &sizes, const char *name, auto dims_kind) {
+    if (sizes.element_count() == 0) return;  // empty buffer: this dimension is not ragged
+    NVTE_CHECK(sizes.element_type() == xla::ffi::DataType::S32, name, " must be int32.");
+    // The conversion below reads num_gemms values, so the buffer must hold exactly that many.
+    NVTE_CHECK(sizes.element_count() == num_gemms, name, " must have num_gemms = ", num_gemms,
+               " elements, got ", sizes.element_count(), ".");
+    NVTE_CHECK(int64_offset + num_gemms <= int64_workspace_capacity,
+               "int64_workspace overflow: not enough space for ", name, " conversion.");
+    auto *slot = int64_workspace_base + int64_offset;
+    nvte_convert_int32_to_int64(reinterpret_cast<const int32_t *>(sizes.untyped_data()), slot,
+                                num_gemms, stream);
+    wrapper.set_group_sizes_only(slot, num_gemms, dims_kind);
+    int64_offset += num_gemms;
+  };
+  attach_dims(first_dims, "first_dims", kNVTEGroupedFirstDims);
+  attach_dims(last_dims, "last_dims", kNVTEGroupedLastDims);
+  return wrapper;
+}
 
-  // Inputs
-  auto lhs_ptr = reinterpret_cast<uint8_t *>(lhs_data.untyped_data());
-  auto rhs_ptr = reinterpret_cast<uint8_t *>(rhs_data.untyped_data());
-  auto lhs_sinv_ptr = reinterpret_cast<uint8_t *>(lhs_sinv.untyped_data());
-  auto rhs_sinv_ptr = reinterpret_cast<uint8_t *>(rhs_sinv.untyped_data());
-  auto lhs_dtype = convert_ffi_datatype_to_te_dtype(lhs_data.element_type());
-  auto rhs_dtype = convert_ffi_datatype_to_te_dtype(rhs_data.element_type());
-  auto lhs_sinv_dtype = convert_ffi_datatype_to_te_dtype(lhs_sinv.element_type());
-  auto rhs_sinv_dtype = convert_ffi_datatype_to_te_dtype(rhs_sinv.element_type());
-  bool has_bias = product(bias.dimensions()) > 0;
-  auto bias_ptr = has_bias ? reinterpret_cast<uint8_t *>(bias.untyped_data()) : nullptr;
-  auto bias_dtype = convert_ffi_datatype_to_te_dtype(bias.element_type());
+// Determines the number of GEMMs in the group.
+// The *_dims buffers are probed in the fixed order lhs_first, lhs_last, rhs_first, rhs_last,
+// out_first, out_last; the first one with a non-zero element count supplies num_gemms via its
+// leading dimension.  When every *_dims buffer is empty (uniform batch, nothing ragged), the
+// element count of alpha is returned instead.
+size_t grouped_gemm_num_gemms(Buffer_Type const &lhs_first_dims, Buffer_Type const &lhs_last_dims,
+                              Buffer_Type const &rhs_first_dims, Buffer_Type const &rhs_last_dims,
+                              Buffer_Type const &out_first_dims, Buffer_Type const &out_last_dims,
+                              Buffer_Type const &alpha) {
+  // Ordered by priority: the earliest non-empty buffer wins.
+  const Buffer_Type *const candidates[] = {
+      &lhs_first_dims, &lhs_last_dims,  &rhs_first_dims,
+      &rhs_last_dims,  &out_first_dims, &out_last_dims,
+  };
+  for (const Buffer_Type *buffer : candidates) {
+    if (buffer->element_count() > 0) {
+      return buffer->dimensions()[0];
+    }
+  }
+  // Uniform batch: no ragged tensor, so fall back to alpha's element count.
+  return alpha.element_count();
+}
+
+}  // namespace jax
+}  // namespace transformer_engine
 
-  NVTE_CHECK(group_sizes.dimensions().size() == 1);
-  size_t num_gemms = group_sizes.dimensions()[0];
+namespace transformer_engine {
+namespace jax {
 
-  // Convert int32 group_sizes to int64 into the dedicated output buffer.
-  NVTE_CHECK(group_sizes.element_type() == xla::ffi::DataType::S32, "group_sizes must be int32.");
-  auto *int64_sizes_ptr = reinterpret_cast<int64_t *>(int64_workspace->untyped_data());
-  nvte_convert_int32_to_int64(reinterpret_cast<const int32_t *>(group_sizes.untyped_data()),
-                              int64_sizes_ptr, num_gemms, stream);
+// This FFI is EXPERIMENTAL and subject to change without deprecation, intended for use in JAX's internal implementation of grouped GEMM.
+Error_Type GroupedGemmV2FFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv,
+                            Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias,
+                            Buffer_Type lhs_first_dims, Buffer_Type lhs_last_dims,
+                            Buffer_Type rhs_first_dims, Buffer_Type rhs_last_dims,
+                            Buffer_Type out_first_dims, Buffer_Type out_last_dims,
+                            Buffer_Type alpha, Buffer_Type beta, Result_Type output,
+                            Result_Type cublas_workspace, Result_Type setup_workspace,
+                            Result_Type int64_workspace, GroupedGemmV2Config config) {
+  auto [lhs_is_trans, rhs_is_trans, scaling_mode, lhs_axis_boundary, rhs_axis_boundary] = config;
 
   NVTE_CHECK(scaling_mode == JAXX_Scaling_Mode::NO_SCALING,
              "Only non-quantized grouped GEMM is supported in current implementation.");
 
-  // It is weird that TE/Common GEMM only use colwise for MXFP8
-  const bool is_fp8_gemm = is_fp8_dtype(lhs_dtype);
-  const bool is_tensor_scaling = scaling_mode == JAXX_Scaling_Mode::DELAYED_TENSOR_SCALING ||
-                                 scaling_mode == JAXX_Scaling_Mode::CURRENT_TENSOR_SCALING;
-  const bool is_mxfp8_scaling = scaling_mode == JAXX_Scaling_Mode::MXFP8_1D_SCALING;
-  const bool rhs_use_colwise = is_mxfp8_scaling && !rhs_is_trans;
-  const bool lhs_use_colwise = is_mxfp8_scaling && lhs_is_trans;
+  size_t num_gemms = grouped_gemm_num_gemms(lhs_first_dims, lhs_last_dims, rhs_first_dims,
+                                            rhs_last_dims, out_first_dims, out_last_dims, alpha);
 
-  // Outputs
-  auto out_ptr = reinterpret_cast<uint8_t *>(output->untyped_data());
-  auto out_dtype = convert_ffi_datatype_to_te_dtype(output->element_type());
+  // Workspaces.
   auto setup_workspace_ptr = reinterpret_cast<uint8_t *>(setup_workspace->untyped_data());
-  // Here we clear the lower 8 bits of the buffer address to ensure the buffer is 256-aligned
   auto cublas_workspace_ptr = reinterpret_cast<uint8_t *>(cublas_workspace->untyped_data());
   cublas_workspace_ptr = move_ptr_to_next_256B_aligned(cublas_workspace_ptr);
-  auto workspace_total_size = product(cublas_workspace->dimensions());
-
-  auto lhs_sinv_size = product(lhs_sinv.dimensions());
-  auto rhs_sinv_size = product(rhs_sinv.dimensions());
-  const size_t workspace_alignment_padding = 256;
-  const size_t tensor_scaling_sinv_aligment = 16;
-  const size_t mxfp8_scaling_sinv_alignment_padding = 256;
-  auto workspace_size = workspace_total_size - workspace_alignment_padding;
-  if (is_mxfp8_scaling) {
-    // For MXFP8 swizzled scale_inv buffers, only the first pointer needs to be with 256B alignment padding. Later pointers are guaranteed to be 256-aligned as the scale_inv shapes are padded by 128x4.
-    workspace_size -= (lhs_sinv_size + rhs_sinv_size + 2 * mxfp8_scaling_sinv_alignment_padding);
-  } else if (is_tensor_scaling) {
-    // For tensor scaling, each matrix has a single scale value, and all scales need to be aligned
-    // by 16 bytes to meet the requirement of CUDA 12.9.1 and later.
-    workspace_size -= tensor_scaling_sinv_aligment * (lhs_sinv_size + rhs_sinv_size);
-  }
-  auto swizzled_lhs_sinv_ptr = cublas_workspace_ptr + workspace_size;
-  swizzled_lhs_sinv_ptr = move_ptr_to_next_256B_aligned(swizzled_lhs_sinv_ptr);
-  auto swizzled_rhs_sinv_ptr = swizzled_lhs_sinv_ptr + lhs_sinv_size;
-  swizzled_rhs_sinv_ptr = move_ptr_to_next_256B_aligned(swizzled_rhs_sinv_ptr);
-  auto lhs_scatter_aligned_ptr = swizzled_lhs_sinv_ptr;  // Already 256B aligned
-  auto rhs_scatter_aligned_ptr = lhs_scatter_aligned_ptr + num_gemms * tensor_scaling_sinv_aligment;
-
-  size_t lhs_dtype_bytes = te_dtype_bytes(lhs_dtype);
-  size_t rhs_dtype_bytes = te_dtype_bytes(rhs_dtype);
-  size_t lhs_sinv_dtype_bytes = te_dtype_bytes(lhs_sinv_dtype);
-  size_t rhs_sinv_dtype_bytes = te_dtype_bytes(rhs_sinv_dtype);
-  size_t bias_dtype_bytes = te_dtype_bytes(bias_dtype);
-  size_t out_dtype_bytes = te_dtype_bytes(out_dtype);
-
-  NVTE_CHECK(lhs_dtype_bytes == rhs_dtype_bytes, "sizeof(lhs_dtype) != sizeof(rhs_dtype)");
-  NVTE_CHECK(lhs_sinv_dtype_bytes == rhs_sinv_dtype_bytes,
-             "sizeof(lhs_sinv_dtype) != sizeof(rhs_sinv_dtype)");
-
-  size_t expected_lhs_size = m * k;
-  size_t expected_rhs_size = is_grouped_dense_wgrad ? (k * n) : (num_gemms * k * n);
-  size_t expected_out_size = is_grouped_dense_wgrad ? (num_gemms * m * n) : (m * n);
-  size_t actual_lhs_size = product(lhs_data.dimensions());
-  size_t actual_rhs_size = product(rhs_data.dimensions());
-  size_t actual_out_size = product(output->dimensions());
-  NVTE_CHECK(expected_lhs_size == actual_lhs_size, "Unexpected lhs size! Expect ",
-             expected_lhs_size, ", got ", actual_lhs_size);
-  if (!is_grouped_dense_wgrad) {
-    NVTE_CHECK(expected_rhs_size == actual_rhs_size,
-               "Unexpected rhs size! Expect num_gemms * n * k = ", num_gemms, " * ", n, " * ", k,
-               " = ", expected_rhs_size, ", got ", actual_rhs_size);
-    NVTE_CHECK(expected_out_size == actual_out_size, "Unexpected output size! Expect m * n = ", m,
-               " * ", n, " = ", expected_out_size, ", got ", actual_out_size);
-  } else {
-    NVTE_CHECK(expected_rhs_size == actual_rhs_size, "Unexpected rhs size! Expect k * n = ", k,
-               " * ", n, " = ", expected_rhs_size, ", got ", actual_rhs_size);
-    NVTE_CHECK(expected_out_size == actual_out_size,
-               "Unexpected output size! Expect num_gemms * m * n = ", num_gemms, " * ", m, " * ", n,
-               " = ", expected_out_size, ", got ", actual_out_size);
-  }
-
-  auto num_math_sm = cuda::sm_count() - getenv<int>("NVTE_EXT_MARGIN_SM", 0);
-  bool grad = false;
-  bool accumulate = false;
-  bool use_split_accumulator = false;
-  auto bias_shape = std::vector<size_t>{has_bias ? n : 0};
-  const int arch = cuda::sm_arch();
-
-  if (arch < 100 && is_fp8_gemm) {
-    NVTE_CHECK(!lhs_is_trans && rhs_is_trans,
-               "For SM90 or older archs and FP8 input, only NT (row-major) GEMM is supported, ",
-               "got lhs_is_trans=", lhs_is_trans, ", rhs_is_trans=", rhs_is_trans);
-  }
-
+  auto workspace_size = product(cublas_workspace->dimensions()) - 256;
   TensorWrapper workspace_setup(setup_workspace_ptr,
                                 std::vector<size_t>{product(setup_workspace->dimensions())},
                                 DType::kByte);
@@ -761,59 +722,21 @@ Error_Type GroupedGemmV2FFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Ty
                             std::vector<size_t>{num_gemms},
                             convert_ffi_datatype_to_te_dtype(beta.element_type()));
 
-  if (is_grouped_dense_wgrad) {
-    NVTE_CHECK(lhs_is_trans && !rhs_is_trans,
-               "For grouped dense wgrad, only TN GEMM is supported in TE/JAX currently.");
-
-    //// RHS
-    NVTEShape rhsShape{.data = {k, n}, .ndim = 2};
-    auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape);
-    rhs_tensor.set_group_sizes_only(int64_sizes_ptr, num_gemms, kNVTEGroupedFirstDims);
-
-    //// LHS
-    NVTEShape lhsShape{.data = {k, m}, .ndim = 2};
-    lhs_is_trans = true;
-    auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape);
-    lhs_tensor.set_group_sizes_only(int64_sizes_ptr, num_gemms, kNVTEGroupedFirstDims);
-
-    //// OUTPUT
-    NVTEShape outShape{.data = {num_gemms * m, n}, .ndim = 2};
-    auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING,
-                                          num_gemms, outShape);
-
-    nvte_grouped_gemm(rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, nullptr, out_tensor,
-                      alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(),
-                      workspace_cublas.data(),
-                      nullptr,  // config (use defaults)
-                      stream);
-
-    return ffi_with_cuda_error_check();
-  }
-
-  // Nominal case for FWD or DGRAD
-
-  //// RHS
-  NVTEShape rhsShape{.data = {num_gemms * k, n}, .ndim = 2};
-  if (rhs_is_trans) {
-    rhsShape.data[0] = num_gemms * n;
-    rhsShape.data[1] = k;
-  }
-  auto rhs_tensor = make_grouped_tensor(rhs_data, rhs_sinv, scaling_mode, num_gemms, rhsShape);
-
-  //// LHS
-  NVTEShape lhsShape{.data = {m, k}, .ndim = 2};
-  if (lhs_is_trans) {
-    std::swap(lhsShape.data[0], lhsShape.data[1]);
-  }
-  auto lhs_tensor = make_grouped_tensor(lhs_data, lhs_sinv, scaling_mode, num_gemms, lhsShape);
-  lhs_tensor.set_group_sizes_only(int64_sizes_ptr, num_gemms,
-                                  lhs_is_trans ? kNVTEGroupedLastDims : kNVTEGroupedFirstDims);
-
-  //// OUTPUT
-  NVTEShape outShape{.data = {m, n}, .ndim = 2};
-  auto out_tensor = make_grouped_tensor(*output, std::nullopt, JAXX_Scaling_Mode::NO_SCALING,
-                                        num_gemms, outShape);
-  out_tensor.set_group_sizes_only(int64_sizes_ptr, num_gemms, kNVTEGroupedFirstDims);
+  // Build grouped tensors from XLA buffer shapes and group_sizes — no m/n/k derivation needed.
+  // int64_workspace is partitioned into per-ragged-buffer slots of num_gemms int64 elements each.
+  // int64_offset is threaded through the three make_grouped_tensor calls so each non-empty *_dims
+  // buffer gets its own non-aliasing slot; bounds are checked inside make_grouped_tensor.
+  auto *int64_base = reinterpret_cast<int64_t *>(int64_workspace->untyped_data());
+  size_t int64_capacity = int64_workspace->element_count() / sizeof(int64_t);
+  size_t int64_offset = 0;
+  auto rhs_tensor =
+      make_grouped_tensor(rhs_data, rhs_first_dims, rhs_last_dims, int64_base, int64_capacity,
+                          int64_offset, num_gemms, stream, rhs_axis_boundary);
+  auto lhs_tensor =
+      make_grouped_tensor(lhs_data, lhs_first_dims, lhs_last_dims, int64_base, int64_capacity,
+                          int64_offset, num_gemms, stream, lhs_axis_boundary);
+  auto out_tensor = make_grouped_tensor(*output, out_first_dims, out_last_dims, int64_base,
+                                        int64_capacity, int64_offset, num_gemms, stream);
 
   nvte_grouped_gemm(rhs_tensor, rhs_is_trans, lhs_tensor, lhs_is_trans, nullptr, out_tensor,
                     alpha_tensor.data(), beta_tensor.data(), workspace_setup.data(),
@@ -827,33 +750,35 @@ Error_Type GroupedGemmV2FFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Ty
 XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmV2Handler, GroupedGemmV2FFI,
                               FFI::Bind()
                                   .Ctx<FFI_Stream_Type>()  // stream
-                                  .Arg<Buffer_Type>()      // lhs_data
+                                  .Arg<Buffer_Type>()      // lhs_data (N-D, N >= 2)
                                   .Arg<Buffer_Type>()      // lhs_sinv
-                                  .Arg<Buffer_Type>()      // rhs_data
+                                  .Arg<Buffer_Type>()      // rhs_data (N-D, N >= 2)
                                   .Arg<Buffer_Type>()      // rhs_sinv
                                   .Arg<Buffer_Type>()      // bias
-                                  .Arg<Buffer_Type>()      // group_sizes (int32)
+                                  .Arg<Buffer_Type>()      // lhs_first_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // lhs_last_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // rhs_first_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // rhs_last_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // out_first_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // out_last_dims (G,) or empty (0,)
                                   .Arg<Buffer_Type>()      // alpha
                                   .Arg<Buffer_Type>()      // beta
                                   .Ret<Buffer_Type>()      // output
                                   .Ret<Buffer_Type>()      // cublas_workspace
                                   .Ret<Buffer_Type>()      // setup_workspace
                                   .Ret<Buffer_Type>()      // int64_workspace
-                                  .Attr<int64_t>("M")
-                                  .Attr<int64_t>("N")
-                                  .Attr<int64_t>("K")
-                                  .Attr<bool>("lhs_is_trans")
-                                  .Attr<bool>("rhs_is_trans")
-                                  .Attr<JAXX_Scaling_Mode>("scaling_mode")
-                                  .Attr<bool>("is_grouped_dense_wgrad"),
+                                  .Attrs<GroupedGemmV2Config>(),
                               FFI_CudaGraph_Traits);
 
 Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type lhs_sinv,
                           Buffer_Type rhs_data, Buffer_Type rhs_sinv, Buffer_Type bias,
-                          Buffer_Type group_sizes, Buffer_Type group_offset, Result_Type output,
-                          Result_Type workspace, size_t m, size_t n, size_t k, bool lhs_is_trans,
-                          bool rhs_is_trans, JAXX_Scaling_Mode scaling_mode, bool has_bias,
-                          bool is_grouped_dense_wgrad, bool use_async_d2h_group_sizes) {
+                          Buffer_Type lhs_first_dims, Buffer_Type lhs_last_dims,
+                          Buffer_Type rhs_first_dims, Buffer_Type rhs_last_dims,
+                          Buffer_Type out_first_dims, Buffer_Type out_last_dims,
+                          Buffer_Type group_offset, Result_Type output, Result_Type workspace,
+                          GroupedGemmConfig config) {
+  auto [lhs_is_trans, rhs_is_trans, scaling_mode, has_bias, use_async_d2h_group_sizes,
+        lhs_axis_boundary, rhs_axis_boundary] = config;
   // Notes on matrix layouts and transpose:
   // Jax uses row-major data_layout, on entering this function, each input matrix pair:
   //   A: row-major [m, k] for N - [k, m] for T
@@ -870,6 +795,61 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type
 
   int num_streams = nvte_get_num_compute_streams();
 
+  // Determine which group_sizes buffers are active (non-empty = ragged dimension).
+  bool is_lhs_first_ragged = lhs_first_dims.element_count() > 0;
+  bool is_lhs_last_ragged = lhs_last_dims.element_count() > 0;
+  bool is_rhs_first_ragged = rhs_first_dims.element_count() > 0;
+  bool is_rhs_last_ragged = rhs_last_dims.element_count() > 0;
+  bool is_lhs_ragged = is_lhs_first_ragged || is_lhs_last_ragged;
+  bool is_rhs_ragged = is_rhs_first_ragged || is_rhs_last_ragged;
+  bool any_ragged = is_lhs_ragged || is_rhs_ragged;
+
+  size_t num_gemms;
+  if (is_lhs_first_ragged)
+    num_gemms = lhs_first_dims.dimensions()[0];
+  else if (is_lhs_last_ragged)
+    num_gemms = lhs_last_dims.dimensions()[0];
+  else if (is_rhs_first_ragged)
+    num_gemms = rhs_first_dims.dimensions()[0];
+  else if (is_rhs_last_ragged)
+    num_gemms = rhs_last_dims.dimensions()[0];
+  else
+    NVTE_CHECK(false,
+               "GroupedGemmFFI (v1): At least one of the group size buffers must be non-empty to "
+               "determine num_gemms.");
+
+  const Buffer_Type *active_gs_ptr = nullptr;
+  if (is_lhs_first_ragged)
+    active_gs_ptr = &lhs_first_dims;
+  else if (is_lhs_last_ragged)
+    active_gs_ptr = &lhs_last_dims;
+  else if (is_rhs_first_ragged)
+    active_gs_ptr = &rhs_first_dims;
+  else if (is_rhs_last_ragged)
+    active_gs_ptr = &rhs_last_dims;
+
+  // Derive m, n, k from N-D buffer dimensions using axis_boundary.
+  // axis_boundary splits contracting dims from non-contracting dims.
+  auto lhs_dims = lhs_data.dimensions();
+  auto rhs_dims = rhs_data.dimensions();
+  NVTE_CHECK(lhs_dims.size() >= 2, "lhs_data must be at least 2D.");
+  NVTE_CHECK(rhs_dims.size() >= 2, "rhs_data must be at least 2D.");
+  size_t lab = static_cast<size_t>(lhs_axis_boundary);
+  size_t rab = static_cast<size_t>(rhs_axis_boundary);
+  // k = product of contracting dims of lhs
+  size_t k = lhs_is_trans ? product(lhs_dims, 0, lab) : product(lhs_dims, lab, lhs_dims.size());
+  size_t m, n;
+  if (is_rhs_ragged) {
+    // wgrad: non-contracting lhs dims form M; non-contracting rhs dims form N
+    m = lhs_is_trans ? product(lhs_dims, lab, lhs_dims.size()) : product(lhs_dims, 0, lab);
+    n = rhs_is_trans ? product(rhs_dims, 0, rab) : product(rhs_dims, rab, rhs_dims.size());
+  } else {
+    m = lhs_is_trans ? product(lhs_dims, lab, lhs_dims.size())
+                     : product(lhs_dims, 0, lab);  // total M (sum of group sizes)
+    n = rhs_is_trans ? product(rhs_dims, 0, rab) / num_gemms
+                     : product(rhs_dims, rab, rhs_dims.size());
+  }
+
   // Inputs
   auto lhs_ptr = reinterpret_cast<uint8_t *>(lhs_data.untyped_data());
   auto rhs_ptr = reinterpret_cast<uint8_t *>(rhs_data.untyped_data());
@@ -882,9 +862,6 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type
   auto bias_ptr = has_bias ? reinterpret_cast<uint8_t *>(bias.untyped_data()) : nullptr;
   auto bias_dtype = convert_ffi_datatype_to_te_dtype(bias.element_type());
 
-  NVTE_CHECK(group_sizes.dimensions().size() == 1);
-  size_t num_gemms = group_sizes.dimensions()[0];
-
   // It is weird that TE/Common GEMM only use colwise for MXFP8
   const bool is_fp8_gemm = is_fp8_dtype(lhs_dtype);
   const bool is_tensor_scaling = scaling_mode == JAXX_Scaling_Mode::DELAYED_TENSOR_SCALING ||
@@ -951,14 +928,14 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type
              "sizeof(lhs_sinv_dtype) != sizeof(rhs_sinv_dtype)");
 
   size_t expected_lhs_size = m * k;
-  size_t expected_rhs_size = is_grouped_dense_wgrad ? (k * n) : (num_gemms * k * n);
-  size_t expected_out_size = is_grouped_dense_wgrad ? (num_gemms * m * n) : (m * n);
+  size_t expected_rhs_size = is_rhs_ragged ? (k * n) : (num_gemms * k * n);
+  size_t expected_out_size = is_rhs_ragged ? (num_gemms * m * n) : (m * n);
   size_t actual_lhs_size = product(lhs_data.dimensions());
   size_t actual_rhs_size = product(rhs_data.dimensions());
   size_t actual_out_size = product(output->dimensions());
   NVTE_CHECK(expected_lhs_size == actual_lhs_size, "Unexpected lhs size! Expect ",
              expected_lhs_size, ", got ", actual_lhs_size);
-  if (!is_grouped_dense_wgrad) {
+  if (!is_rhs_ragged) {
     NVTE_CHECK(expected_rhs_size == actual_rhs_size,
                "Unexpected rhs size! Expect num_gemms * n * k = ", num_gemms, " * ", n, " * ", k,
                " = ", expected_rhs_size, ", got ", actual_rhs_size);
@@ -974,25 +951,28 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type
 
   size_t dim_list_bytes = sizeof(int32_t) * num_gemms;
   std::vector<int32_t> dim_list_host(num_gemms);
-  size_t host_num_gemms = 0;
-  if (use_async_d2h_group_sizes) {
-    host_num_gemms = GroupedGemmGetGroupSizes(stream, num_gemms, nullptr, dim_list_host.data());
-    NVTE_CHECK(host_num_gemms == num_gemms, "num_gemms ", num_gemms,
-               " does not match the return of GroupedGemmGetGroupSizes ", host_num_gemms, ".");
-  } else {
-    auto dim_list_ptr = reinterpret_cast<int32_t *>(group_sizes.untyped_data());
-    cudaMemcpyAsync(dim_list_host.data(), dim_list_ptr, dim_list_bytes, cudaMemcpyDeviceToHost,
-                    stream);
-    // Note: This may break cudaGraph.
-    cudaStreamSynchronize(stream);
-  }
-  size_t sum_group_sizes = std::accumulate(dim_list_host.begin(), dim_list_host.end(), 0);
-  if (!is_grouped_dense_wgrad) {
-    NVTE_CHECK(m == sum_group_sizes, "Unexpected group_sizes! M = ", m,
-               ", got sum(group_sizes)=", sum_group_sizes);
-  } else {
-    NVTE_CHECK(k == sum_group_sizes, "Unexpected group_sizes! K = ", k,
-               ", got sum(group_sizes)=", sum_group_sizes);
+  if (any_ragged) {
+    size_t host_num_gemms = 0;
+    if (use_async_d2h_group_sizes) {
+      host_num_gemms = GroupedGemmGetGroupSizes(stream, num_gemms, nullptr, dim_list_host.data());
+      NVTE_CHECK(host_num_gemms == num_gemms, "num_gemms ", num_gemms,
+                 " does not match the return of GroupedGemmGetGroupSizes ", host_num_gemms, ".");
+    } else {
+      NVTE_CHECK(active_gs_ptr != nullptr, "active_gs_ptr is null but any_ragged is true.");
+      auto gs_data_ptr = reinterpret_cast<const int32_t *>(active_gs_ptr->untyped_data());
+      cudaMemcpyAsync(dim_list_host.data(), gs_data_ptr, dim_list_bytes, cudaMemcpyDeviceToHost,
+                      stream);
+      // Note: This may break cudaGraph.
+      cudaStreamSynchronize(stream);
+    }
+    size_t sum_group_sizes = std::accumulate(dim_list_host.begin(), dim_list_host.end(), 0);
+    if (!is_rhs_ragged) {
+      NVTE_CHECK(m == sum_group_sizes, "Unexpected group_sizes! M = ", m,
+                 ", got sum(group_sizes)=", sum_group_sizes);
+    } else {
+      NVTE_CHECK(k == sum_group_sizes, "Unexpected group_sizes! K = ", k,
+                 ", got sum(group_sizes)=", sum_group_sizes);
+    }
   }
 
   auto num_math_sm = cuda::sm_count() - getenv<int>("NVTE_EXT_MARGIN_SM", 0);
@@ -1040,7 +1020,7 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type
     auto lhs_shape_i = std::vector<size_t>{m_i, k};
     auto rhs_shape_i = std::vector<size_t>{rhs_is_trans ? n : k, rhs_is_trans ? k : n};
     auto out_shape_i = std::vector<size_t>{m_i, n};
-    if (is_grouped_dense_wgrad) {
+    if (is_rhs_ragged) {
       size_t k_i = dim_list_host[i];
       lhs_shape_i[0] = lhs_is_trans ? k_i : m;
       lhs_shape_i[1] = lhs_is_trans ? m : k_i;
@@ -1230,24 +1210,21 @@ Error_Type GroupedGemmFFI(cudaStream_t stream, Buffer_Type lhs_data, Buffer_Type
 XLA_FFI_DEFINE_HANDLER_SYMBOL(GroupedGemmHandler, GroupedGemmFFI,
                               FFI::Bind()
                                   .Ctx<FFI_Stream_Type>()  // stream
-                                  .Arg<Buffer_Type>()      // lhs_data
+                                  .Arg<Buffer_Type>()      // lhs_data (N-D, N >= 2)
                                   .Arg<Buffer_Type>()      // lhs_sinv
-                                  .Arg<Buffer_Type>()      // rhs_data
+                                  .Arg<Buffer_Type>()      // rhs_data (N-D, N >= 2)
                                   .Arg<Buffer_Type>()      // rhs_sinv
                                   .Arg<Buffer_Type>()      // bias
-                                  .Arg<Buffer_Type>()      // group_sizes
+                                  .Arg<Buffer_Type>()      // lhs_first_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // lhs_last_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // rhs_first_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // rhs_last_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // out_first_dims (G,) or empty (0,)
+                                  .Arg<Buffer_Type>()      // out_last_dims (G,) or empty (0,)
                                   .Arg<Buffer_Type>()      // group_offset
                                   .Ret<Buffer_Type>()      // output
                                   .Ret<Buffer_Type>()      // workspace
-                                  .Attr<int64_t>("M")
-                                  .Attr<int64_t>("N")
-                                  .Attr<int64_t>("K")
-                                  .Attr<bool>("lhs_is_trans")
-                                  .Attr<bool>("rhs_is_trans")
-                                  .Attr<JAXX_Scaling_Mode>("scaling_mode")
-                                  .Attr<bool>("has_bias")
-                                  .Attr<bool>("is_grouped_dense_wgrad")
-                                  .Attr<bool>("use_async_d2h_group_sizes"));
+                                  .Attrs<GroupedGemmConfig>());
 
 }  // namespace jax
 }  // namespace transformer_engine
diff --git a/transformer_engine/jax/dense.py b/transformer_engine/jax/dense.py
index fe02e61fc0..76c984486f 100644
--- a/transformer_engine/jax/dense.py
+++ b/transformer_engine/jax/dense.py
@@ -27,6 +27,7 @@
     is_fp8_gemm_with_all_layouts_supported,
     TensorUsage,
     QuantizeLayout,
+    GroupedNoScaleTensor,
 )
 
 
@@ -490,7 +491,8 @@ def _grouped_dense_fwd_rule(
                 is_colwise=False,
                 data_layout="N",
                 flatten_axis=ctx_kernel.flatten_axis,
-                group_sizes=ctx_kernel.group_sizes,
+                first_dims=ctx_kernel.first_dims,
+                last_dims=ctx_kernel.last_dims,
                 original_shape=kernel_shape,
                 group_axis=ctx_kernel.group_axis,
             )
@@ -507,7 +509,8 @@ def _grouped_dense_fwd_rule(
                     is_colwise=True,
                     data_layout="T",
                     flatten_axis=ctx_kernel.flatten_axis,
-                    group_sizes=ctx_kernel.group_sizes,
+                    first_dims=ctx_kernel.first_dims,
+                    last_dims=ctx_kernel.last_dims,
                     original_shape=kernel_shape,
                     group_axis=ctx_kernel.group_axis,
                 )
@@ -518,15 +521,29 @@ def _grouped_dense_fwd_rule(
         # This is needed especially when kernel_fsdp_enabled == True AND FP8 enabled.
         quantizer_set.kernel.q_layout = original_quantizer_set_kernel_q_layout
 
+    if is_noop_quantizer_set:
+        grouped_gemm_x = GroupedNoScaleTensor(
+            data=grouped_gemm_x,
+            first_dims=group_sizes,
+            last_dims=None,
+            group_axis=0,
+            original_shape=grouped_gemm_x.shape,
+        )
+        grouped_gemm_kernel = GroupedNoScaleTensor(
+            data=grouped_gemm_kernel,
+            first_dims=None,
+            last_dims=None,
+            group_axis=0,
+            original_shape=grouped_gemm_kernel.shape,
+        )
     output = tex.grouped_gemm(
         grouped_gemm_x,
         grouped_gemm_kernel,
-        group_sizes,
-        contracting_dims,
-        bias,
-        precision,
-        preferred_element_type,
-        group_offset,
+        contracting_dims=contracting_dims,
+        bias=bias,
+        precision=precision,
+        preferred_element_type=preferred_element_type,
+        group_offset=group_offset,
     )
 
     ctx = (
@@ -610,11 +627,39 @@ def _grouped_dense_bwd_rule(
         wgrad_x_T = ctx_x
         wgrad_grad = casted_grad.get_tensor(usage=TensorUsage.RHS)
 
+    if is_noop_quantizer_set:
+        dgrad_grad = GroupedNoScaleTensor(
+            data=dgrad_grad,
+            first_dims=group_sizes,
+            last_dims=None,
+            group_axis=0,
+            original_shape=dgrad_grad.shape,
+        )
+        dgrad_kernel_T = GroupedNoScaleTensor(
+            data=dgrad_kernel_T,
+            first_dims=None,
+            last_dims=None,
+            group_axis=0,
+            original_shape=dgrad_kernel_T.shape,
+        )
+        wgrad_x_T = GroupedNoScaleTensor(
+            data=wgrad_x_T,
+            first_dims=group_sizes,
+            last_dims=None,
+            group_axis=0,
+            original_shape=wgrad_x_T.shape,
+        )
+        wgrad_grad = GroupedNoScaleTensor(
+            data=wgrad_grad,
+            first_dims=group_sizes,
+            last_dims=None,
+            group_axis=0,
+            original_shape=wgrad_grad.shape,
+        )
     dgrad = tex.grouped_gemm(
         dgrad_grad,
         dgrad_kernel_T,
-        group_sizes,
-        dgrad_contracting_dims,
+        contracting_dims=dgrad_contracting_dims,
         precision=precision,
         preferred_element_type=preferred_element_type,
         group_offset=group_offset,
@@ -623,8 +668,7 @@ def _grouped_dense_bwd_rule(
     wgrad = tex.grouped_gemm(
         wgrad_x_T,
         wgrad_grad,
-        group_sizes,
-        wgrad_contracting_dims,
+        contracting_dims=wgrad_contracting_dims,
         precision=precision,
         preferred_element_type=preferred_element_type,
         group_offset=group_offset,
diff --git a/transformer_engine/jax/quantize/dequantizer.py b/transformer_engine/jax/quantize/dequantizer.py
index 74787b9308..5075f1a664 100644
--- a/transformer_engine/jax/quantize/dequantizer.py
+++ b/transformer_engine/jax/quantize/dequantizer.py
@@ -275,7 +275,17 @@ def _grouped_dequantize(grouped_scaled_tensor):
     """
     data = grouped_scaled_tensor.data
     scale_inv = grouped_scaled_tensor.scale_inv
-    group_sizes = grouped_scaled_tensor.group_sizes
+    group_sizes = (
+        grouped_scaled_tensor.first_dims
+        if grouped_scaled_tensor.first_dims is not None
+        and grouped_scaled_tensor.first_dims.size > 0
+        else grouped_scaled_tensor.last_dims
+    )
+    # For non-ragged groups (kernel case), group_sizes is not stored; derive from original_shape
+    if group_sizes is None:
+        group_sizes = jnp.ones(
+            grouped_scaled_tensor.original_shape[grouped_scaled_tensor.group_axis], dtype=jnp.int32
+        )
     flatten_axis = grouped_scaled_tensor.flatten_axis
     scaling_mode = grouped_scaled_tensor.scaling_mode
     original_shape = grouped_scaled_tensor.original_shape
diff --git a/transformer_engine/jax/quantize/quantizer.py b/transformer_engine/jax/quantize/quantizer.py
index f5ca6aeaed..55dd7f5618 100644
--- a/transformer_engine/jax/quantize/quantizer.py
+++ b/transformer_engine/jax/quantize/quantizer.py
@@ -948,7 +948,7 @@ def _create_grouped_tensor_from_tensor_list(
             is_colwise=tensor_list[0].is_colwise,
             data_layout=tensor_list[0].data_layout,
             flatten_axis=tensor_list[0].flatten_axis,
-            group_sizes=group_sizes,
+            first_dims=group_sizes,
             original_shape=original_shape,
             group_axis=group_axis,
         )
diff --git a/transformer_engine/jax/quantize/tensor.py b/transformer_engine/jax/quantize/tensor.py
index c26cb8a531..316e4f3139 100644
--- a/transformer_engine/jax/quantize/tensor.py
+++ b/transformer_engine/jax/quantize/tensor.py
@@ -9,7 +9,7 @@
 rowwise and colwise quantization modes with proper scaling and dequantization.
 """
 from dataclasses import dataclass
-from typing import Callable, Tuple
+from typing import Callable, Optional, Tuple
 from abc import ABC, abstractmethod
 
 import jax.numpy as jnp
@@ -32,6 +32,7 @@
     "ScaledTensor1x",
     "ScaledTensor2x",
     "GroupedScaledTensor1x",
+    "GroupedNoScaleTensor",
     "ScaledTensorFactory",
     "with_sharding_constraint_by_logical_axes",
 ]
@@ -365,12 +366,14 @@ class GroupedScaledTensor1x(ScaledTensor1x):
     where elements are grouped along a specified axis.
 
     Attributes:
-        group_sizes: Array containing the size of each group
+        first_dims: Per-group sizes of the first (row) 2D dim, or None if not ragged
+        last_dims: Per-group sizes of the last (col) 2D dim, or None if not ragged
         original_shape: The original shape of the tensor before grouping
         group_axis: The axis along which grouping is performed (default: 0)
     """
 
-    group_sizes: jnp.ndarray
+    first_dims: Optional[jnp.ndarray]
+    last_dims: Optional[jnp.ndarray]
     original_shape: Tuple
     group_axis: int
 
@@ -379,7 +382,7 @@ def __init__(
         data,
         scale_inv,
         amax,
-        group_sizes,
+        first_dims,
         scaling_mode,
         dq_dtype,
         _dq_func,
@@ -388,9 +391,11 @@ def __init__(
         flatten_axis,
         original_shape,
         group_axis=0,
+        last_dims=None,
     ):
         self.flatten_axis = flatten_axis
-        self.group_sizes = group_sizes
+        self.first_dims = first_dims
+        self.last_dims = last_dims
         self.original_shape = original_shape
         self.group_axis = group_axis
         # TODO(Phuong):Handle RHT for grouped quantization once grouped quantization supports NVFP4
@@ -422,9 +427,19 @@ def __post_init__(self):
             0 <= self.group_axis < data_ndim
         ), f"group_axis {self.group_axis} is out of bounds for shape {self.original_shape}"
 
+        active_dims = (
+            self.first_dims
+            if self.first_dims is not None and self.first_dims.size > 0
+            else self.last_dims
+        )
+        if active_dims is not None:
+            num_groups = active_dims.size
+        else:
+            num_groups = self.original_shape[self.group_axis]
+
         expected_scale_shape = self.scaling_mode.get_grouped_scale_shape(
             self.original_shape,
-            self.group_sizes.size,
+            num_groups,
             self.group_axis,
             self.is_colwise,
             is_padded=True,
@@ -442,7 +457,7 @@ def tree_flatten(self):
         Returns:
             A tuple containing (children, aux_data) for tree operations
         """
-        children = (self.data, self.scale_inv, self.amax, self.group_sizes)
+        children = (self.data, self.scale_inv, self.amax, self.first_dims, self.last_dims)
         aux_data = (
             self.scaling_mode,
             self.dq_dtype,
@@ -455,6 +470,36 @@ def tree_flatten(self):
         )
         return (children, aux_data)
 
+    @classmethod
+    def tree_unflatten(cls, aux_data, children):
+        """Reconstructs the tensor from the (aux_data, children) pair produced by tree_flatten."""
+        data, scale_inv, amax, first_dims, last_dims = children  # same order as tree_flatten
+        (
+            scaling_mode,
+            dq_dtype,
+            _dq_func,
+            is_colwise,
+            data_layout,
+            flatten_axis,
+            original_shape,
+            group_axis,
+        ) = aux_data  # NOTE(review): must mirror tree_flatten's aux_data order — confirm
+        return cls(  # keywords: __init__ takes last_dims after group_axis, not after first_dims
+            data=data,
+            scale_inv=scale_inv,
+            amax=amax,
+            first_dims=first_dims,
+            last_dims=last_dims,
+            scaling_mode=scaling_mode,
+            dq_dtype=dq_dtype,
+            _dq_func=_dq_func,
+            is_colwise=is_colwise,
+            data_layout=data_layout,
+            flatten_axis=flatten_axis,
+            original_shape=original_shape,
+            group_axis=group_axis,
+        )
+
     def apply_sharding_constraint_by_logical_axes(self, logical_axis_names: Tuple[str, ...]):
         raise NotImplementedError
 
@@ -473,6 +518,52 @@ def checkpoint(self, quantizer):
         return jax_checkpoint_name(self, name=quantizer.checkpoint_name)
 
 
+@register_pytree_node_class
+@dataclass
+class GroupedNoScaleTensor:
+    """Unquantized grouped tensor, registered as a JAX pytree node.
+
+    Stores N-D data with per-group dimension sizes so that grouped_gemm()
+    can extract first/last dims automatically without explicit parameters.
+
+    Attributes:
+        data: The raw (unquantized) tensor data in N-D layout (pytree child)
+        first_dims: Per-group sizes of the first (row) 2D dim, or None if not ragged (child)
+        last_dims: Per-group sizes of the last (col) 2D dim, or None if not ragged (child)
+        group_axis: Which axis of original_shape is the group batch prefix (pytree aux data)
+        original_shape: Shape of data (same as data.shape for N-D unquantized) (aux data)
+    """
+
+    data: jnp.ndarray
+    first_dims: Optional[jnp.ndarray]
+    last_dims: Optional[jnp.ndarray]
+    group_axis: int
+    original_shape: Tuple
+
+    def tree_flatten(self):
+        """Flattens into children (data, first_dims, last_dims) and aux_data for JAX tree ops."""
+        children = (self.data, self.first_dims, self.last_dims)  # first_dims/last_dims may be None
+        aux_data = (self.group_axis, self.original_shape)  # order must match tree_unflatten
+        return (children, aux_data)
+
+    @classmethod
+    def tree_unflatten(cls, aux_data, children):
+        """Reconstructs the tensor from the (aux_data, children) pair produced by tree_flatten."""
+        group_axis, original_shape = aux_data  # same order as packed in tree_flatten
+        data, first_dims, last_dims = children  # same order as packed in tree_flatten
+        return cls(
+            data=data,
+            first_dims=first_dims,
+            last_dims=last_dims,
+            group_axis=group_axis,
+            original_shape=original_shape,
+        )
+
+    def dequantize(self):
+        """No-op dequantization — data is stored unquantized, so it is returned as is."""
+        return self.data
+
+
 @register_pytree_node_class
 @dataclass
 class ScaledTensor2x(AbstractBaseTensor, ScaledTensor):
@@ -570,7 +661,8 @@ def create_1x(
         is_colwise=False,
         data_layout="N",
         flatten_axis=-1,
-        group_sizes=None,
+        first_dims=None,
+        last_dims=None,
         original_shape=None,
         group_axis=0,
         has_rht_applied=False,
@@ -586,29 +678,44 @@ def create_1x(
             is_colwise: Whether to use column-wise quantization (default: False)
             data_layout: The data_layout specification (default: "N")
             flatten_axis: The quantization axis for the tensor
-            group_sizes: Array of ints containing the size of each group (default: None)
+            first_dims: Per-group sizes of the first (row) 2D dim (default: None)
+            last_dims: Per-group sizes of the last (col) 2D dim (default: None)
             original_shape: The original shape of the tensor before grouping (default: None)
             group_axis: The axis along which grouping is performed (default: 0)
             has_rht_applied: Whether the tensor had the Randomized Hadamard Transform (RHT) applied during quantization (default: False)
 
         Returns:
-            A ScaledTensor1x or GroupedScaledTensor1x instance depending on whether group_sizes is provided
+            A GroupedScaledTensor1x if first_dims, last_dims, or original_shape (with a non-None group_axis) is provided; otherwise a ScaledTensor1x
         """
         if amax is None:
             amax = jnp.empty((1,), dtype=jnp.float32)
 
         dequantizer = ScalingModeToDequantizerMap.get(scaling_mode)
 
-        if group_sizes is not None:
-            flatten_axis = (len(original_shape) + flatten_axis) % len(original_shape)
+        if (
+            first_dims is not None
+            or last_dims is not None
+            or (original_shape is not None and group_axis is not None)
+        ):
             assert (
                 original_shape is not None
             ), "original_shape is not given for GroupedScaledTensor1x"
+            flatten_axis = (len(original_shape) + flatten_axis) % len(original_shape)
+
+            # Determine num_groups from whichever dims array is provided, or from original_shape
+            active_dims = (
+                first_dims if first_dims is not None and first_dims.size > 0 else last_dims
+            )
+            if active_dims is not None:
+                num_groups = active_dims.size
+            else:
+                norm_group_axis = (len(original_shape) + group_axis) % len(original_shape)
+                num_groups = original_shape[norm_group_axis]
 
             # Handling attrs of transposed tensors
             group_axis = (len(original_shape) + group_axis) % len(original_shape)
             if data_layout == "T":
-                if original_shape[0] == group_sizes.size:
+                if original_shape[0] == num_groups:
                     original_shape = (
                         original_shape[0],
                         *original_shape[flatten_axis:],
@@ -633,7 +740,8 @@ def create_1x(
                 is_colwise=is_colwise,
                 data_layout=data_layout,
                 flatten_axis=flatten_axis,
-                group_sizes=group_sizes,
+                first_dims=first_dims,
+                last_dims=last_dims,
                 original_shape=original_shape,
                 group_axis=group_axis,
             )
@@ -668,7 +776,8 @@ def create_2x(
         dq_dtype=jnp.bfloat16,
         data_layout="NN",
         flatten_axis=-1,
-        group_sizes=None,
+        first_dims=None,
+        last_dims=None,
         original_shape=None,
         group_axis=0,
         rowwise_has_rht_applied=False,
@@ -686,7 +795,8 @@ def create_2x(
             dq_dtype: The data type for dequantized values (default: bfloat16)
             data_layout: The data_layout specification (default: "NN")
             flatten_axis: The quantization axis for the tensor
-            group_sizes: Array containing the size of each group (default: None)
+            first_dims: Per-group sizes of the first (row) 2D dim (default: None)
+            last_dims: Per-group sizes of the last (col) 2D dim (default: None)
             original_shape: The original shape of the tensor before grouping (default: None)
             group_axis: The axis along which grouping is performed (default: 0)
             rowwise_has_rht_applied: Whether the row-wise tensor uses the Randomized Hadamard Transform (RHT) (default: False)
@@ -710,7 +820,8 @@ def create_2x(
             is_colwise=False,
             data_layout=data_layout[0],
             flatten_axis=flatten_axis,
-            group_sizes=group_sizes,
+            first_dims=first_dims,
+            last_dims=last_dims,
             original_shape=original_shape,
             group_axis=group_axis,
             has_rht_applied=rowwise_has_rht_applied,
@@ -724,7 +835,8 @@ def create_2x(
             is_colwise=True,
             data_layout=data_layout[1],
             flatten_axis=flatten_axis,
-            group_sizes=group_sizes,
+            first_dims=first_dims,
+            last_dims=last_dims,
             original_shape=original_shape,
             group_axis=group_axis,
             has_rht_applied=colwise_has_rht_applied,
@@ -744,7 +856,8 @@ def create(
         data_layout: str = "NN",
         q_layout: QuantizeLayout = QuantizeLayout.ROWWISE,
         flatten_axis: int = -1,
-        group_sizes: jnp.ndarray = None,
+        first_dims: jnp.ndarray = None,
+        last_dims: jnp.ndarray = None,
         original_shape: Tuple[int] = None,
         group_axis: int = 0,
         rowwise_has_rht_applied: bool = False,
@@ -762,7 +875,8 @@ def create(
             data_layout: The data_layout specification (default: "NN")
             q_layout: The quantization axis (default: ROWWISE)
             flatten_axis: The axis along which the tensor could be flattened to 2D (default: -1)
-            group_sizes: Array containing the size of each group (default: None)
+            first_dims: Per-group sizes of the first (row) 2D dim (default: None)
+            last_dims: Per-group sizes of the last (col) 2D dim (default: None)
             original_shape: The original shape of the tensor before grouping (default: None)
             group_axis: The axis along which grouping is performed (default: 0)
             rowwise_has_rht_applied: Whether the row-wise tensor uses the Randomized Hadamard Transform (RHT) (default: False)
@@ -785,7 +899,8 @@ def create(
                 dq_dtype,
                 data_layout=data_layout,
                 flatten_axis=flatten_axis,
-                group_sizes=group_sizes,
+                first_dims=first_dims,
+                last_dims=last_dims,
                 original_shape=original_shape,
                 group_axis=group_axis,
                 rowwise_has_rht_applied=rowwise_has_rht_applied,
@@ -802,7 +917,8 @@ def create(
                 is_colwise=True,
                 data_layout=data_layout[0],
                 flatten_axis=flatten_axis,
-                group_sizes=group_sizes,
+                first_dims=first_dims,
+                last_dims=last_dims,
                 original_shape=original_shape,
                 group_axis=group_axis,
                 has_rht_applied=colwise_has_rht_applied,
@@ -817,7 +933,8 @@ def create(
             is_colwise=False,
             data_layout=data_layout[0],
             flatten_axis=flatten_axis,
-            group_sizes=group_sizes,
+            first_dims=first_dims,
+            last_dims=last_dims,
             original_shape=original_shape,
             group_axis=group_axis,
             has_rht_applied=rowwise_has_rht_applied,