From 79a53a82cf8df27f35a96e147ff9dfcc9cf617d9 Mon Sep 17 00:00:00 2001
From: Ashley Dunfield <ashdunfield@gmail.com>
Date: Tue, 31 Mar 2026 08:31:54 -0600
Subject: [PATCH] Add SVE2 BitPerm intrinsics (BDEP, BEXT, BGRP)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add Rust bindings for the three SVE2 bit-permutation instructions
(FEAT_SVE_BitPerm): BDEP (bit deposit), BEXT (bit extract), and
BGRP (bit group). These are the scalable vector equivalents of
x86 BMI2 PDEP/PEXT.

Intrinsics added (12 total):
- svbdep_u{8,16,32,64} — scatter source bits to mask positions
- svbext_u{8,16,32,64} — gather bits from mask positions
- svbgrp_u{8,16,32,64} — partition bits into two groups by mask

All gated by #[target_feature(enable = "sve2-bitperm")] and the
unstable feature flag stdarch_aarch64_sve2_bitperm.

Also adds SVE scalable vector type definitions (svuint8_t through
svint64_t) using #[rustc_scalable_vector(N)] from rust#143924,
gated by stdarch_aarch64_sve.

Tested on AWS Graviton 4 (c8g) which has FEAT_SVE2_BitPerm.
LLVM intrinsic names verified via Clang assembly output.
Scalar reference tests pass on both x86_64 and aarch64.
---
 crates/core_arch/src/aarch64/mod.rs           |   4 +
 crates/core_arch/src/aarch64/sve/mod.rs       |  33 ++
 .../core_arch/src/aarch64/sve/sve2_bitperm.rs | 418 ++++++++++++++++++
 crates/core_arch/src/aarch64/sve/types.rs     |  96 ++++
 4 files changed, 551 insertions(+)
 create mode 100644 crates/core_arch/src/aarch64/sve/mod.rs
 create mode 100644 crates/core_arch/src/aarch64/sve/sve2_bitperm.rs
 create mode 100644 crates/core_arch/src/aarch64/sve/types.rs

diff --git a/crates/core_arch/src/aarch64/mod.rs b/crates/core_arch/src/aarch64/mod.rs
index d7295659c3..c429c098f7 100644
--- a/crates/core_arch/src/aarch64/mod.rs
+++ b/crates/core_arch/src/aarch64/mod.rs
@@ -29,6 +29,10 @@ mod prefetch;
 #[unstable(feature = "stdarch_aarch64_prefetch", issue = "117217")]
 pub use self::prefetch::*;
 
+mod sve;
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub use self::sve::*;
+
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub use super::arm_shared::*;
 
diff --git a/crates/core_arch/src/aarch64/sve/mod.rs b/crates/core_arch/src/aarch64/sve/mod.rs
new file mode 100644
index 0000000000..d6b8bc1c49
--- /dev/null
+++ b/crates/core_arch/src/aarch64/sve/mod.rs
@@ -0,0 +1,33 @@
+//! ARM SVE (Scalable Vector Extension) intrinsics.
+//!
+//! This module provides Rust bindings for ARM SVE and SVE2 SIMD intrinsics.
+//! SVE vectors are *scalable* — their width is determined at runtime by the
+//! hardware (128–2048 bits in 128-bit increments). The types defined here
+//! use `#[rustc_scalable_vector]` to represent these runtime-sized vectors.
+//!
+//! # SVE2 Bit Permutation (FEAT_SVE_BitPerm)
+//!
+//! The `sve2_bitperm` sub-module exposes three bit-permutation instructions
+//! added by the SVE2 BitPerm extension:
+//!
+//! - **BDEP** (Bit Deposit) — scatter source bits into positions selected by a mask.
+//!   Equivalent to x86 `PDEP`.
+//! - **BEXT** (Bit Extract) — gather bits from positions selected by a mask.
+//!   Equivalent to x86 `PEXT`.
+//! - **BGRP** (Bit Group) — partition bits into two groups: those selected by the
+//!   mask (placed at the bottom) and those not selected (placed at the top).
+//!
+//! Gated by `#[target_feature(enable = "sve2-bitperm")]`.
+//!
+//! # References
+//!
+//! - [ARM Architecture Reference Manual — SVE2](https://developer.arm.com/documentation/ddi0602/)
+//! - [ARM C Language Extensions (ACLE) for SVE](https://arm-software.github.io/acle/main/acle-sve.html)
+
+mod types;
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub use self::types::*;
+
+mod sve2_bitperm;
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub use self::sve2_bitperm::*;
diff --git a/crates/core_arch/src/aarch64/sve/sve2_bitperm.rs b/crates/core_arch/src/aarch64/sve/sve2_bitperm.rs
new file mode 100644
index 0000000000..7b6067a44c
--- /dev/null
+++ b/crates/core_arch/src/aarch64/sve/sve2_bitperm.rs
@@ -0,0 +1,418 @@
+#![allow(improper_ctypes)]
+
+//! SVE2 Bit Permutation intrinsics (FEAT_SVE_BitPerm).
+//!
+//! Three bit-permutation instructions operating on scalable vectors:
+//!
+//! | Instruction | ACLE           | x86 equivalent | Operation                                     |
+//! |-------------|----------------|----------------|-----------------------------------------------|
+//! | `BDEP`      | `svbdep[_u*]`  | `PDEP`         | Scatter source bits to mask-selected positions |
+//! | `BEXT`      | `svbext[_u*]`  | `PEXT`         | Gather bits from mask-selected positions       |
+//! | `BGRP`      | `svbgrp[_u*]`  | —              | Partition bits by mask into two groups          |
+//!
+//! All intrinsics require `#[target_feature(enable = "sve2-bitperm")]`.
+//!
+//! # References
+//!
+//! - [ARM Architecture Reference Manual — SVE2 BDEP, BEXT, BGRP](https://developer.arm.com/documentation/ddi0602/)
+//! - [ARM ACLE for SVE2](https://arm-software.github.io/acle/main/acle-sve2.html)
+
+#[cfg(test)]
+use stdarch_test::assert_instr;
+
+use super::types::*;
+
+// ---------------------------------------------------------------------------
+// LLVM intrinsic declarations
+// ---------------------------------------------------------------------------
+//
+// The `.x.` suffix indicates unpredicated (don't-care predication) variants.
+// SVE2 bitperm instructions do not use governing predicates.
+//
+// Naming: llvm.aarch64.sve.{bdep|bext|bgrp}.x.nxv{N}i{M}
+
+// --- BDEP ---
+
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bdep.x.nxv16i8")]
+    fn _svbdep_u8(op1: svuint8_t, op2: svuint8_t) -> svuint8_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bdep.x.nxv8i16")]
+    fn _svbdep_u16(op1: svuint16_t, op2: svuint16_t) -> svuint16_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bdep.x.nxv4i32")]
+    fn _svbdep_u32(op1: svuint32_t, op2: svuint32_t) -> svuint32_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bdep.x.nxv2i64")]
+    fn _svbdep_u64(op1: svuint64_t, op2: svuint64_t) -> svuint64_t;
+}
+
+// --- BEXT ---
+
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bext.x.nxv16i8")]
+    fn _svbext_u8(op1: svuint8_t, op2: svuint8_t) -> svuint8_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bext.x.nxv8i16")]
+    fn _svbext_u16(op1: svuint16_t, op2: svuint16_t) -> svuint16_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bext.x.nxv4i32")]
+    fn _svbext_u32(op1: svuint32_t, op2: svuint32_t) -> svuint32_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bext.x.nxv2i64")]
+    fn _svbext_u64(op1: svuint64_t, op2: svuint64_t) -> svuint64_t;
+}
+
+// --- BGRP ---
+
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bgrp.x.nxv16i8")]
+    fn _svbgrp_u8(op1: svuint8_t, op2: svuint8_t) -> svuint8_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bgrp.x.nxv8i16")]
+    fn _svbgrp_u16(op1: svuint16_t, op2: svuint16_t) -> svuint16_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bgrp.x.nxv4i32")]
+    fn _svbgrp_u32(op1: svuint32_t, op2: svuint32_t) -> svuint32_t;
+}
+unsafe extern "unadjusted" {
+    #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.sve.bgrp.x.nxv2i64")]
+    fn _svbgrp_u64(op1: svuint64_t, op2: svuint64_t) -> svuint64_t;
+}
+
+// ===========================================================================
+// BDEP — Bit Deposit (SVE2 equivalent of x86 PDEP)
+// ===========================================================================
+//
+// For each element, scatter consecutive low bits from `data` into the bit
+// positions where `mask` has a 1. Remaining result bits are zero.
+
+/// SVE2 bit deposit (8-bit elements).
+///
+/// For each 8-bit element, scatter consecutive low bits from `data` into
+/// the bit positions selected by `mask`.
+///
+/// Equivalent to x86 BMI2 `PDEP` but operating on scalable vectors.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbdep[_u8])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bdep))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbdep_u8(data: svuint8_t, mask: svuint8_t) -> svuint8_t {
+    unsafe { _svbdep_u8(data, mask) }
+}
+
+/// SVE2 bit deposit (16-bit elements).
+///
+/// For each 16-bit element, scatter consecutive low bits from `data` into
+/// the bit positions selected by `mask`.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbdep[_u16])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bdep))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbdep_u16(data: svuint16_t, mask: svuint16_t) -> svuint16_t {
+    unsafe { _svbdep_u16(data, mask) }
+}
+
+/// SVE2 bit deposit (32-bit elements).
+///
+/// For each 32-bit element, scatter consecutive low bits from `data` into
+/// the bit positions selected by `mask`.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbdep[_u32])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bdep))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbdep_u32(data: svuint32_t, mask: svuint32_t) -> svuint32_t {
+    unsafe { _svbdep_u32(data, mask) }
+}
+
+/// SVE2 bit deposit (64-bit elements).
+///
+/// For each 64-bit element, scatter consecutive low bits from `data` into
+/// the bit positions selected by `mask`.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbdep[_u64])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bdep))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbdep_u64(data: svuint64_t, mask: svuint64_t) -> svuint64_t {
+    unsafe { _svbdep_u64(data, mask) }
+}
+
+// ===========================================================================
+// BEXT — Bit Extract (SVE2 equivalent of x86 PEXT)
+// ===========================================================================
+//
+// For each element, gather bits from `data` at positions where `mask` has
+// a 1 and pack them into consecutive low bits of the result.
+
+/// SVE2 bit extract (8-bit elements).
+///
+/// For each 8-bit element, gather bits from `data` at the positions
+/// selected by `mask` and pack them into consecutive low bits.
+///
+/// Equivalent to x86 BMI2 `PEXT` but operating on scalable vectors.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbext[_u8])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bext))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbext_u8(data: svuint8_t, mask: svuint8_t) -> svuint8_t {
+    unsafe { _svbext_u8(data, mask) }
+}
+
+/// SVE2 bit extract (16-bit elements).
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbext[_u16])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bext))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbext_u16(data: svuint16_t, mask: svuint16_t) -> svuint16_t {
+    unsafe { _svbext_u16(data, mask) }
+}
+
+/// SVE2 bit extract (32-bit elements).
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbext[_u32])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bext))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbext_u32(data: svuint32_t, mask: svuint32_t) -> svuint32_t {
+    unsafe { _svbext_u32(data, mask) }
+}
+
+/// SVE2 bit extract (64-bit elements).
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbext[_u64])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bext))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbext_u64(data: svuint64_t, mask: svuint64_t) -> svuint64_t {
+    unsafe { _svbext_u64(data, mask) }
+}
+
+// ===========================================================================
+// BGRP — Bit Group (no x86 equivalent)
+// ===========================================================================
+//
+// For each element, partition bits of `data` into two groups by `mask`:
+// - Bits where mask=1 are packed into the low portion of the result
+// - Bits where mask=0 are packed into the high portion
+//
+// Formally: result = BEXT(data, mask) | (BEXT(data, ~mask) << popcount(mask))
+
+/// SVE2 bit group (8-bit elements).
+///
+/// For each 8-bit element, partition the bits of `data` into two groups
+/// based on `mask`: bits at mask-selected positions are packed into the
+/// low bits, remaining bits are packed into the high bits.
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbgrp[_u8])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bgrp))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbgrp_u8(data: svuint8_t, mask: svuint8_t) -> svuint8_t {
+    unsafe { _svbgrp_u8(data, mask) }
+}
+
+/// SVE2 bit group (16-bit elements).
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbgrp[_u16])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bgrp))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbgrp_u16(data: svuint16_t, mask: svuint16_t) -> svuint16_t {
+    unsafe { _svbgrp_u16(data, mask) }
+}
+
+/// SVE2 bit group (32-bit elements).
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbgrp[_u32])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bgrp))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbgrp_u32(data: svuint32_t, mask: svuint32_t) -> svuint32_t {
+    unsafe { _svbgrp_u32(data, mask) }
+}
+
+/// SVE2 bit group (64-bit elements).
+///
+/// [Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/svbgrp[_u64])
+#[inline(always)]
+#[target_feature(enable = "sve2-bitperm")]
+#[cfg_attr(test, assert_instr(bgrp))]
+#[unstable(feature = "stdarch_aarch64_sve2_bitperm", issue = "none")]
+pub fn svbgrp_u64(data: svuint64_t, mask: svuint64_t) -> svuint64_t {
+    unsafe { _svbgrp_u64(data, mask) }
+}
+
+// ===========================================================================
+// Scalar broadcast variants (_n_ suffix)
+// ===========================================================================
+//
+// ACLE defines `svbdep_n_u*` / `svbext_n_u*` / `svbgrp_n_u*` where the
+// second operand is a scalar broadcast to all lanes. These depend on
+// `svdup_n_u*` (SVE DUP instruction) which will be provided by the base
+// SVE intrinsics module. They are intentionally deferred to that PR to
+// keep this contribution minimal and self-contained.
+//
+// TODO: Add _n_ variants once base SVE1 intrinsics (svdup_n_u*) land.
+
+// ===========================================================================
+// Tests — Scalar reference implementations
+// ===========================================================================
+
+#[cfg(test)]
+mod tests {
+    // Scalar reference implementations for correctness verification.
+    // These run on any host (no SVE hardware needed).
+    //
+    // Integration tests that call the actual SVE2 intrinsics require either:
+    // - Hardware with SVE2 + BitPerm (e.g., AWS Graviton 3)
+    // - QEMU user-mode: qemu-aarch64 -cpu max ./test_binary
+
+    /// Scalar BDEP: scatter consecutive low bits of `data` to mask-selected positions.
+    fn scalar_bdep(data: u64, mask: u64) -> u64 {
+        let mut result: u64 = 0;
+        let mut m = mask;
+        let mut src_bit: u32 = 0;
+        while m != 0 {
+            let pos = m.trailing_zeros();
+            if (data >> src_bit) & 1 != 0 {
+                result |= 1u64 << pos;
+            }
+            m &= m - 1; // clear lowest set bit
+            src_bit += 1;
+        }
+        result
+    }
+
+    /// Scalar BEXT: gather bits from mask-selected positions into consecutive low bits.
+    fn scalar_bext(data: u64, mask: u64) -> u64 {
+        let mut result: u64 = 0;
+        let mut m = mask;
+        let mut dst_bit: u32 = 0;
+        while m != 0 {
+            let pos = m.trailing_zeros();
+            if (data >> pos) & 1 != 0 {
+                result |= 1u64 << dst_bit;
+            }
+            m &= m - 1;
+            dst_bit += 1;
+        }
+        result
+    }
+
+    /// Scalar BGRP: partition bits by mask — selected bits low, unselected bits high.
+    fn scalar_bgrp(data: u64, mask: u64) -> u64 {
+        let selected = scalar_bext(data, mask);
+        let not_selected = scalar_bext(data, !mask);
+        let shift = mask.count_ones();
+        if shift >= 64 { selected } else { selected | (not_selected << shift) }
+    }
+
+    // --- Correctness of reference implementations ---
+
+    #[test]
+    fn test_bdep_bext_inverse_property() {
+        // Key property: BEXT(BDEP(x, m), m) == x & ((1 << popcount(m)) - 1)
+        let data: u64 = 0b1011;
+        let mask: u64 = 0b1010_1010;
+
+        let deposited = scalar_bdep(data, mask);
+        // bits 1,0,1,1 deposited at positions 1,3,5,7
+        // data bits 1,1,0,1 deposited at mask positions 1,3,5,7
+        // = (1<<1) | (1<<3) | (0<<5) | (1<<7) = 2+8+128 = 0b1000_1010
+        assert_eq!(deposited, 0b1000_1010);
+
+        let extracted = scalar_bext(deposited, mask);
+        let significant = (1u64 << mask.count_ones()) - 1;
+        assert_eq!(extracted, data & significant);
+    }
+
+    #[test]
+    fn test_bdep_bext_roundtrip_exhaustive_u8() {
+        // Exhaustive roundtrip for all 8-bit values
+        for mask in 0u8..=255 {
+            let popcount = mask.count_ones();
+            let max_data = if popcount >= 8 { 255u64 } else { (1u64 << popcount) - 1 };
+            for data in 0..=max_data.min(255) {
+                let deposited = scalar_bdep(data, mask as u64) as u8;
+                let extracted = scalar_bext(deposited as u64, mask as u64) as u8;
+                assert_eq!(
+                    extracted,
+                    (data as u8) & (max_data as u8),
+                    "roundtrip failed: data={data:#010b}, mask={mask:#010b}"
+                );
+            }
+        }
+    }
+
+    #[test]
+    fn test_bgrp_partitions_correctly() {
+        let data: u64 = 0b1100_1010;
+        let mask: u64 = 0b1111_0000;
+
+        let grouped = scalar_bgrp(data, mask);
+        // mask=1 positions (bits 7-4) of data: 0b1100 → packed low
+        // mask=0 positions (bits 3-0) of data: 0b1010 → packed high
+        // result = 0b1100 | (0b1010 << 4) = 0b1010_1100
+        assert_eq!(grouped, 0b1010_1100);
+    }
+
+    #[test]
+    fn test_bdep_zero_mask() {
+        assert_eq!(scalar_bdep(0xFF, 0x00), 0);
+    }
+
+    #[test]
+    fn test_bext_zero_mask() {
+        assert_eq!(scalar_bext(0xFF, 0x00), 0);
+    }
+
+    #[test]
+    fn test_bdep_full_mask() {
+        assert_eq!(scalar_bdep(0xAB, 0xFF), 0xAB);
+    }
+
+    #[test]
+    fn test_bext_full_mask() {
+        assert_eq!(scalar_bext(0xAB, 0xFF), 0xAB);
+    }
+
+    #[test]
+    fn test_bgrp_full_mask() {
+        // All bits selected → identity
+        assert_eq!(scalar_bgrp(0xAB, 0xFF) as u8, 0xAB);
+    }
+
+    #[test]
+    fn test_bgrp_zero_mask() {
+        // No bits selected → all bits shift up by 0 → data unchanged (moved to "high" group)
+        let data: u64 = 0xAB;
+        let mask: u64 = 0x00;
+        // popcount(0) = 0, so not_selected << 0 = data, selected = 0
+        assert_eq!(scalar_bgrp(data, mask) as u8, data as u8);
+    }
+}
diff --git a/crates/core_arch/src/aarch64/sve/types.rs b/crates/core_arch/src/aarch64/sve/types.rs
new file mode 100644
index 0000000000..69f2828531
--- /dev/null
+++ b/crates/core_arch/src/aarch64/sve/types.rs
@@ -0,0 +1,96 @@
+//! SVE scalable vector type definitions.
+//!
+//! These types represent ARM SVE scalable vectors whose width is determined
+//! at runtime by the hardware (128–2048 bits in 128-bit increments).
+//!
+//! Each type maps to an LLVM scalable vector IR type:
+//!
+//! | Rust type    | LLVM IR type              | ACLE C type   | Min elements |
+//! |-------------|---------------------------|---------------|-------------|
+//! | `svuint8_t`  | `<vscale x 16 x i8>`     | `svuint8_t`   | 16          |
+//! | `svuint16_t` | `<vscale x 8 x i16>`     | `svuint16_t`  | 8           |
+//! | `svuint32_t` | `<vscale x 4 x i32>`     | `svuint32_t`  | 4           |
+//! | `svuint64_t` | `<vscale x 2 x i64>`     | `svuint64_t`  | 2           |
+//! | `svint8_t`   | `<vscale x 16 x i8>`     | `svint8_t`    | 16          |
+//! | `svint16_t`  | `<vscale x 8 x i16>`     | `svint16_t`   | 8           |
+//! | `svint32_t`  | `<vscale x 4 x i32>`     | `svint32_t`   | 4           |
+//! | `svint64_t`  | `<vscale x 2 x i64>`     | `svint64_t`   | 2           |
+//!
+//! The `vscale` factor is determined by hardware: a 128-bit SVE implementation
+//! has `vscale = 1` (16 × u8 lanes), 256-bit has `vscale = 2` (32 × u8 lanes),
+//! up to 2048-bit with `vscale = 16` (256 × u8 lanes).
+//!
+//! # Compiler support
+//!
+//! These types use `#[rustc_scalable_vector(N)]` (rust-lang/rust#145052),
+//! available on nightly behind `#![feature(rustc_attrs)]`.
+
+#![allow(non_camel_case_types)]
+
+/// Scalable vector of unsigned 8-bit integers.
+///
+/// Maps to ACLE `svuint8_t` / LLVM `<vscale x 16 x i8>`.
+/// At runtime, contains `16 * vscale` elements.
+#[rustc_scalable_vector(16)]
+#[derive(Copy, Clone)]
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub struct svuint8_t(u8);
+
+/// Scalable vector of unsigned 16-bit integers.
+///
+/// Maps to ACLE `svuint16_t` / LLVM `<vscale x 8 x i16>`.
+/// At runtime, contains `8 * vscale` elements.
+#[rustc_scalable_vector(8)]
+#[derive(Copy, Clone)]
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub struct svuint16_t(u16);
+
+/// Scalable vector of unsigned 32-bit integers.
+///
+/// Maps to ACLE `svuint32_t` / LLVM `<vscale x 4 x i32>`.
+/// At runtime, contains `4 * vscale` elements.
+#[rustc_scalable_vector(4)]
+#[derive(Copy, Clone)]
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub struct svuint32_t(u32);
+
+/// Scalable vector of unsigned 64-bit integers.
+///
+/// Maps to ACLE `svuint64_t` / LLVM `<vscale x 2 x i64>`.
+/// At runtime, contains `2 * vscale` elements.
+#[rustc_scalable_vector(2)]
+#[derive(Copy, Clone)]
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub struct svuint64_t(u64);
+
+/// Scalable vector of signed 8-bit integers.
+///
+/// Maps to ACLE `svint8_t` / LLVM `<vscale x 16 x i8>`.
+#[rustc_scalable_vector(16)]
+#[derive(Copy, Clone)]
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub struct svint8_t(i8);
+
+/// Scalable vector of signed 16-bit integers.
+///
+/// Maps to ACLE `svint16_t` / LLVM `<vscale x 8 x i16>`.
+#[rustc_scalable_vector(8)]
+#[derive(Copy, Clone)]
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub struct svint16_t(i16);
+
+/// Scalable vector of signed 32-bit integers.
+///
+/// Maps to ACLE `svint32_t` / LLVM `<vscale x 4 x i32>`.
+#[rustc_scalable_vector(4)]
+#[derive(Copy, Clone)]
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub struct svint32_t(i32);
+
+/// Scalable vector of signed 64-bit integers.
+///
+/// Maps to ACLE `svint64_t` / LLVM `<vscale x 2 x i64>`.
+#[rustc_scalable_vector(2)]
+#[derive(Copy, Clone)]
+#[unstable(feature = "stdarch_aarch64_sve", issue = "none")]
+pub struct svint64_t(i64);