// mirror of https://github.com/rust-lang/rust.git
// synced 2026-04-27 18:57:42 +03:00
//! Hexagon HVX Gaussian 3x3 blur example
//!
//! This example demonstrates the use of Hexagon HVX intrinsics to implement
//! a 3x3 Gaussian blur filter on unsigned 8-bit images.
//!
//! The 3x3 Gaussian kernel is:
//! 1 2 1
//! 2 4 2 / 16
//! 1 2 1
//!
//! This is a separable filter: `[1 2 1]^T * [1 2 1] / 16`.
//!
//! On Hexagon targets, this implementation uses `HvxVectorPair` for widening
//! arithmetic to achieve full precision in the Gaussian computation, avoiding
//! the approximation errors of byte-averaging approaches. On other targets,
//! it runs a reference implementation in pure Rust.
//!
//! # Building and Running (Hexagon)
//!
//! To build (requires Hexagon toolchain):
//!
//! RUSTFLAGS="-C target-feature=+hvxv62,+hvx-length128b \
//! -C linker=hexagon-unknown-linux-musl-clang" \
//! cargo +nightly build -p stdarch_examples --bin gaussian \
//! --target hexagon-unknown-linux-musl \
//! -Zbuild-std -Zbuild-std-features=llvm-libunwind
//!
//! To run under QEMU:
//!
//! qemu-hexagon -L <sysroot>/target/hexagon-unknown-linux-musl \
//! target/hexagon-unknown-linux-musl/debug/gaussian
//!
//! # Building and Running (Other targets)
//!
//! cargo +nightly run -p stdarch_examples --bin gaussian
// Nightly features needed for the HVX intrinsics; gated behind cfg_attr so the
// file still compiles for non-Hexagon targets without them.
#![cfg_attr(target_arch = "hexagon", feature(stdarch_hexagon))]
#![cfg_attr(target_arch = "hexagon", feature(hexagon_target_feature))]
// Example code deliberately relaxes a few lints: raw-pointer HVX code needs
// the cast/unsafe allowances, and a demo binary is expected to print/unwrap.
#![allow(
    unsafe_op_in_unsafe_fn,
    clippy::unwrap_used,
    clippy::print_stdout,
    clippy::missing_docs_in_private_items,
    clippy::cast_possible_wrap,
    clippy::cast_ptr_alignment
)]
/// Image width in bytes - must be a multiple of the HVX vector length on
/// Hexagon (128 in 128B mode, 64 in 64B mode) so rows split into whole chunks.
const WIDTH: usize = 256;
/// Image height in rows; the blur only writes rows `1..HEIGHT - 1`.
const HEIGHT: usize = 16;
// ============================================================================
// Hexagon HVX implementation
// ============================================================================

#[cfg(target_arch = "hexagon")]
mod hvx {
    // Select the intrinsics module matching the compiled HVX vector length.
    #[cfg(not(target_feature = "hvx-length128b"))]
    use core_arch::arch::hexagon::v64::*;
    #[cfg(target_feature = "hvx-length128b")]
    use core_arch::arch::hexagon::v128::*;

    /// Vector length in bytes for HVX 128-byte mode
    #[cfg(target_feature = "hvx-length128b")]
    const VLEN: usize = 128;

    /// Vector length in bytes for HVX 64-byte mode
    #[cfg(not(target_feature = "hvx-length128b"))]
    const VLEN: usize = 64;

    /// Vertical 1-2-1 filter pass using HvxVectorPair widening arithmetic
    ///
    /// Computes: dst[x] = (row_above[x] + 2*center[x] + row_below[x] + 2) >> 2
    ///
    /// Uses HvxVectorPair to widen u8 to u16 for precise arithmetic, avoiding
    /// the rounding errors of byte-averaging approximations. Note this pass
    /// rounds on its own; combined with the rounding in the horizontal pass,
    /// the two-pass result can differ from an exact `(sum + 8) / 16` by 1 LSB.
    ///
    /// # Safety
    ///
    /// - `src` must point to the center row with valid data at -stride and +stride
    /// - `dst` must point to a valid output buffer for `width` bytes
    /// - `width` must be a multiple of VLEN
    /// - All pointers must be HVX-aligned (128-byte for 128B mode)
    #[target_feature(enable = "hvxv62")]
    unsafe fn vertical_121_pass(src: *const u8, stride: isize, width: usize, dst: *mut u8) {
        // Rows above, at, and below the output row, reinterpreted as HVX
        // vectors (callers guarantee alignment).
        let inp0 = src.offset(-stride) as *const HvxVector;
        let inp1 = src as *const HvxVector;
        let inp2 = src.offset(stride) as *const HvxVector;
        let outp = dst as *mut HvxVector;

        let n_chunks = width / VLEN;
        for i in 0..n_chunks {
            let above = *inp0.add(i);
            let center = *inp1.add(i);
            let below = *inp2.add(i);

            // Widen above + below to 16-bit using HvxVectorPair
            // Q6_Wh_vadd_VubVub: adds two u8 vectors, producing u16 results in a pair
            let above_plus_below: HvxVectorPair = Q6_Wh_vadd_VubVub(above, below);

            // Widen center * 2 (add center to itself)
            let center_x2: HvxVectorPair = Q6_Wh_vadd_VubVub(center, center);

            // Add them: (above + below) + (center * 2) = above + 2*center + below
            let sum: HvxVectorPair = Q6_Wh_vadd_WhWh(above_plus_below, center_x2);

            // Extract high and low vectors from the pair (each contains u16 values)
            let sum_lo = Q6_V_lo_W(sum); // Lower 64 elements as i16
            let sum_hi = Q6_V_hi_W(sum); // Upper 64 elements as i16

            // Arithmetic right shift by 2 (divide by 4) with rounding
            // Add 2 for rounding before shift: (sum + 2) >> 2
            // `two` serves double duty: rounding addend and per-lane shift count.
            let two = Q6_Vh_vsplat_R(2);
            let sum_lo_rounded = Q6_Vh_vadd_VhVh(sum_lo, two);
            let sum_hi_rounded = Q6_Vh_vadd_VhVh(sum_hi, two);
            let shifted_lo = Q6_Vh_vasr_VhVh(sum_lo_rounded, two);
            let shifted_hi = Q6_Vh_vasr_VhVh(sum_hi_rounded, two);

            // Pack back to u8 with saturation: takes hi and lo halfword vectors,
            // saturates to u8, and interleaves them back to original order
            let result = Q6_Vub_vsat_VhVh(shifted_hi, shifted_lo);

            *outp.add(i) = result;
        }
    }

    /// Horizontal 1-2-1 filter pass using HvxVectorPair widening arithmetic
    ///
    /// Computes: dst[x] = (src[x-1] + 2*src[x] + src[x+1] + 2) >> 2
    ///
    /// Uses `valign` and `vlalign` to shift vectors by 1 byte for neighbor access,
    /// then HvxVectorPair for precise widening arithmetic.
    ///
    /// Edge handling: the left neighbor of pixel 0 and the right neighbor of
    /// pixel `width - 1` are taken from zero vectors, so the two border pixels
    /// of each row are computed with an implicit zero pad. The verification in
    /// `main` only checks interior columns, so this never shows up there.
    ///
    /// # Safety
    ///
    /// - `src` and `dst` must point to valid buffers of `width` bytes
    /// - `width` must be a multiple of VLEN
    /// - All pointers must be HVX-aligned
    #[target_feature(enable = "hvxv62")]
    unsafe fn horizontal_121_pass(src: *const u8, width: usize, dst: *mut u8) {
        let inp = src as *const HvxVector;
        let outp = dst as *mut HvxVector;

        let n_chunks = width / VLEN;
        // Bytes to the left of the first chunk are treated as zero.
        let mut prev = Q6_V_vzero();

        for i in 0..n_chunks {
            let curr = *inp.add(i);
            // Bytes to the right of the last chunk are treated as zero.
            let next = if i + 1 < n_chunks {
                *inp.add(i + 1)
            } else {
                Q6_V_vzero()
            };

            // Left neighbor (x-1): shift curr right by 1 byte, filling from prev
            let left = Q6_V_vlalign_VVR(curr, prev, 1);

            // Right neighbor (x+1): shift curr left by 1 byte, filling from next
            let right = Q6_V_valign_VVR(next, curr, 1);

            // Widen left + right to 16-bit
            let left_plus_right: HvxVectorPair = Q6_Wh_vadd_VubVub(left, right);

            // Widen center * 2
            let center_x2: HvxVectorPair = Q6_Wh_vadd_VubVub(curr, curr);

            // Add: left + 2*center + right
            let sum: HvxVectorPair = Q6_Wh_vadd_WhWh(left_plus_right, center_x2);

            // Extract high and low vectors
            let sum_lo = Q6_V_lo_W(sum);
            let sum_hi = Q6_V_hi_W(sum);

            // Arithmetic right shift by 2 with rounding
            let two = Q6_Vh_vsplat_R(2);
            let sum_lo_rounded = Q6_Vh_vadd_VhVh(sum_lo, two);
            let sum_hi_rounded = Q6_Vh_vadd_VhVh(sum_hi, two);
            let shifted_lo = Q6_Vh_vasr_VhVh(sum_lo_rounded, two);
            let shifted_hi = Q6_Vh_vasr_VhVh(sum_hi_rounded, two);

            // Pack back to u8 with saturation
            let result = Q6_Vub_vsat_VhVh(shifted_hi, shifted_lo);

            *outp.add(i) = result;

            // Current chunk becomes the left-fill source for the next one.
            prev = curr;
        }
    }

    /// Apply Gaussian 3x3 blur to an entire image using separable filtering
    ///
    /// Two-pass approach:
    /// 1. Vertical pass: apply 1-2-1 filter across rows
    /// 2. Horizontal pass: apply 1-2-1 filter across columns
    ///
    /// Combined effect: 3x3 Gaussian kernel [1 2 1; 2 4 2; 1 2 1] / 16
    ///
    /// The first and last rows of `dst` are never written (they lack a
    /// vertical neighbor); callers should treat them as undefined output.
    ///
    /// # Safety
    ///
    /// - `src` and `dst` must point to valid image buffers of `stride * height` bytes
    /// - `tmp` must point to a valid temporary buffer of `width` bytes, HVX-aligned
    /// - `width` must be a multiple of VLEN and >= VLEN
    /// - `stride` must be >= `width`
    /// - All buffers must be HVX-aligned (128-byte for 128B mode)
    #[target_feature(enable = "hvxv62")]
    pub unsafe fn gaussian3x3u8(
        src: *const u8,
        stride: usize,
        width: usize,
        height: usize,
        dst: *mut u8,
        tmp: *mut u8,
    ) {
        let stride_i = stride as isize;

        // Process interior rows (skip first and last which lack vertical neighbors)
        for y in 1..height - 1 {
            let row_src = src.offset(y as isize * stride_i);
            let row_dst = dst.offset(y as isize * stride_i);

            // Pass 1: vertical 1-2-1 into tmp
            vertical_121_pass(row_src, stride_i, width, tmp);

            // Pass 2: horizontal 1-2-1 from tmp into dst
            horizontal_121_pass(tmp, width, row_dst);
        }
    }
}
// ============================================================================
// Reference implementation (works on all targets)
// ============================================================================

/// Reference implementation of Gaussian 3x3 blur
///
/// Kernel:
///   1 2 1
///   2 4 2  / 16
///   1 2 1
///
/// `src` and `dst` are row-major with `stride` bytes per row. Only the
/// interior pixels (`1..width-1` x `1..height-1`) are written; border pixels
/// of `dst` are left untouched, matching the HVX implementation's row
/// handling.
fn gaussian3x3u8_reference(src: &[u8], stride: usize, width: usize, height: usize, dst: &mut [u8]) {
    // Guard degenerate sizes: `width - 1` / `height - 1` would underflow for
    // 0 (panic in debug builds), and images narrower or shorter than 3 have
    // no interior pixels to filter anyway.
    if width < 3 || height < 3 {
        return;
    }
    for y in 1..height - 1 {
        for x in 1..width - 1 {
            // Vertical 1-2-1 weights: column sums for x-1, x, x+1.
            let mut col = [0u32; 3];
            for i in 0..3 {
                col[i] = src[(y - 1) * stride + x - 1 + i] as u32
                    + 2 * src[y * stride + x - 1 + i] as u32
                    + src[(y + 1) * stride + x - 1 + i] as u32;
            }
            // Horizontal 1-2-1 weights and single final normalization with
            // rounding: (col[0] + 2*col[1] + col[2] + 8) / 16. Max sum is
            // 255 * 16 + 8, so the u32 arithmetic cannot overflow and the
            // shifted result always fits in u8.
            dst[y * stride + x] = ((col[0] + 2 * col[1] + col[2] + 8) >> 4) as u8;
        }
    }
}
/// Generate deterministic test pattern
|
|
fn generate_test_pattern(buf: &mut [u8], width: usize, height: usize) {
|
|
for y in 0..height {
|
|
for x in 0..width {
|
|
buf[y * width + x] = ((x + y * 7) % 256) as u8;
|
|
}
|
|
}
|
|
}
|
|
|
|
// ============================================================================
// Main: runs HVX + reference on Hexagon, reference-only on other targets
// ============================================================================

#[cfg(target_arch = "hexagon")]
fn main() {
    // Aligned buffers for HVX: 128-byte alignment satisfies the strictest
    // (128B-mode) HVX alignment requirement.
    #[repr(align(128))]
    struct AlignedBuf<const N: usize>([u8; N]);

    let mut src = AlignedBuf::<{ WIDTH * HEIGHT }>([0u8; WIDTH * HEIGHT]);
    let mut dst_hvx = AlignedBuf::<{ WIDTH * HEIGHT }>([0u8; WIDTH * HEIGHT]);
    // Scratch buffer: one row for the intermediate vertical-pass output.
    let mut tmp = AlignedBuf::<{ WIDTH }>([0u8; WIDTH]);
    let mut dst_ref = vec![0u8; WIDTH * HEIGHT];

    // Generate test pattern
    generate_test_pattern(&mut src.0, WIDTH, HEIGHT);

    // Run HVX implementation
    // SAFETY: all buffers are 128-byte aligned via AlignedBuf, stride == WIDTH,
    // WIDTH (256) is a multiple of the HVX vector length, and `tmp` holds one
    // full row — exactly the contract documented on `hvx::gaussian3x3u8`.
    unsafe {
        hvx::gaussian3x3u8(
            src.0.as_ptr(),
            WIDTH,
            WIDTH,
            HEIGHT,
            dst_hvx.0.as_mut_ptr(),
            tmp.0.as_mut_ptr(),
        );
    }

    // Run reference
    gaussian3x3u8_reference(&src.0, WIDTH, WIDTH, HEIGHT, &mut dst_ref);

    // Verify HVX matches reference over interior pixels only — neither
    // implementation writes the border rows/columns.
    let mut max_diff = 0i32;
    for y in 1..HEIGHT - 1 {
        for x in 1..WIDTH - 1 {
            let idx = y * WIDTH + x;
            let diff = (dst_hvx.0[idx] as i32 - dst_ref[idx] as i32).abs();
            max_diff = max_diff.max(diff);
            // Allow up to 1 LSB difference due to rounding: the HVX path
            // rounds after each of its two passes, the reference rounds once.
            assert!(
                diff <= 1,
                "HVX differs from reference at ({}, {}): hvx={}, ref={}, diff={}",
                x,
                y,
                dst_hvx.0[idx],
                dst_ref[idx],
                diff
            );
        }
    }

    println!(
        "Gaussian 3x3 HVX test passed! Max difference from reference: {}",
        max_diff
    );
}
#[cfg(not(target_arch = "hexagon"))]
|
|
fn main() {
|
|
let mut src = vec![0u8; WIDTH * HEIGHT];
|
|
let mut dst = vec![0u8; WIDTH * HEIGHT];
|
|
|
|
// Generate test pattern
|
|
generate_test_pattern(&mut src, WIDTH, HEIGHT);
|
|
|
|
// Run reference implementation
|
|
gaussian3x3u8_reference(&src, WIDTH, WIDTH, HEIGHT, &mut dst);
|
|
|
|
// Verify output is non-trivial (blurred values differ from input)
|
|
let mut changed = 0;
|
|
for y in 1..HEIGHT - 1 {
|
|
for x in 1..WIDTH - 1 {
|
|
let idx = y * WIDTH + x;
|
|
if src[idx] != dst[idx] {
|
|
changed += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
println!(
|
|
"Gaussian 3x3 reference test passed! {} pixels changed by blur",
|
|
changed
|
|
);
|
|
}
|