//! Hexagon HVX Gaussian 3x3 blur example
//!
//! This example demonstrates the use of Hexagon HVX intrinsics to implement
//! a 3x3 Gaussian blur filter on unsigned 8-bit images.
//!
//! The 3x3 Gaussian kernel is:
//!     1 2 1
//!     2 4 2  / 16
//!     1 2 1
//!
//! This is a separable filter: `[1 2 1]^T * [1 2 1] / 16`.
//!
//! On Hexagon targets, this implementation uses `HvxVectorPair` for widening
//! arithmetic to achieve full precision in the Gaussian computation, avoiding
//! the approximation errors of byte-averaging approaches. On other targets,
//! it runs a reference implementation in pure Rust.
//!
//! # Building and Running (Hexagon)
//!
//! To build (requires Hexagon toolchain):
//!
//!     RUSTFLAGS="-C target-feature=+hvxv62,+hvx-length128b \
//!         -C linker=hexagon-unknown-linux-musl-clang" \
//!         cargo +nightly build -p stdarch_examples --bin gaussian \
//!         --target hexagon-unknown-linux-musl \
//!         -Zbuild-std -Zbuild-std-features=llvm-libunwind
//!
//! To run under QEMU:
//!
//!     qemu-hexagon -L /target/hexagon-unknown-linux-musl \
//!         target/hexagon-unknown-linux-musl/debug/gaussian
//!
//! # Building and Running (Other targets)
//!
//!     cargo +nightly run -p stdarch_examples --bin gaussian

#![cfg_attr(target_arch = "hexagon", feature(stdarch_hexagon))]
#![cfg_attr(target_arch = "hexagon", feature(hexagon_target_feature))]
// This is example code: the intrinsic-heavy `unsafe fn` bodies are kept free of
// nested `unsafe` blocks, and the pedantic lints below are noise for a demo.
#![allow(
    unsafe_op_in_unsafe_fn,
    clippy::unwrap_used,
    clippy::print_stdout,
    clippy::missing_docs_in_private_items,
    clippy::cast_possible_wrap,
    clippy::cast_ptr_alignment
)]

/// Image width - must be multiple of HVX vector length on Hexagon
const WIDTH: usize = 256;
/// Image height in rows
const HEIGHT: usize = 16;

// ============================================================================
// Hexagon HVX implementation
// ============================================================================

#[cfg(target_arch = "hexagon")]
mod hvx {
    #[cfg(not(target_feature = "hvx-length128b"))]
    use core_arch::arch::hexagon::v64::*;
    #[cfg(target_feature = "hvx-length128b")]
    use core_arch::arch::hexagon::v128::*;

    /// Vector length in bytes for HVX 128-byte mode
    #[cfg(target_feature = "hvx-length128b")]
    const VLEN: usize = 128;
    /// Vector length in bytes for HVX 64-byte mode
    #[cfg(not(target_feature = "hvx-length128b"))]
    const VLEN: usize = 64;

    /// Per-byte `(first + 2*center + last + 2) >> 2` over one HVX vector
    ///
    /// The three u8 inputs are widened to 16-bit lanes in an `HvxVectorPair`
    /// so the sum (at most 4 * 255) cannot overflow, then rounded, shifted and
    /// packed back to u8.
    ///
    /// `two` must be a halfword splat of the value 2; it is used both as the
    /// rounding bias and as the per-lane shift amount. It is a parameter so
    /// callers can create it once outside their loops.
    ///
    /// # Safety
    ///
    /// The caller must ensure the `hvxv62` target feature is available.
    #[inline]
    #[target_feature(enable = "hvxv62")]
    unsafe fn filter_121(
        first: HvxVector,
        center: HvxVector,
        last: HvxVector,
        two: HvxVector,
    ) -> HvxVector {
        // Widening u8 + u8 -> u16 add of the two outer taps.
        let outer: HvxVectorPair = Q6_Wh_vadd_VubVub(first, last);
        // 2 * center, widened the same way (center added to itself).
        let center_x2: HvxVectorPair = Q6_Wh_vadd_VubVub(center, center);
        // first + 2*center + last in 16-bit lanes.
        let sum: HvxVectorPair = Q6_Wh_vadd_WhWh(outer, center_x2);

        // Split the pair. Per the HVX widening-add layout the low vector holds
        // the sums for even-indexed input bytes and the high vector those for
        // odd-indexed bytes (not the lower/upper half of the row).
        let sum_even = Q6_V_lo_W(sum);
        let sum_odd = Q6_V_hi_W(sum);

        // Round to nearest: (sum + 2) >> 2.
        let shifted_even = Q6_Vh_vasr_VhVh(Q6_Vh_vadd_VhVh(sum_even, two), two);
        let shifted_odd = Q6_Vh_vasr_VhVh(Q6_Vh_vadd_VhVh(sum_odd, two), two);

        // Saturating pack back to u8; this re-interleaves the even/odd
        // halfword vectors into the original byte order.
        Q6_Vub_vsat_VhVh(shifted_odd, shifted_even)
    }

    /// Vertical 1-2-1 filter pass using HvxVectorPair widening arithmetic
    ///
    /// Computes: dst[x] = (row_above[x] + 2*center[x] + row_below[x] + 2) >> 2
    ///
    /// Uses HvxVectorPair to widen u8 to u16 for precise arithmetic, avoiding
    /// the rounding errors of byte-averaging approximations.
    ///
    /// # Safety
    ///
    /// - `src` must point to the center row with valid data at -stride and +stride
    /// - `dst` must point to a valid output buffer for `width` bytes
    /// - `width` must be a multiple of VLEN
    /// - All pointers must be HVX-aligned (128-byte for 128B mode)
    #[target_feature(enable = "hvxv62")]
    unsafe fn vertical_121_pass(src: *const u8, stride: isize, width: usize, dst: *mut u8) {
        let row_above = src.offset(-stride) as *const HvxVector;
        let row_center = src as *const HvxVector;
        let row_below = src.offset(stride) as *const HvxVector;
        let outp = dst as *mut HvxVector;

        // Loop-invariant rounding bias / shift amount.
        let two = Q6_Vh_vsplat_R(2);

        for i in 0..width / VLEN {
            let above = *row_above.add(i);
            let center = *row_center.add(i);
            let below = *row_below.add(i);
            *outp.add(i) = filter_121(above, center, below, two);
        }
    }

    /// Horizontal 1-2-1 filter pass using HvxVectorPair widening arithmetic
    ///
    /// Computes: dst[x] = (src[x-1] + 2*src[x] + src[x+1] + 2) >> 2
    ///
    /// Uses `valign` and `vlalign` to shift vectors by 1 byte for neighbor access,
    /// then HvxVectorPair for precise widening arithmetic. Pixels outside the
    /// row are treated as zero, so the first and last output bytes of the row
    /// are computed against a zero neighbor.
    ///
    /// # Safety
    ///
    /// - `src` and `dst` must point to valid buffers of `width` bytes
    /// - `width` must be a multiple of VLEN
    /// - All pointers must be HVX-aligned
    #[target_feature(enable = "hvxv62")]
    unsafe fn horizontal_121_pass(src: *const u8, width: usize, dst: *mut u8) {
        let inp = src as *const HvxVector;
        let outp = dst as *mut HvxVector;
        let n_chunks = width / VLEN;

        // Loop-invariant rounding bias / shift amount.
        let two = Q6_Vh_vsplat_R(2);

        // Zero vector stands in for the (non-existent) chunk left of the row.
        let mut prev = Q6_V_vzero();

        for i in 0..n_chunks {
            let curr = *inp.add(i);
            // Zero vector stands in for the chunk right of the last one.
            let next = if i + 1 < n_chunks {
                *inp.add(i + 1)
            } else {
                Q6_V_vzero()
            };

            // Left neighbor (x-1): shift curr right by 1 byte, filling from prev
            let left = Q6_V_vlalign_VVR(curr, prev, 1);
            // Right neighbor (x+1): shift curr left by 1 byte, filling from next
            let right = Q6_V_valign_VVR(next, curr, 1);

            *outp.add(i) = filter_121(left, curr, right, two);

            prev = curr;
        }
    }

    /// Apply Gaussian 3x3 blur to an entire image using separable filtering
    ///
    /// Two-pass approach:
    /// 1. Vertical pass: apply 1-2-1 filter across rows
    /// 2. Horizontal pass: apply 1-2-1 filter across columns
    ///
    /// Combined effect: 3x3 Gaussian kernel [1 2 1; 2 4 2; 1 2 1] / 16
    ///
    /// Only interior rows `1..height-1` of `dst` are written; the first and
    /// last rows are left untouched. Images with fewer than three rows are a
    /// no-op.
    ///
    /// # Safety
    ///
    /// - `src` and `dst` must point to valid image buffers of `stride * height` bytes
    /// - `tmp` must point to a valid temporary buffer of `width` bytes, HVX-aligned
    /// - `width` must be a multiple of VLEN and >= VLEN
    /// - `stride` must be >= `width`
    /// - All buffers must be HVX-aligned (128-byte for 128B mode)
    #[target_feature(enable = "hvxv62")]
    pub unsafe fn gaussian3x3u8(
        src: *const u8,
        stride: usize,
        width: usize,
        height: usize,
        dst: *mut u8,
        tmp: *mut u8,
    ) {
        debug_assert!(
            width >= VLEN && width % VLEN == 0,
            "width must be a non-zero multiple of the HVX vector length"
        );
        debug_assert!(stride >= width, "stride must be at least width");

        let stride_i = stride as isize;

        // Process interior rows (skip first and last which lack vertical
        // neighbors). `saturating_sub` keeps height == 0 from wrapping around
        // to a huge row count and walking off the buffers.
        for y in 1..height.saturating_sub(1) {
            let row_src = src.add(y * stride);
            let row_dst = dst.add(y * stride);

            // Pass 1: vertical 1-2-1 into tmp
            vertical_121_pass(row_src, stride_i, width, tmp);

            // Pass 2: horizontal 1-2-1 from tmp into dst
            horizontal_121_pass(tmp, width, row_dst);
        }
    }
}

// ============================================================================
// Reference implementation (works on all targets)
// ============================================================================

/// Reference implementation of Gaussian 3x3 blur
///
/// Kernel:
///     1 2 1
///     2 4 2  / 16
///     1 2 1
///
/// Writes `dst[y * stride + x]` for interior pixels only (`1 <= x < width - 1`,
/// `1 <= y < height - 1`); border pixels of `dst` are left untouched. The
/// result is rounded to nearest in a single step: `(sum + 8) >> 4`.
///
/// Images narrower or shorter than three pixels have no interior and are a
/// no-op.
///
/// # Panics
///
/// Panics if `src` or `dst` is too short for the given `stride`/`height`.
fn gaussian3x3u8_reference(src: &[u8], stride: usize, width: usize, height: usize, dst: &mut [u8]) {
    // `saturating_sub` avoids usize underflow when a dimension is zero.
    for y in 1..height.saturating_sub(1) {
        let above = (y - 1) * stride;
        let center = y * stride;
        let below = (y + 1) * stride;

        // Vertical 1-2-1 weighted sum of the column at `cx`.
        let column = |cx: usize| -> u32 {
            u32::from(src[above + cx]) + 2 * u32::from(src[center + cx]) + u32::from(src[below + cx])
        };

        for x in 1..width.saturating_sub(1) {
            // Horizontal 1-2-1 weights over the three column sums.
            let sum = column(x - 1) + 2 * column(x) + column(x + 1);
            // sum <= 16 * 255, so the rounded quotient always fits in a u8.
            dst[center + x] = ((sum + 8) >> 4) as u8;
        }
    }
}

/// Generate deterministic test pattern
///
/// Fills the first `width * height` bytes of `buf` (row-major, stride equal to
/// `width`) with `(x + 7 * y) mod 256`.
fn generate_test_pattern(buf: &mut [u8], width: usize, height: usize) {
    for y in 0..height {
        let row = y * width;
        for x in 0..width {
            buf[row + x] = ((x + y * 7) % 256) as u8;
        }
    }
}

// ============================================================================
// Main: runs HVX + reference on Hexagon, reference-only on other targets
// ============================================================================

#[cfg(target_arch = "hexagon")]
fn main() {
    // Aligned buffers for HVX
    #[repr(align(128))]
    struct AlignedBuf<const N: usize>([u8; N]);

    let mut src = AlignedBuf::<{ WIDTH * HEIGHT }>([0u8; WIDTH * HEIGHT]);
    let mut dst_hvx = AlignedBuf::<{ WIDTH * HEIGHT }>([0u8; WIDTH * HEIGHT]);
    let mut tmp = AlignedBuf::<{ WIDTH }>([0u8; WIDTH]);
    let mut dst_ref = vec![0u8; WIDTH * HEIGHT];

    // Generate test pattern
    generate_test_pattern(&mut src.0, WIDTH, HEIGHT);

    // Run HVX implementation
    // SAFETY: all three buffers are 128-byte aligned, `src`/`dst_hvx` hold
    // WIDTH * HEIGHT bytes, `tmp` holds WIDTH bytes, and stride == width == WIDTH.
    unsafe {
        hvx::gaussian3x3u8(
            src.0.as_ptr(),
            WIDTH,
            WIDTH,
            HEIGHT,
            dst_hvx.0.as_mut_ptr(),
            tmp.0.as_mut_ptr(),
        );
    }

    // Run reference
    gaussian3x3u8_reference(&src.0, WIDTH, WIDTH, HEIGHT, &mut dst_ref);

    // Verify HVX matches reference (allowing small rounding differences).
    // Only interior pixels are compared: the border is not produced by the
    // reference, and the HVX edge columns use zero padding.
    let mut max_diff = 0i32;
    for y in 1..HEIGHT - 1 {
        for x in 1..WIDTH - 1 {
            let idx = y * WIDTH + x;
            let diff = (dst_hvx.0[idx] as i32 - dst_ref[idx] as i32).abs();
            max_diff = max_diff.max(diff);

            // Allow up to 1 LSB difference due to rounding: the HVX path rounds
            // after each 1-2-1 pass, the reference rounds once.
            assert!(
                diff <= 1,
                "HVX differs from reference at ({}, {}): hvx={}, ref={}, diff={}",
                x,
                y,
                dst_hvx.0[idx],
                dst_ref[idx],
                diff
            );
        }
    }

    println!(
        "Gaussian 3x3 HVX test passed! Max difference from reference: {}",
        max_diff
    );
}

#[cfg(not(target_arch = "hexagon"))]
fn main() {
    let mut src = vec![0u8; WIDTH * HEIGHT];
    let mut dst = vec![0u8; WIDTH * HEIGHT];

    // Generate test pattern
    generate_test_pattern(&mut src, WIDTH, HEIGHT);

    // Run reference implementation
    gaussian3x3u8_reference(&src, WIDTH, WIDTH, HEIGHT, &mut dst);

    // Verify output is non-trivial (blurred values differ from input)
    let changed = (1..HEIGHT - 1)
        .flat_map(|y| (1..WIDTH - 1).map(move |x| y * WIDTH + x))
        .filter(|&idx| src[idx] != dst[idx])
        .count();

    println!(
        "Gaussian 3x3 reference test passed! {} pixels changed by blur",
        changed
    );
}