提问人:Chris 提问时间:9/2/2023 最后编辑:kmdrekoChris 更新时间:9/6/2023 访问量:327
编译 Rust 代码时是否缺少 AVX512 的目标功能?
Am I missing a target-feature for AVX512 when I compile my Rust code?
问:
我编写了一些使用 AVX2 和 AVX512 指令来加速图像合成的 Rust 函数。我使用的是 AMD 7950x CPU。
当我运行时,我得到:RUSTFLAGS="-C target-cpu=native" cargo bench
test overlay_using_avx2 ... bench: 483,596 ns/iter (+/- 10,006)
test overlay_using_avx512 ... bench: 317,818 ns/iter (+/- 729)
但是,我想在一台机器上构建可执行文件,然后在另一台机器上运行它。因此,我显式启用了我的代码所需的功能,并在运行时检查它们是否存在。但是,当我这样做时,AVX512 基准测试运行速度较慢,我不明白为什么。我正在运行:
RUSTFLAGS="-C target-feature=+avx2,+avx,+sse2,+avx512f,+avx512bw" cargo bench
:
test overlay_using_avx2 ... bench: 490,664 ns/iter (+/- 13,172)
test overlay_using_avx512 ... bench: 1,519,720 ns/iter (+/- 38,608)
我是否需要启用列表中的其他功能?是否可以通过设置查看启用了哪些功能?rustc --print target-features
target-cpu=native
我的基准测试代码在下面,每晚运行一次:
#![feature(stdsimd)]
#![feature(test)]
use std::arch::x86_64::*;
unsafe fn overlay_chunk_avx2(this_chunk: &mut [u8], image_chunk: &[u8], c1: __m256i, c2: __m256i) {
let this_ptr = this_chunk.as_mut_ptr() as *mut __m128i;
let image_ptr = image_chunk.as_ptr() as *const __m128i;
let this_argb = _mm_loadu_si128(this_ptr);
let image_argb = _mm_loadu_si128(image_ptr);
let this_u16 = _mm256_cvtepu8_epi16(this_argb);
let image_u16 = _mm256_cvtepu8_epi16(image_argb);
let image_alpha = _mm256_shuffle_epi8(image_u16, c1);
let image_inv_alpha = _mm256_sub_epi8(c2, image_alpha);
let this_blended = _mm256_mullo_epi16(this_u16, image_inv_alpha);
let image_blended = _mm256_mullo_epi16(image_u16, image_alpha);
let blended = _mm256_add_epi16(this_blended, image_blended);
let divided = _mm256_srli_epi16(blended, 8);
let lo_lane = _mm256_castsi256_si128(divided);
let hi_lane = _mm256_extracti128_si256(divided, 1);
let divided_u8 = _mm_packus_epi16(lo_lane, hi_lane);
_mm_storeu_si128(this_ptr, divided_u8);
}
unsafe fn overlay_chunk_avx512(this_chunk: &mut [u8], image_chunk: &[u8], c1: __m512i, c2: __m512i) {
let this_ptr = this_chunk.as_mut_ptr() as *mut i8;
let image_ptr = image_chunk.as_ptr() as *const i8;
let this_argb = _mm256_loadu_epi8(this_ptr);
let image_argb = _mm256_loadu_epi8(image_ptr);
let this_u16 = _mm512_cvtepu8_epi16(this_argb);
let image_u16 = _mm512_cvtepu8_epi16(image_argb);
let image_alpha = _mm512_shuffle_epi8(image_u16, c1);
let image_inv_alpha = _mm512_sub_epi8(c2, image_alpha);
let this_blended = _mm512_mullo_epi16(this_u16, image_inv_alpha);
let image_blended = _mm512_mullo_epi16(image_u16, image_alpha);
let blended = _mm512_add_epi16(this_blended, image_blended);
let divided = _mm512_srli_epi16(blended, 8);
let divided_u8 = _mm512_cvtepi16_epi8(divided);
_mm256_storeu_epi8(this_ptr, divided_u8);
}
extern crate test;
#[bench]
fn overlay_using_avx2(bencher: &mut test::Bencher) {
let mut frame = vec![0; 1920 * 1080 * 4];
let image = vec![0; 1920 * 1080 * 4];
let constant1 = unsafe { _mm256_set_epi8(-1, 24, -1, 24, -1, 24, -1, -1, -1, 16, -1, 16, -1, 16, -1, -1, -1, 8, -1, 8, -1, 8, -1, -1, -1, 0, -1, 0, -1, 0, -1, -1) };
let constant2 = unsafe { _mm256_set_epi8(0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0) };
bencher.iter(|| {
let frame_chunks = frame.chunks_exact_mut(128 / 8);
let image_chunks = image.chunks_exact(128 / 8);
for (frame_chunk, image_chunk) in frame_chunks.zip(image_chunks) {
unsafe { overlay_chunk_avx2(frame_chunk, image_chunk, constant1, constant2); }
}
});
}
#[bench]
fn overlay_using_avx512(bencher: &mut test::Bencher) {
let mut frame = vec![0; 1920 * 1080 * 4];
let image = vec![0; 1920 * 1080 * 4];
let constant1 = unsafe { _mm512_set_epi8(-1, 56, -1, 56, -1, 56, -1, -1, -1, 48, -1, 48, -1, 48, -1, -1, -1, 40, -1, 40, -1, 40, -1, -1, -1, 32, -1, 32, -1, 32, -1, -1, -1, 24, -1, 24, -1, 24, -1, -1, -1, 16, -1, 16, -1, 16, -1, -1, -1, 8, -1, 8, -1, 8, -1, -1, -1, 0, -1, 0, -1, 0, -1, -1) };
let constant2 = unsafe { _mm512_set_epi8(0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0, 0, -1, 0, -1, 0, -1, 1, 0) };
bencher.iter(|| {
let frame_chunks = frame.chunks_exact_mut(256 / 8);
let image_chunks = image.chunks_exact(256 / 8);
for (frame_chunk, image_chunk) in frame_chunks.zip(image_chunks) {
unsafe { overlay_chunk_avx512(frame_chunk, image_chunk, constant1, constant2); }
}
});
}
答:
4赞
harold
9/2/2023
#1
看起来您需要正确处理和.+avx512vl
_mm256_loadu_epi8
_mm256_storeu_epi8
没有该功能:(https://godbolt.org/z/roo8vf38q)
example::overlay_chunk_avx512:
push rbp
mov rbp, rsp
push r15
push r14
push r12
push rbx
and rsp, -64
sub rsp, 192
mov rbx, r9
mov r14, r8
mov r15, rdx
mov r12, rdi
mov rdi, rsp
mov rsi, r12
call core::core_arch::x86::avx512bw::_mm256_loadu_epi8
vpmovzxbw zmm0, ymmword ptr [rsp]
vmovdqa64 zmmword ptr [rsp + 64], zmm0
mov rdi, rsp
mov rsi, r15
vzeroupper
call core::core_arch::x86::avx512bw::_mm256_loadu_epi8
vpmovzxbw zmm0, ymmword ptr [rsp]
vpshufb zmm1, zmm0, zmmword ptr [r14]
vmovdqa64 zmm2, zmmword ptr [rbx]
vpsubb zmm2, zmm2, zmm1
vpmullw zmm2, zmm2, zmmword ptr [rsp + 64]
vpmullw zmm0, zmm1, zmm0
vpaddw zmm0, zmm2, zmm0
vpsrlw zmm0, zmm0, 8
vpmovwb ymmword ptr [rsp + 32], zmm0
lea rsi, [rsp + 32]
mov rdi, r12
vzeroupper
call core::core_arch::x86::avx512bw::_mm256_storeu_epi8
lea rsp, [rbp - 32]
pop rbx
pop r12
pop r14
pop r15
pop rbp
ret
使用该功能:(https://godbolt.org/z/oTvEaPhPb)
example::overlay_chunk_avx512:
vpmovzxbw zmm0, ymmword ptr [rdi]
vpmovzxbw zmm1, ymmword ptr [rdx]
vpshufb zmm2, zmm1, zmmword ptr [r8]
vmovdqa64 zmm3, zmmword ptr [r9]
vpsubb zmm3, zmm3, zmm2
vpmullw zmm0, zmm3, zmm0
vpmullw zmm1, zmm2, zmm1
vpaddw zmm0, zmm0, zmm1
vpsrlw zmm0, zmm0, 8
vpmovwb ymmword ptr [rdi], zmm0
vzeroupper
ret
评论
1赞
harold
9/2/2023
直接由内存和内存产生的实际上最终并不需要,但无论如何vpmovzxbw
vpmovwb
AVX512VL
1赞
Peter Cordes
9/3/2023
奇怪的是,它允许您使用未启用的扩展中的内部函数。godbolt.org/z/d6h9e7feW 表明,即使未启用 AVX-512VL,使用 AVX2 加载/存储内部函数也能生成良好的代码。(不幸的是,它要求大小写就像在 C 中一样;太糟糕了,Rust 从一开始就没有像更新的 C 内部函数那样使用)。随着 Xeon Phi 的停产,没有 AVX-512VL 的 AVX-512 对于编译器开发人员来说优先级较低,但允许内部函数然后屠杀它们似乎很糟糕。_mm256_loadu_si256(this_ptr as *const __m256i);
*__m256i
void*
1赞
Peter Cordes
9/3/2023
(我从来都不喜欢未蒙面或版本。它只是一个全矢量加载;我们不需要一个新的内在函数。泛型指针 arg 类型很好,但不会以看起来像单元素加载的奇怪名称为代价,并且并非所有编译器都可以移植。如何使用 gcc 或 clang 模拟_mm256_loadu_epi32?._mm256_loadu_epi64,_mm256_storeu_epi64需要 AVX512VL? 报告说,与 rustc 不同,clang 拒绝那些没有 AVX-512VL 的内部函数_mm_loadu_epi8
_mm256
评论