Is there a SIMD intrinsic like scatter, but between registers?

Asked by FireTner on 11/9/2023 · Last edited by Peter Cordes · Updated 11/11/2023 · Viewed 100 times

Q:

So as far as I know, there is _mm_shuffle_epi8 if you want to do

dst[i] = a[b[i]]

But my question is whether there is an intrinsic that does

dst[b[i]] = a[i]

I want it to work on 8 elements of 16 bits (signed or unsigned doesn't matter).

I looked through the Intel Intrinsics Guide but found nothing besides permutes and shuffles, which look close, but I don't think they do dst[b[i]] = a[i]. I tried searching online but wasn't sure what to search for; the only thing I found is that what I need is a "scatter" between SIMD registers rather than a "permute". A scalar sketch of what I mean follows below.
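
In scalar terms, the operation would look like this (a plain C reference loop just to pin down the semantics; scatter_epi16 is only an illustrative name, not a real intrinsic):

#include <stdint.h>

void scatter_epi16(int16_t dst[8], const int16_t a[8], const int16_t b[8])
{
    // the store is indexed by the destination, the reverse of _mm_shuffle_epi8
    for (int i = 0; i < 8; ++i)
        dst[b[i]] = a[i];
}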

SIMD SSE AVX

Comments:

1 · Peter Cordes · 11/9/2023
No, all x86 shuffles use source indexing, not destination indexing. I think that's true of other ISAs as well (like AArch64 AdvSIMD and PowerPC AltiVec, although since you mentioned _mm_shuffle_epi8 I assume you're most interested in x86). Related: converting from source-based indexing to destination-based indexing.

A:

3 · Andrey Semashev · 11/9/2023 · #1

There is no efficient way to do this, as there is no x86 instruction (as of the time of this answer) that permutes vector register values according to destination element indices. However, if you have many elements that need shuffling according to the same mask, you can create a source-based shuffle mask that has the same effect as the destination-based one, and reuse it on the data.

Here is an example:

//! Creates a source-based shuffle mask from a target-based one
__m128i create_shuffle_mask(__m128i mm_target_indices)
{
    __m128i mm_res = _mm_setzero_si128();
    // Set mm_res to all-ones by default. This will result in zeroing
    // the elements that are not specified in the target indices.
    mm_res = _mm_cmpeq_epi32(mm_res, mm_res);

    const __m128i mm_lowest_byte_mask = _mm_setr_epi32(0xFF, 0, 0, 0);
    const __m128i mm_lowest_byte_7 = _mm_setr_epi32(7, 0, 0, 0);
    const __m128i mm_lowest_byte_8 = _mm_setr_epi32(8, 0, 0, 0);
    const __m128i mm_1 = _mm_set1_epi8(1);
    __m128i mm_i = _mm_setzero_si128();
    for (unsigned int i = 0; i < 16; ++i)
    {
        // Target index, [0; 7]
        __m128i mm_lowest_index =
            _mm_and_si128(mm_target_indices, mm_lowest_byte_7);
        // A bit that indicates whether the index is in range [8; 15]
        __m128i mm_use_hi = _mm_and_si128(mm_target_indices, mm_lowest_byte_8);

        // Multiply the index by 8 and shift the byte mask into position
        // in the lower half of the vector
        __m128i mm_byte_mask = _mm_sll_epi64(mm_lowest_byte_mask,
            _mm_slli_epi32(mm_lowest_index, 3));
        if (!_mm_testz_si128(mm_use_hi, mm_use_hi))
        {
            // Shift the byte mask into the upper half of the vector
            mm_byte_mask = _mm_slli_si128(mm_byte_mask, 8);
        }

        // Put the index into the final mask
        mm_res = _mm_blendv_epi8(mm_res, mm_i, mm_byte_mask);

        mm_i = _mm_add_epi8(mm_i, mm_1);
        mm_target_indices = _mm_srli_si128(mm_target_indices, 1);
    }

    return mm_res;
}

__m128i mm_data = _mm_setr_epi8(
    100, 101, 102, 103, 104, 105, 106, 107,
    108, 109, 110, 111, 112, 113, 114, 115);
__m128i mm_target_indices = _mm_setr_epi8(
    0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8);
__m128i mm_shuffle_mask = create_shuffle_mask(mm_target_indices);
__m128i mm_res = _mm_shuffle_epi8(mm_data, mm_shuffle_mask);
// mm_res = { 100, 102, 104, 106, 108, 110, 112, 114,
//            115, 113, 111, 109, 107, 105, 103, 101 }

Live demo on Coliru.

It should be noted that, unlike source-based masks, which write each destination element exactly once, destination-based masks can skip destination elements or write them multiple times, so you have to decide what behavior you want in those cases. The code above creates a shuffle mask that zeroes destination elements not mentioned in the destination-based mask and, when a destination element is written more than once, keeps the last-mentioned source element (i.e. the one at the highest position in the destination-based mask).
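
For example (a hypothetical input, not from the example above: destination index 0 is written twice and destination index 1 is never written):

__m128i mm_dup_indices = _mm_setr_epi8(
    0, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__m128i mm_dup_mask = create_shuffle_mask(mm_dup_indices);
__m128i mm_dup_res = _mm_shuffle_epi8(mm_data, mm_dup_mask);
// mm_dup_res = { 101, 0, 102, 103, 104, 105, 106, 107,
//                108, 109, 110, 111, 112, 113, 114, 115 }
// Destination 0 keeps the later write (source element 1, value 101),
// and destination 1, never mentioned in the indices, is zeroed.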

3 · harold · 11/11/2023 · #2

There is no direct scatter (and I don't expect we'll get one; we have one with a memory destination, but it's not great), but what you can do, perhaps surprisingly, is invert the permutation. Then you can use the inverted permutation to do a regular shuffle based on source indexing. An obvious requirement for this to work is that there are no "collisions" where multiple elements are sent to the same position (but what would that even mean anyway). A scalar sketch of the idea follows.
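
In scalar terms (a sketch, assuming b really is a permutation of 0..15 with no collisions): if inv satisfies inv[b[i]] = i for every i, then dst[j] = a[inv[j]] computes the same thing as dst[b[i]] = a[i], turning the destination-based scatter into a source-based gather.

#include <stdint.h>

void invert_permutation_scalar(uint8_t inv[16], const uint8_t b[16])
{
    // scatter the source index i to position b[i]; the result is the
    // inverse permutation, usable directly as a source-based shuffle mask
    for (int i = 0; i < 16; ++i)
        inv[b[i]] = i;
}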

Here is some AVX2 code, taken from (Not) transposing a 16x16 bit matrix (which I also wrote), slightly modified so that it directly takes and returns a __m128i. Just use _mm_shuffle_epi8(a, invert_permutation_avx2(b)).

__m256i nottranspose16x16(__m256i x)
{
    // exchange off-diagonal quadrants
    x = _mm256_shuffle_epi8(x, _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15));
    x = _mm256_permute4x64_epi64(x, _MM_SHUFFLE(3, 1, 2, 0));
    x = _mm256_shuffle_epi8(x, _mm256_setr_epi8(
        0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15,
        0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15));
    // rotate every row by its y coordinate
    __m256i shifts = _mm256_setr_epi16(
        1 << 0, 1 << 1, 1 << 2, 1 << 3,
        1 << 4, 1 << 5, 1 << 6, 1 << 7,
        1 << 8, 1 << 9, 1 << 10, 1 << 11,
        1 << 12, 1 << 13, 1 << 14, 1 << 15);
    __m256i sll = _mm256_mullo_epi16(x, shifts);
    __m256i srl = _mm256_mulhi_epu16(x, shifts);
    x = _mm256_or_si256(sll, srl);
    // within each quadrant independently, 
    // rotate every column by its x coordinate
    __m256i x0, x1, m;
    // rotate by 4
    m = _mm256_set1_epi8(0x0F);
    x0 = _mm256_and_si256(x, m);
    x1 = _mm256_andnot_si256(m, _mm256_alignr_epi8(x, x, 8));
    x = _mm256_or_si256(x0, x1);
    // rotate by 2
    m = _mm256_set1_epi8(0x33);
    x0 = _mm256_and_si256(x, m);
    x1 = _mm256_andnot_si256(m, _mm256_alignr_epi8(x, x, 4));
    x = _mm256_or_si256(x0, x1);
    // rotate by 1
    m = _mm256_set1_epi8(0x55);
    x0 = _mm256_and_si256(x, m);
    x1 = _mm256_andnot_si256(m, _mm256_alignr_epi8(x, x, 2));
    x = _mm256_or_si256(x0, x1);
    return x;
}

__m128i invert_permutation_avx2(__m128i p)
{
    __m256i v = _mm256_cvtepu8_epi16(p);
    // indexes to masks
    v = _mm256_or_si256(v, _mm256_slli_epi64(v, 8));
    v = _mm256_add_epi8(v, _mm256_set1_epi16(0xF878));
    __m256i m = _mm256_shuffle_epi8(_mm256_setr_epi8(
        1, 2, 4, 8, 16, 32, 64, 128,
        1, 2, 4, 8, 16, 32, 64, 128,
        1, 2, 4, 8, 16, 32, 64, 128,
        1, 2, 4, 8, 16, 32, 64, 128), v);
    // "transpose" the 16x16 bit matrix of one-hot masks (the not-quite-
    // transpose above; the index scrambling it introduces is undone below)
    m = nottranspose16x16(m);
    // masks to indexes
    __m256i deBruijn = _mm256_and_si256(_mm256_mulhi_epu16(m, _mm256_set1_epi16(0x9AF0)), _mm256_set1_epi16(0x000F));
    __m128i i = _mm_packs_epi16(_mm256_castsi256_si128(deBruijn), _mm256_extracti128_si256(deBruijn, 1));
    i = _mm_shuffle_epi8(_mm_setr_epi8(
        0, 1, 2, 5, 3, 9, 6, 11, 15, 4, 8, 10, 14, 7, 13, 12), i);
    // un-mess-up the indexes
    i = _mm_sub_epi8(i, _mm_setr_epi8(0, 7, 6, 5, 4, 3, 2, 1, 8, 15, 14, 13, 12, 11, 10, 9));
    i = _mm_and_si128(i, _mm_set1_epi8(0x0F));
    i = _mm_shuffle_epi8(i, _mm_setr_epi8(0, 7, 6, 5, 4, 3, 2, 1, 8, 15, 14, 13, 12, 11, 10, 9));
    return i;
}
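
For example, reusing the data from the first answer (a usage sketch of my own, assuming the code above; b here is a genuine permutation, so there are no collisions):

__m128i a = _mm_setr_epi8(
    100, 101, 102, 103, 104, 105, 106, 107,
    108, 109, 110, 111, 112, 113, 114, 115);
__m128i b = _mm_setr_epi8(
    0, 15, 1, 14, 2, 13, 3, 12, 4, 11, 5, 10, 6, 9, 7, 8);
// dst[b[i]] = a[i], expressed as a source-based shuffle through the inverse of b
__m128i res = _mm_shuffle_epi8(a, invert_permutation_avx2(b));
// res = { 100, 102, 104, 106, 108, 110, 112, 114,
//         115, 113, 111, 109, 107, 105, 103, 101 }  (matches answer #1)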

This would be easier with GFNI (since GF2P8AFFINEQB is very useful for transposing bit matrices), especially if we also had AVX-512 for some of the other operations that AVX2 struggles with. Based on the tags on this question, I chose not to use those newer instruction set extensions.