SIMD in C++ 26

Single Instruction Multiple Data雖然不是什麼新玩意，不過走到標準化也花了二十幾年，寫一下自己的感想

石器時代 Inline Assembly

當我剛開始工作的時候，那時候還在使用MMX，SSE都還沒流行，更遑論之後的AVX了
典型的Assembly Code長這樣

add_AVX:
  // size <= 0 --> return
  testq %rdi, %rdi
  jle end_loop

  // i = 0
  movl $0, %eax

start_loop:
  // __m256i b_part = _mm256_loadu_si256((__m256i*) &b[i]);
    // compiles into two instructions, each of which loads 128 bits
  vmovdqu (%rdx,%rax,2), %xmm0
  vinserti128 $0x1, 16(%rdx,%rax,2), %ymm0, %ymm0

  // __m256i a_part = _mm_loadu_si128((__m128i*) &b[i]);
  vmovdqu (%rsx,%rax,2), %xmm1
  vinserti128 $0x1, 16(%rsx,%rax,2), %ymm1, %ymm1

  // a_part = _mm256_add_epi16(a_part, b_part);
  vpaddw %ymm1, %ymm0

  // _mm256_storeu_si256((__m256i*) &a[i], a_part)
  vmovups %ymm0, (%rsi,%rax,2)
  vextracti128 $0x1, %ymm0, 16(%rsi,%rax,2)

  // i += 16
  addq $16, %rax
  
  // i < size --> return
  cmpq %rax, %rdi
  jg start_loop
end:
  ret

雖然可以運作，不過問題也是不少

MSVC和GCC的Inline Assembly的寫法不同，更何況MSVC在64bit之後就不支持Inline Assembly了
要為每個Artitecture維護一份自己的Assembly Code，MMX一份，SSE1/2/3/4，AVX系列都要維護，也就是Portable issue

最大的問題，能夠寫Assembly Code的人，大概比日本壓縮機還要少
於是就從石器時代進入到青銅時代

青銅時代 Intrinsic function

Intrinsic function是一種特殊的函數，由編譯器維護，由於編譯器能夠對Intrinsic function做更進一步的最佳化，通常用於向量化和平行化
以下是一個範例

/* vectorized version */
void add_AVX(long size, unsigned short * a, const unsigned short *b) {
    for (long i = 0; i < size; i += 16) {
        /* load 256 bits from a */
        /* a_part = {a[i], a[i+1], a[i+2], ..., a[i+15]} */
        __m256i a_part = _mm256_loadu_si256((__m256i*) &a[i]);
        /* load 256 bits from b */
        /* b_part = {b[i], b[i+1], b[i+2], ..., b[i+15]} */
        __m256i b_part = _mm256_loadu_si256((__m256i*) &b[i]);
        /* a_part = {a[i] + b[i], a[i+1] + b[i+1], ...,
                     a[i+7] + b[i+15]}
         */
        a_part = _mm256_add_epi16(a_part, b_part);
        _mm256_storeu_si256((__m256i*) &a[i], a_part);
    }
}

看起來像是正常的C Code了，解決了上面1和3的問題，不過問提2還是存在
看看知名的llama.cpp當中的一段

void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
    assert(k % QK8_1 == 0);
    const int nb = k / QK8_1;

    block_q8_1 * GGML_RESTRICT y = vy;

#if defined(__ARM_NEON)
    // Ignore
#elif defined __wasm_simd128__
    // Ignore
#elif defined(__AVX2__) || defined(__AVX__)
    // Ignore
#elif defined(__riscv_v_intrinsic)
    // Ignore
#elif defined(__POWER9_VECTOR__)
    // Ignore
#elif defined(__loongarch_asx)
    // Ignore
#elif defined(__VXE__) || defined(__VXE2__)
    // Ignore
#else
    // fallback
#endif
}

可以看到，不同的架構就有不同的Intrinsic function sets，不能重複使用，因此就要維護好幾份程式碼

鐵器時代

之後就有一派人馬，封裝了各架構不同的Intrinsic function，包裝成演算法的方式提供
例如

highway
EVE
xsimd

std-simd
雖然細節不盡相同，不過程式碼大概像這樣

using Vec3D = std::array<float, 3>;
float scalar_product(Vec3D a, Vec3D b) {
  return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
}

更接近一般的C++ Code了，而其中的std-simd就是C++26 SIMD的前身

Simd in Rust

看看Rust的方式，雖然能看得懂，不過談不上喜歡

fn reduce(x: &[i32]) -> i32 {
    assert!(x.len() % 4 == 0);
    let mut sum = i32x4::splat(0); // [0, 0, 0, 0]
    for i in (0..x.len()).step_by(4) {
        sum += i32x4::from_slice_unaligned(&x[i..]);
    }
    sum.wrapping_sum()
}

let x = [0, 1, 2, 3, 4, 5, 6, 7];
assert_eq!(reduce(&x), 28);

A possible SIMD reduce implemenation in C++26

基於目前的SIMD TS，之後SIMD的程式碼可能長這要，與C++現有的工具可以很好的配合

#include <algorithm>
#include <array>
#include <print>
#include <ranges>
#include <experimental/simd>
#include <print>

namespace stdx = std::experimental;
namespace stdv = std::views;
namespace stdr = std::ranges;

template <typename T, auto N>
constexpr auto reduce(const std::array<T, N> &arr) {
        using simd_t = stdx::native_simd<T>;

        constexpr auto step = simd_t::size();
        constexpr auto tile = N / step;
        constexpr auto left = N % step;

        T sum {};

        for (const auto &batch : arr | stdv::stride(step) | stdv::take(tile)) {
                simd_t temp(std::addressof(batch), stdx::element_aligned);
                sum += stdx::reduce(temp, std::plus{});
        }

        if constexpr (left) {
                auto left_view = arr | stdv::drop(tile * step);
                std::ranges::for_each(left_view, [&](auto v) { sum += v; });
        }

        return sum;
}


int main() {
        std::array<int, 7> arr {1, 2, 3, 4, 5, 6, 7};
        std::println("{}", reduce(arr));
}

不過眾人質疑的一點，是否之後性能能否達到Intrinsic function的水平，不過至少是解決了Portable這個痛點了

Reference

c-c++ assembly inline x86-64 128-bit SIMD - brief summary.md
SIMD for C++ Developers
Intrinsic function