SIMD 常用示例

2023-01-12 language

查找字符

用于统计字符流中指定字符出现的次数,可以参考 Searching Gigabytes of Data per second with simd

/**
 * Count the bytes in data[0, len) that are equal to ch (SSE4.2 version).
 *
 * The input is consumed in 16-byte chunks using unaligned loads; whatever
 * is left over (fewer than 16 bytes) is handled by a scalar loop.
 *
 * @param data  start of the byte range to scan
 * @param len   number of bytes to scan
 * @param ch    byte value to look for
 * @return      number of positions where data[i] == ch
 */
__attribute__((target("sse4.2")))
static inline size_t char_count(const char *data, size_t len, const char ch) {
    constexpr size_t LANE_BYTES = sizeof(__m128i);
    // Largest multiple of the vector width that fits in len.
    const size_t vector_bytes = len - (len % LANE_BYTES);
    // Every lane holds the byte we are searching for.
    const __m128i needle = _mm_set1_epi8(ch);

    size_t matches = 0;
    size_t pos = 0;

    while (pos < vector_bytes) {
        const __m128i chunk = _mm_loadu_si128((const __m128i *) (data + pos));
        // One mask bit per byte lane; a set bit marks a lane equal to ch.
        const int hit_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, needle));
        matches += __builtin_popcount(hit_mask);
        pos += LANE_BYTES;
    }

    // Scalar pass over the remaining tail bytes.
    while (pos < len) {
        if (data[pos] == ch) {
            ++matches;
        }
        ++pos;
    }
    return matches;
}

/**
 * Count the bytes in data[0, len) that are equal to ch (portable fallback).
 *
 * @param data  start of the byte range to scan
 * @param len   number of bytes to scan
 * @param ch    byte value to look for
 * @return      number of positions where data[i] == ch
 */
__attribute__((target("default")))
static inline size_t char_count(const char *data, size_t len, const char ch) {
    const char *const stop = data + len;
    size_t matches = 0;
    for (const char *cursor = data; cursor != stop; ++cursor) {
        matches += (*cursor == ch) ? 1 : 0;
    }
    return matches;
}

UTF8

SIMD ASCII Check:检查 UTF8 字节流是否全部为 ASCII 字符,生产环境可以参考 SIMDJSON 库的实现。

对于字符流来说,只要某个字节的值超过 127(即最高位为 1),那么它就肯定不是 ASCII 字符。

/**
 * Report whether every byte of src[0, len) is 7-bit ASCII (AVX-512F version).
 *
 * All bytes are OR-ed together; the input is pure ASCII exactly when the
 * high bit (0x80) is clear in every byte of the accumulated value.
 *
 * @param src  start of the byte range to check
 * @param len  number of bytes to check; 0 yields true
 * @return     true if no byte has its high bit set, false otherwise
 */
__attribute__((target("avx512f")))
bool utf8_is_ascii(const char *src, size_t len) {
    size_t i = 0;
    __m512i has_error = _mm512_setzero_si512();
    // Written as `len - i >= 64` rather than `i <= len - 64`: the latter
    // underflows (size_t is unsigned) when len < 64 and reads out of bounds.
    for (; len - i >= 64; i += 64) {
        __m512i current_bytes = _mm512_loadu_si512((const __m512i *) (src + i));
        has_error = _mm512_or_si512(has_error, current_bytes);
    }
    // Fold the sixteen 32-bit lanes into one 32-bit word (four packed bytes).
    int reduced = _mm512_reduce_or_epi32(has_error);

    // Scalar tail; go through unsigned char so the result does not depend
    // on whether plain char is signed.
    for (; i < len; i++) {
        reduced |= (unsigned char) src[i];
    }

    // 0x80808080 selects the high bit of each of the four packed bytes.
    return !(reduced & 0x80808080);
}

/**
 * Report whether every byte of src[0, len) is 7-bit ASCII (AVX2 version).
 *
 * All bytes are OR-ed together; the input is pure ASCII exactly when no
 * byte of the accumulated value has its high bit (0x80) set.
 *
 * @param src  start of the byte range to check
 * @param len  number of bytes to check; 0 yields true
 * @return     true if no byte has its high bit set, false otherwise
 */
__attribute__((target("avx2")))
bool utf8_is_ascii(const char *src, size_t len) {
    size_t i = 0;
    __m256i has_error = _mm256_setzero_si256();
    // Written as `len - i >= 32` rather than `i <= len - 32`: the latter
    // underflows (size_t is unsigned) when len < 32 and reads out of bounds.
    for (; len - i >= 32; i += 32) {
        __m256i current_bytes = _mm256_loadu_si256((const __m256i *) (src + i));
        has_error = _mm256_or_si256(has_error, current_bytes);
    }
    // movemask gathers the high bit of each of the 32 accumulated bytes.
    int error_mask = _mm256_movemask_epi8(has_error);

    // Scalar tail; accumulate as unsigned char so the result does not
    // depend on whether plain char is signed.
    unsigned char tail_has_error = 0;
    for (; i < len; i++) {
        tail_has_error |= (unsigned char) src[i];
    }
    error_mask |= (tail_has_error & 0x80);

    return !error_mask;
}

/**
 * Report whether every byte of src[0, len) is 7-bit ASCII (SSE2 version).
 *
 * All bytes are OR-ed together; the input is pure ASCII exactly when no
 * byte of the accumulated value has its high bit (0x80) set.
 *
 * @param src  start of the byte range to check
 * @param len  number of bytes to check; 0 yields true
 * @return     true if no byte has its high bit set, false otherwise
 */
__attribute__((target("sse2")))
bool utf8_is_ascii(const char *src, size_t len) {
    size_t i = 0;
    __m128i has_error = _mm_setzero_si128();
    // Written as `len - i >= 16` rather than `i <= len - 16`: the latter
    // underflows (size_t is unsigned) when len < 16 and reads out of bounds.
    for (; len - i >= 16; i += 16) {
        __m128i current_bytes = _mm_loadu_si128((const __m128i *) (src + i));
        has_error = _mm_or_si128(has_error, current_bytes);
    }
    // movemask gathers the high bit of each of the 16 accumulated bytes.
    int error_mask = _mm_movemask_epi8(has_error);

    // Scalar tail; accumulate as unsigned char so the result does not
    // depend on whether plain char is signed.
    unsigned char tail_has_error = 0;
    for (; i < len; i++) {
        tail_has_error |= (unsigned char) src[i];
    }
    error_mask |= (tail_has_error & 0x80);

    return !error_mask;
}

/**
 * Report whether every byte of src[0, len) is 7-bit ASCII (portable fallback).
 *
 * @param src  start of the byte range to check
 * @param len  number of bytes to check; 0 yields true
 * @return     true if no byte has its high bit set, false otherwise
 */
__attribute__((target("default")))
bool utf8_is_ascii(const char *src, size_t len) {
    // OR of every byte seen so far; only its 0x80 bit matters at the end.
    char seen_bits = 0;
    const char *const stop = src + len;
    while (src != stop) {
        seen_bits |= *src++;
    }
    return (seen_bits & 0x80) == 0;
}