intrinsicを用いたSIMD化
Link: intrinsicを用いたSIMD化
めっちゃわかりやすく書いてあるから、これを参考に簡単な文字列の比較コードを書いてみた。
けどstrcmpに勝ててないという。笑
/* Requests 16-byte alignment for the declared object (GCC/Clang attribute). */
#define ALIGNED __attribute__ ((aligned (16)))

/* SIMD constants populated by simd_init(). simd_strcmp() does not read them. */
static __m128i rmask, rzero;

/*
 * Initialize the file-scope SIMD constants.
 *
 * rmask: byte-shuffle control that reverses the byte order inside each
 *        64-bit half of a 128-bit register.
 * rzero: all-zero 128-bit register.
 */
void simd_init( void )
{
    rmask = _mm_setr_epi8(
         7,  6,  5,  4,  3,  2, 1, 0,
        15, 14, 13, 12, 11, 10, 9, 8
    );
    rzero = _mm_setr_epi32( 0, 0, 0, 0 );
}

/*
 * Compare the first `len` bytes of `sa` and `sb`.
 *
 * Bytes are compared as unsigned values in lexicographic order. Despite the
 * name, the comparison does NOT stop at a '\0' byte; exactly `len` bytes are
 * examined (memcmp-like semantics).
 *
 * sa, sb: buffers of at least `len` readable bytes; no alignment required.
 * len:    number of bytes to compare; values <= 0 compare as equal.
 *
 * Returns -1 if sa < sb, 1 if sa > sb, 0 if the ranges are equal.
 *
 * Uses SSE2 only and does not depend on simd_init().
 */
int simd_strcmp( const char *sa, const char *sb, int len )
{
    int i = 0;

    /*
     * Skip over whole 16-byte chunks that are identical. `len - i >= 16` is
     * used instead of `i + 16 <= len` so the bound check cannot overflow.
     */
    for ( ; len - i >= 16; i += 16 ) {
        /* Unaligned loads: callers pass arbitrary char pointers. */
        __m128i ra = _mm_loadu_si128( (const __m128i *)(const void *)( sa + i ) );
        __m128i rb = _mm_loadu_si128( (const __m128i *)(const void *)( sb + i ) );

        /* One mask bit per byte lane; 0xFFFF means all 16 bytes are equal. */
        unsigned int equal_bits =
            (unsigned int)_mm_movemask_epi8( _mm_cmpeq_epi8( ra, rb ) );

        if ( equal_bits != 0xFFFFu ) {
            break;
        }
    }

    /*
     * Scalar pass: pinpoints the first differing byte of the chunk that
     * stopped the loop above, or handles the final (< 16 byte) tail without
     * reading past `len`.
     */
    for ( ; i < len; i++ ) {
        unsigned char ca = (unsigned char)sa[i];
        unsigned char cb = (unsigned char)sb[i];

        if ( ca != cb ) {
            return ( ca < cb ) ? -1 : 1;
        }
    }

    return 0;
}