c++ - AVX 256-bit code performing slightly worse than equivalent 128-bit SSSE3 code -
i trying write efficient hamming-distance code. inspired wojciech muła's extremely clever sse3 popcount implementation, coded avx2 equivalent solution, time using 256 bit registers. l expecting @ least 30%-40% improvement based on doubled parallelism of involved operations, surprise, avx2 code tad slower (around 2%)!
can enlighten me of possible reasons why i'm not obtaining expected performance boost?
unrolled, sse3 hamming distance of 2 64-byte blocks:
int32 sse_popcount(const uint32* __restrict pa, const uint32* __restrict pb) { __m128i paccum = _mm_setzero_si128(); __m128i = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pa)); __m128i b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pb)); __m128i err = _mm_xor_si128 (a, b); __m128i lo = _mm_and_si128 (err, low_mask); __m128i hi = _mm_srli_epi16 (err, 4); hi = _mm_and_si128 (hi, low_mask); __m128i popcnt1 = _mm_shuffle_epi8(lookup, lo); __m128i popcnt2 = _mm_shuffle_epi8(lookup, hi); paccum = _mm_add_epi8(paccum, popcnt1); paccum = _mm_add_epi8(paccum, popcnt2); = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pa + 4)); b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pb + 4)); err = _mm_xor_si128 (a, b); lo = _mm_and_si128 (err, low_mask); hi = _mm_srli_epi16 (err, 4); hi = _mm_and_si128 (hi, low_mask); popcnt1 = _mm_shuffle_epi8(lookup, lo); popcnt2 = _mm_shuffle_epi8(lookup, hi); paccum = _mm_add_epi8(paccum, popcnt1); paccum = _mm_add_epi8(paccum, popcnt2); = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pa + 8)); b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pb + 8)); err = _mm_xor_si128 (a, b); lo = _mm_and_si128 (err, low_mask); hi = _mm_srli_epi16 (err, 4); hi = _mm_and_si128 (hi, low_mask); popcnt1 = _mm_shuffle_epi8(lookup, lo); popcnt2 = _mm_shuffle_epi8(lookup, hi); paccum = _mm_add_epi8(paccum, popcnt1); paccum = _mm_add_epi8(paccum, popcnt2); = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pa + 12)); b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pb + 12)); err = _mm_xor_si128 (a, b); lo = _mm_and_si128 (err, low_mask); hi = _mm_srli_epi16 (err, 4); hi = _mm_and_si128 (hi, low_mask); popcnt1 = _mm_shuffle_epi8(lookup, lo); popcnt2 = _mm_shuffle_epi8(lookup, hi); paccum = _mm_add_epi8(paccum, popcnt1); paccum = _mm_add_epi8(paccum, popcnt2); paccum = _mm_sad_epu8(paccum, _mm_setzero_si128()); uint64 result = paccum.m128i_u64[0] + paccum.m128i_u64[1]; return (int32)result; }
ununrolled, equivalent version using avx's 256-bit registers:
int32 avx_popcount(const uint32* __restrict pa, const uint32* __restrict pb) { __m256i paccum = _mm256_setzero_si256(); __m256i = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pa)); __m256i b = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pb)); __m256i err = _mm256_xor_si256 (a, b); __m256i lo = _mm256_and_si256 (err, low_mask256); __m256i hi = _mm256_srli_epi16 (err, 4); hi = _mm256_and_si256 (hi, low_mask256); __m256i popcnt1 = _mm256_shuffle_epi8(lookup256, lo); __m256i popcnt2 = _mm256_shuffle_epi8(lookup256, hi); paccum = _mm256_add_epi8(paccum, popcnt1); paccum = _mm256_add_epi8(paccum, popcnt2); = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pa + 8)); b = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pb + 8)); err = _mm256_xor_si256 (a, b); lo = _mm256_and_si256 (err, low_mask256); hi = _mm256_srli_epi16 (err, 4); hi = _mm256_and_si256 (hi, low_mask256); popcnt1 = _mm256_shuffle_epi8(lookup256, lo); popcnt2 = _mm256_shuffle_epi8(lookup256, hi); paccum = _mm256_add_epi8(paccum, popcnt1); paccum = _mm256_add_epi8(paccum, popcnt2); paccum = _mm256_sad_epu8(paccum, _mm256_setzero_si256()); uint64 result = paccum.m256i_i64[0] + paccum.m256i_u64[1] + paccum.m256i_i64[2] + paccum.m256i_i64[3]; return (int32)result; }
i verified output assembly code emitted compiler , looks good, expected direct translation of intrinsic instruction machine instruction. thing noticed on avx2 version, last line population count of 4 quad-words accumulated, generates more complex code sse3 version (where 2 quad-words need accumulated obtain population count), still expect faster throughput.
avx2 code generated quad-word accumulation
vextractf128 xmm0, ymm2, 1 psrldq xmm0, 8 movd ecx, xmm2 movd eax, xmm0 vextractf128 xmm0, ymm2, 1 psrldq xmm2, 8 add eax, ecx movd ecx, xmm0 add eax, ecx movd ecx, xmm2 add eax, ecx
sse3 code generated quad-word accumulation
movd ecx, xmm2 psrldq xmm2, 8 movd eax, xmm2 add eax, ecx
my test program calling 1 million times each routine, different input values, reusing 2 static buffers holding data of pa
, pb
parameters. in limited understanding of cpu architecture, locality (reusing same memory buffers on , over) should warm cpu caches nicely , not tied memory bandwidth problem, other possibly memory bandwidth, can't understand why there no performance improvement.
test routine
int _tmain(int argc, _tchar* argv[]) { lookup = _mm_setr_epi8( /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 ); low_mask = _mm_set1_epi8(0xf); lookup256 = _mm256_setr_epi8( /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4, /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, /* 8 */ 1, /* 9 */ 2, /* */ 2, /* b */ 3, /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4 ); low_mask256 = _mm256_set1_epi8(0xf); std::default_random_engine generator; generator.seed(37); std::uniform_int_distribution<uint32> distribution(0, ulong_max); auto dice = std::bind( distribution, generator); uint32 a[16]; uint32 b[16]; int count; count = 0; { cout << "avx popcount\r\n"; boost::timer::auto_cpu_timer t; for( int = 0; < 1000000; i++ ) { for( int j = 0; j < 16; j++ ) { a[j] = dice(); b[j] = dice(); } count+= avx_popcount(a, b); } } cout << count << "\r\n"; std::default_random_engine generator2; generator2.seed(37); std::uniform_int_distribution<uint32> distribution2(0, ulong_max); auto dice2 = std::bind( distribution2, generator2); count = 0; { cout << "sse popcount\r\n"; boost::timer::auto_cpu_timer t; for( int = 0; < 1000000; i++ ) { for( int j = 0; j < 16; j++ ) { a[j] = dice2(); b[j] = dice2(); } count+= sse_popcount(a, b); } } cout << count << "\r\n"; getch(); return 0; }
test machine intel corei7 4790, , i'm using visual studio 2012 pro.
in addition minor issues in comments (compiling /arch:avx
) primary problem generation of random input arrays on each iteration. bottleneck test not evaluating methods. note - i'm not using boost, gettickcount
works purpose. consider :
int count; count = 0; { cout << "avx popcount\r\n"; unsigned int tick = gettickcount(); (int = 0; < 1000000; i++) { (int j = 0; j < 16; j++) { a[j] = dice(); b[j] = dice(); } count += avx_popcount(a, b); } tick = gettickcount() - tick; cout << tick << "\r\n"; }
produces output :
avx popcount
2309
256002470
so 2309ms complete... happens if rid of avx routine altogether? make input arrays :
int count; count = 0; { cout << "just making arrays...\r\n"; unsigned int tick = gettickcount(); (int = 0; < 1000000; i++) { (int j = 0; j < 16; j++) { a[j] = dice(); b[j] = dice(); } } tick = gettickcount() - tick; cout << tick << "\r\n"; }
produces output:
just making arrays...
2246
how that. not surprising, really, since you're generating 32 random numbers, can quite expensive, , performing rather fast integer math , shuffles.
so...
now let's add factor of 100 more iterations , random generator out of tight loop. compiling here optimizations disabled run code expected , not throw away "useless" iterations - presumably code care here (manually) optimized!
(int j = 0; j < 16; j++) { a[j] = dice(); b[j] = dice(); } int count; count = 0; { cout << "avx popcount\r\n"; unsigned int tick = gettickcount(); (int = 0; < 100000000; i++) { count += avx_popcount(a, b); } tick = gettickcount() - tick; cout << tick << "\r\n"; } cout << count << "\r\n"; count = 0; { cout << "sse popcount\r\n"; unsigned int tick = gettickcount(); (int = 0; < 100000000; i++) { count += sse_popcount(a, b); } tick = gettickcount() - tick; cout << tick << "\r\n"; } cout << count << "\r\n";
produces output :
avx popcount
3744
730196224
sse popcount
5616
730196224
so congratulations - can pat on back, avx routine indeed third faster sse routine (tested on haswell i7 here). lesson sure profiling think profiling!
Comments
Post a Comment