c++ - AVX 256-bit code performing slightly worse than equivalent 128-bit SSSE3 code -


i trying write efficient hamming-distance code. inspired wojciech muła's extremely clever sse3 popcount implementation, coded avx2 equivalent solution, time using 256 bit registers. l expecting @ least 30%-40% improvement based on doubled parallelism of involved operations, surprise, avx2 code tad slower (around 2%)!

can enlighten me of possible reasons why i'm not obtaining expected performance boost?

unrolled, sse3 hamming distance of 2 64-byte blocks:

int32 sse_popcount(const uint32* __restrict pa, const uint32* __restrict pb) {     __m128i paccum  = _mm_setzero_si128();     __m128i       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pa));    __m128i b       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pb));    __m128i err     = _mm_xor_si128   (a, b);    __m128i lo      = _mm_and_si128   (err, low_mask);    __m128i hi      = _mm_srli_epi16  (err, 4);            hi      = _mm_and_si128   (hi, low_mask);    __m128i popcnt1 = _mm_shuffle_epi8(lookup, lo);    __m128i popcnt2 = _mm_shuffle_epi8(lookup, hi);            paccum  = _mm_add_epi8(paccum, popcnt1);            paccum  = _mm_add_epi8(paccum, popcnt2);                   = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pa + 4));            b       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pb + 4));            err     = _mm_xor_si128   (a, b);            lo      = _mm_and_si128   (err, low_mask);            hi      = _mm_srli_epi16  (err, 4);            hi      = _mm_and_si128   (hi, low_mask);            popcnt1 = _mm_shuffle_epi8(lookup, lo);            popcnt2 = _mm_shuffle_epi8(lookup, hi);            paccum  = _mm_add_epi8(paccum, popcnt1);            paccum  = _mm_add_epi8(paccum, popcnt2);                   = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pa + 8));            b       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pb + 8));            err     = _mm_xor_si128   (a, b);            lo      = _mm_and_si128   (err, low_mask);            hi      = _mm_srli_epi16  (err, 4);            hi      = _mm_and_si128   (hi, low_mask);            popcnt1 = _mm_shuffle_epi8(lookup, lo);            popcnt2 = _mm_shuffle_epi8(lookup, hi);            paccum  = _mm_add_epi8(paccum, popcnt1);            paccum  = _mm_add_epi8(paccum, popcnt2);                   = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pa + 12));            b       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pb + 12));            err     = _mm_xor_si128   (a, b);            lo      = _mm_and_si128   (err, low_mask);            hi      = _mm_srli_epi16  (err, 4);            hi      = _mm_and_si128   (hi, low_mask);            popcnt1 = _mm_shuffle_epi8(lookup, lo);            popcnt2 = _mm_shuffle_epi8(lookup, hi);            paccum  = _mm_add_epi8(paccum, popcnt1);            paccum  = _mm_add_epi8(paccum, popcnt2);             paccum  = _mm_sad_epu8(paccum, _mm_setzero_si128());    uint64  result =  paccum.m128i_u64[0] + paccum.m128i_u64[1];    return (int32)result; } 

ununrolled, equivalent version using avx's 256-bit registers:

int32 avx_popcount(const uint32* __restrict pa, const uint32* __restrict pb) {    __m256i paccum =  _mm256_setzero_si256();     __m256i       = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pa));    __m256i b       = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pb));    __m256i err     = _mm256_xor_si256   (a, b);    __m256i lo      = _mm256_and_si256   (err, low_mask256);    __m256i hi      = _mm256_srli_epi16  (err, 4);            hi      = _mm256_and_si256   (hi, low_mask256);    __m256i popcnt1 = _mm256_shuffle_epi8(lookup256, lo);    __m256i popcnt2 = _mm256_shuffle_epi8(lookup256, hi);            paccum  = _mm256_add_epi8(paccum, popcnt1);            paccum  = _mm256_add_epi8(paccum, popcnt2);                   = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pa + 8));            b       = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pb + 8));            err     = _mm256_xor_si256   (a, b);            lo      = _mm256_and_si256   (err, low_mask256);            hi      = _mm256_srli_epi16  (err, 4);            hi      = _mm256_and_si256   (hi, low_mask256);            popcnt1 = _mm256_shuffle_epi8(lookup256, lo);            popcnt2 = _mm256_shuffle_epi8(lookup256, hi);            paccum  = _mm256_add_epi8(paccum, popcnt1);            paccum  = _mm256_add_epi8(paccum, popcnt2);             paccum  = _mm256_sad_epu8(paccum, _mm256_setzero_si256());            uint64  result =  paccum.m256i_i64[0] + paccum.m256i_u64[1] + paccum.m256i_i64[2] + paccum.m256i_i64[3];    return (int32)result; } 

i verified output assembly code emitted compiler , looks good, expected direct translation of intrinsic instruction machine instruction. thing noticed on avx2 version, last line population count of 4 quad-words accumulated, generates more complex code sse3 version (where 2 quad-words need accumulated obtain population count), still expect faster throughput.

avx2 code generated quad-word accumulation

vextractf128 xmm0, ymm2, 1 psrldq  xmm0, 8 movd    ecx, xmm2 movd    eax, xmm0 vextractf128 xmm0, ymm2, 1 psrldq  xmm2, 8 add eax, ecx movd    ecx, xmm0 add eax, ecx movd    ecx, xmm2 add eax, ecx 

sse3 code generated quad-word accumulation

movd    ecx, xmm2 psrldq  xmm2, 8 movd    eax, xmm2 add eax, ecx 

my test program calling 1 million times each routine, different input values, reusing 2 static buffers holding data of pa , pb parameters. in limited understanding of cpu architecture, locality (reusing same memory buffers on , over) should warm cpu caches nicely , not tied memory bandwidth problem, other possibly memory bandwidth, can't understand why there no performance improvement.

test routine

int _tmain(int argc, _tchar* argv[]) {     lookup = _mm_setr_epi8(         /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,         /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,         /* 8 */ 1, /* 9 */ 2, /* */ 2, /* b */ 3,         /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4     );    low_mask = _mm_set1_epi8(0xf);     lookup256 = _mm256_setr_epi8(         /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,         /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,         /* 8 */ 1, /* 9 */ 2, /* */ 2, /* b */ 3,         /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,         /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,         /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,         /* 8 */ 1, /* 9 */ 2, /* */ 2, /* b */ 3,         /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4     );     low_mask256 = _mm256_set1_epi8(0xf);      std::default_random_engine generator;    generator.seed(37);    std::uniform_int_distribution<uint32> distribution(0, ulong_max);    auto dice = std::bind( distribution, generator);      uint32 a[16];    uint32 b[16];     int count;    count = 0;    {       cout << "avx popcount\r\n";       boost::timer::auto_cpu_timer t;       for( int = 0; < 1000000; i++ ) {          for( int j = 0; j < 16; j++ ) {             a[j] = dice();             b[j] = dice();          }          count+= avx_popcount(a, b);       }    }     cout << count << "\r\n";      std::default_random_engine generator2;    generator2.seed(37);    std::uniform_int_distribution<uint32> distribution2(0, ulong_max);    auto dice2 = std::bind( distribution2, generator2);      count = 0;    {       cout << "sse popcount\r\n";       boost::timer::auto_cpu_timer t;       for( int = 0; < 1000000; i++ ) {          for( int j = 0; j < 16; j++ ) {             a[j] = dice2();             b[j] = dice2();          }          count+= sse_popcount(a, b);       }    }    cout << count << "\r\n";     getch();    return 0; } 

test machine intel corei7 4790, , i'm using visual studio 2012 pro.

in addition minor issues in comments (compiling /arch:avx) primary problem generation of random input arrays on each iteration. bottleneck test not evaluating methods. note - i'm not using boost, gettickcount works purpose. consider :

int count; count = 0; {     cout << "avx popcount\r\n";     unsigned int tick = gettickcount();     (int = 0; < 1000000; i++) {         (int j = 0; j < 16; j++) {             a[j] = dice();             b[j] = dice();         }         count += avx_popcount(a, b);     }     tick = gettickcount() - tick;     cout << tick << "\r\n"; } 

produces output :

avx popcount
2309
256002470

so 2309ms complete... happens if rid of avx routine altogether? make input arrays :

int count; count = 0; {     cout << "just making arrays...\r\n";     unsigned int tick = gettickcount();     (int = 0; < 1000000; i++) {         (int j = 0; j < 16; j++) {             a[j] = dice();             b[j] = dice();         }                }     tick = gettickcount() - tick;     cout << tick << "\r\n"; } 

produces output:

just making arrays...
2246

how that. not surprising, really, since you're generating 32 random numbers, can quite expensive, , performing rather fast integer math , shuffles.

so...

now let's add factor of 100 more iterations , random generator out of tight loop. compiling here optimizations disabled run code expected , not throw away "useless" iterations - presumably code care here (manually) optimized!

    (int j = 0; j < 16; j++) {         a[j] = dice();         b[j] = dice();     }      int count;     count = 0;     {         cout << "avx popcount\r\n";         unsigned int tick = gettickcount();         (int = 0; < 100000000; i++) {                        count += avx_popcount(a, b);         }         tick = gettickcount() - tick;         cout << tick << "\r\n";     }      cout << count << "\r\n";      count = 0;     {         cout << "sse popcount\r\n";         unsigned int tick = gettickcount();         (int = 0; < 100000000; i++) {             count += sse_popcount(a, b);         }         tick = gettickcount() - tick;         cout << tick << "\r\n";     }     cout << count << "\r\n"; 

produces output :

avx popcount
3744
730196224
sse popcount
5616
730196224

so congratulations - can pat on back, avx routine indeed third faster sse routine (tested on haswell i7 here). lesson sure profiling think profiling!


Comments

Popular posts from this blog

python - argument must be rect style object - Pygame -

webrtc - Which ICE candidate am I using and why? -

c# - Better 64-bit byte array hash -