Accumulate vector of integer with sse

Here is an int version which I just threw together:

#include <iostream>
#include <vector>

#include <smmintrin.h>  // SSE4

#define ROUND_DOWN(m, n) ((m) & ~((n) - 1))

static int accumulate(const std::vector<int>& v)
{
    // copy the length of v and a pointer to the data onto the local stack
    const size_t N = v.size();
    const int* p = (N > 0) ? &v.front() : NULL;

    __m128i mmSum = _mm_setzero_si128();
    int sum = 0;
    size_t i = 0;

    // unrolled loop that adds up 4 elements at a time
    for(; i < ROUND_DOWN(N, 4); i+=4)
    {
        mmSum = _mm_add_epi32(mmSum, _mm_loadu_si128((__m128i *)(p + i)));
    }

    // add up the four int values from mmSum into a single value
    mmSum = _mm_hadd_epi32(mmSum, mmSum);
    mmSum = _mm_hadd_epi32(mmSum, mmSum);
    sum = _mm_extract_epi32(mmSum, 0);

    // add up single values until all elements are covered
    for(; i < N; i++)
    {
        sum += p[i];
    }

    return sum;
}

int main()
{
    std::vector<int> v;

    for (int i = 0; i < 10; ++i)
    {
        v.push_back(i);
    }

    int sum = accumulate(v);

    std::cout << sum << std::endl;

    return 0;
}

Compile and run:

$ g++ -Wall -msse4 -O3 accumulate.cpp && ./a.out 
45

The ideal way to do this is to let the compiler auto-vectorize your code and keep your code simple and readable. You don't should not need anything more that

int sum = 0;
for(int i=0; i<v.size(); i++) sum += v[i];

The link you pointed to, http://fastcpp.blogspot.com.au/2011/04/how-to-process-stl-vector-using-sse.html, does not seem to understand how to make the compiler vectorize the code.

For floating point, which is what that link uses, what you need to know is that floating point arithmetic is not associative and therefore depends on the order that you do the reduction. GCC, MSVC, and Clang will not do auto-vectorization for a reduction unless you tell it to use a different floating point model otherwise your result could depend on your hardware. ICC, however, defaults to associative floating point math so it will vectorize the code with e.g. -O3.

Not only will GCC, MSVC, and Clang not vectorize unless associative math is allowed but they won't unroll the loop to allow partial sums in order to overcome the latency of the summation. In this case only Clang and ICC will unroll to partial sums anyway. Clang unrolls four times and ICC twice.

One way to enable associative floating point arithmetic with GCC is with the -Ofast flag. With MSVC use /fp:fast

I tested the code below with GCC 4.9.2, XeonE5-1620 (IVB) @ 3.60GHz, Ubuntu 15.04.

-O3 -mavx -fopenmp                       0.93 s
-Ofast -mavx -fopenmp                    0.19 s
-Ofast -mavx -fopenmp -funroll-loops     0.19 s

That's about a five times speed-up. Although, GCC does unroll the loop eight times it does not do independent partial sums (see the assembly below). This is the reason the unrolled version is no better.

I only used OpenMP for its convenient cross-platform/compiler timing function: omp_get_wtime().

Another advantage auto-vectorization has is it works for AVX simply by enabling a compiler switch (e.g. -mavx). Otherwise, if you wanted AVX, you would have to rewrite your code to use the AVX intrinsics and maybe have to ask another question on SO on how to do this.

So currently the only compiler which will auto-vectorize your loop as well as unroll to four partial sums is Clang. See the code and assembly at the end of this answer.


Here is the code I used to test the performance

#include <stdio.h>
#include <omp.h>
#include <vector>

float sumf(float *x, int n)
{
  float sum = 0;
  for(int i=0; i<n; i++) sum += x[i];
  return sum;
}

#define N 10000 // the link used this value
int main(void)
{
  std::vector<float> x;
  for(int i=0; i<N; i++) x.push_back(1 -2*(i%2==0));
  //float x[N]; for(int i=0; i<N; i++) x[i] = 1 -2*(i%2==0);                                                                                                                                                        
  float sum = 0;
  sum += sumf(x.data(),N);
  double dtime = -omp_get_wtime();
  for(int r=0; r<100000; r++) {
    sum += sumf(x.data(),N);
  }
  dtime +=omp_get_wtime();
  printf("sum %f time %f\n", sum, dtime);
}

Edit:

I should have taken my own advice and looked at the assembly.

The main loop for -O3. It's clear it only does a scalar sum.

.L3:
    vaddss  (%rdi), %xmm0, %xmm0
    addq    $4, %rdi
    cmpq    %rax, %rdi
    jne .L3

The main loop for -Ofast. It does a vector sum but no unrolling.

.L8:
    addl    $1, %eax
    vaddps  (%r8), %ymm1, %ymm1
    addq    $32, %r8
    cmpl    %eax, %ecx
    ja  .L8

The main loop for -O3 -funroll-loops. Vector sum with 8x unroll

.L8:
    vaddps  (%rax), %ymm1, %ymm2
    addl    $8, %ebx
    addq    $256, %rax
    vaddps  -224(%rax), %ymm2, %ymm3
    vaddps  -192(%rax), %ymm3, %ymm4
    vaddps  -160(%rax), %ymm4, %ymm5
    vaddps  -128(%rax), %ymm5, %ymm6
    vaddps  -96(%rax), %ymm6, %ymm7
    vaddps  -64(%rax), %ymm7, %ymm8
    vaddps  -32(%rax), %ymm8, %ymm1
    cmpl    %ebx, %r9d
    ja  .L8

Edit:

Putting the following code in Clang 3.7 (-O3 -fverbose-asm -mavx)

float sumi(int *x)
{
  x = (int*)__builtin_assume_aligned(x, 64);
  int sum = 0;
  for(int i=0; i<2048; i++) sum += x[i];
  return sum;
}

produces the following assembly. Notice that it's vectorized to four independent partial sums.

sumi(int*):                              # @sumi(int*)
    vpxor   xmm0, xmm0, xmm0
    xor eax, eax
    vpxor   xmm1, xmm1, xmm1
    vpxor   xmm2, xmm2, xmm2
    vpxor   xmm3, xmm3, xmm3
.LBB0_1:                                # %vector.body
    vpaddd  xmm0, xmm0, xmmword ptr [rdi + 4*rax]
    vpaddd  xmm1, xmm1, xmmword ptr [rdi + 4*rax + 16]
    vpaddd  xmm2, xmm2, xmmword ptr [rdi + 4*rax + 32]
    vpaddd  xmm3, xmm3, xmmword ptr [rdi + 4*rax + 48]
    vpaddd  xmm0, xmm0, xmmword ptr [rdi + 4*rax + 64]
    vpaddd  xmm1, xmm1, xmmword ptr [rdi + 4*rax + 80]
    vpaddd  xmm2, xmm2, xmmword ptr [rdi + 4*rax + 96]
    vpaddd  xmm3, xmm3, xmmword ptr [rdi + 4*rax + 112]
    add rax, 32
    cmp rax, 2048
    jne .LBB0_1
    vpaddd  xmm0, xmm1, xmm0
    vpaddd  xmm0, xmm2, xmm0
    vpaddd  xmm0, xmm3, xmm0
    vpshufd xmm1, xmm0, 78          # xmm1 = xmm0[2,3,0,1]
    vpaddd  xmm0, xmm0, xmm1
    vphaddd xmm0, xmm0, xmm0
    vmovd   eax, xmm0
    vxorps  xmm0, xmm0, xmm0
    vcvtsi2ss   xmm0, xmm0, eax
    ret