Part 2: SSE Intrinsics - Example answer

#include "workshop.h"

#ifdef __SSE2__
  #include <emmintrin.h>
#else
  #warning SSE2 not supported. Code will not compile.
#endif

/**
 * Times the element-wise computation c = sqrt(a + b) over `size` floats in
 * two forms and prints both durations:
 *
 *   1. a plain scalar loop over workshop::Array<float>, and
 *   2. an SSE loop over workshop::AlignedArray<__m128>, where each __m128
 *      holds `lanes` consecutive floats.
 *
 * Both data sets are filled with the same values (a[i] = i+1,
 * b[i] = 2.5*(i+1)) so the two loops perform the same arithmetic.
 *
 * @param argc unused.
 * @param argv unused.
 * @return 0 always.
 */
int main(int argc, char **argv)
{
    // The command line is not consulted; discard explicitly to avoid
    // unused-parameter warnings.
    (void)argc;
    (void)argv;

    // Number of scalar elements processed by each variant.
    constexpr int size = 512;

    // Number of floats packed into a single __m128.
    constexpr int lanes = 4;

    // How many times each kernel is repeated inside its timed region. Shared
    // by both variants so that the two timings stay comparable.
    constexpr int repeats = 100000;

    // The SSE loops visit only size/lanes packed elements; a remainder would
    // be silently skipped and the two variants would no longer do equal work.
    static_assert(size % lanes == 0,
                  "size must be a multiple of the SSE vector width");

    constexpr int vector_size = size / lanes;

    auto a = workshop::Array<float>(size);
    auto b = workshop::Array<float>(size);
    auto c = workshop::Array<float>(size);

    auto sse_a = workshop::AlignedArray<__m128>(vector_size);
    auto sse_b = workshop::AlignedArray<__m128>(vector_size);
    auto sse_c = workshop::AlignedArray<__m128>(vector_size);

    // Scalar inputs.
    for (int i=0; i<size; ++i)
    {
        a[i] = 1.0*(i+1);
        b[i] = 2.5*(i+1);
        c[i] = 0.0;
    }

    // Packed inputs holding the same values as the scalar arrays.
    // _mm_set_ps takes its arguments from the highest lane down to the
    // lowest, hence the descending i+3 ... i+0 order.
    for (int i=0; i<size; i+=lanes)
    {
        sse_a[i/lanes] = _mm_set_ps(1.0*(i+3+1),
                                    1.0*(i+2+1),
                                    1.0*(i+1+1),
                                    1.0*(i+0+1));

        sse_b[i/lanes] = _mm_set_ps(2.5*(i+3+1),
                                    2.5*(i+2+1),
                                    2.5*(i+1+1),
                                    2.5*(i+0+1));

        sse_c[i/lanes] = _mm_set1_ps(0.0);
    }

    // NOTE(review): neither c nor sse_c is read after the timed loops below,
    // so an optimizing compiler may be able to drop some or all of the work -
    // confirm the reported timings at the optimization level you build with.

    // Timed region 1: scalar loop.
    auto timer = workshop::start_timer();

    for (int j=0; j<repeats; ++j)
    {
        for (int i=0; i<size; ++i)
        {
            c[i] = std::sqrt(a[i] + b[i]);
        }
    }

    auto duration = workshop::get_duration(timer);

    // Timed region 2: SSE loop, `lanes` floats per iteration.
    timer = workshop::start_timer();

    for (int j=0; j<repeats; ++j)
    {
        for (int i=0; i<vector_size; ++i)
        {
            sse_c[i] = _mm_sqrt_ps( _mm_add_ps(sse_a[i], sse_b[i]) );
        }
    }

    auto vector_duration = workshop::get_duration(timer);

    std::cout << "The standard loop took " << duration
              << " microseconds to complete." << std::endl;

    std::cout << "The vectorised loop took " << vector_duration
              << " microseconds to complete." << std::endl;

    return 0;
}

Previous Up Next