Index: liboil/simdpack/multsum.c
===================================================================
RCS file: /cvs/liboil/liboil/liboil/simdpack/multsum.c,v
retrieving revision 1.12
diff -u -p -r1.12 multsum.c
--- liboil/simdpack/multsum.c 13 Dec 2005 20:13:44 -0000 1.12
+++ liboil/simdpack/multsum.c 17 May 2006 18:00:54 -0000
@@ -55,3 +55,47 @@ static void multsum_f32_unroll2 (float *
 }
 OIL_DEFINE_IMPL (multsum_f32_unroll2, multsum_f32);
+
+static void multsum_f64_unroll8 (double *dest, double *src1, int sstr1,
+    double *src2, int sstr2, int n)
+{
+  int i = 0;
+  double sum = 0;
+
+  while(i /* NOTE(review): this copy of the patch is cut off here. The rest of multsum_f64_unroll8, whatever followed it, and the header names of the #include lines below are missing (it looks as if text between a less-than sign and a later greater-than sign was stripped). Recover them from the original patch before applying. */
+#include
+#include
+ /* Accumulation-step macros. Each one multiplies a pair of doubles from src1 by a pair from src2 and adds both products lane-wise into sum.reg. They rely on t1, t2, sum, src1, src2, sstr1 and sstr2 being in scope where they are expanded. */
+#define MULTSUM_SSE2_NSTRIDED(i) { /* both inputs: one _mm_load_pd at offset i. Not referenced in the visible part of this patch. NOTE(review): _mm_load_pd requires a 16-byte-aligned address. */ \
+  t1 = _mm_load_pd(&OIL_GET(src1, i, double)); \
+  t2 = _mm_load_pd(&OIL_GET(src2, i, double)); \
+  t1 = _mm_mul_pd(t1,t2); \
+  sum.reg = _mm_add_pd(sum.reg,t1); \
+}
+#define MULTSUM_SSE2_NSTRIDEDP(i) { /* src1: one _mm_load_pd at offset i*sstr1; src2: low half from offset i*sstr2, high half from (i+1)*sstr2. Not referenced in the visible part of this patch. */ \
+  t1 = _mm_load_pd(&OIL_GET(src1, i*sstr1, double)); \
+  t2 = _mm_loadl_pd(t2, &OIL_GET(src2, i*sstr2, double)); \
+  t2 = _mm_loadh_pd(t2, &OIL_GET(src2, (i+1)*sstr2, double)); \
+  t1 = _mm_mul_pd(t1,t2); \
+  sum.reg = _mm_add_pd(sum.reg,t1); \
+}
+#define MULTSUM_SSE2_STRIDED(i) { /* both inputs are gathered one double at a time: low half from offset i*sstr, high half from offset (i+1)*sstr. The argument i is pasted unparenthesized, so pass a plain constant. */ \
+  t1 = _mm_loadl_pd(t1, &OIL_GET(src1, i*sstr1, double)); \
+  t1 = _mm_loadh_pd(t1, &OIL_GET(src1, (i+1)*sstr1, double)); \
+  t2 = _mm_loadl_pd(t2, &OIL_GET(src2, i*sstr2, double)); \
+  t2 = _mm_loadh_pd(t2, &OIL_GET(src2, (i+1)*sstr2, double)); \
+  t1 = _mm_mul_pd(t1,t2); \
+  sum.reg = _mm_add_pd(sum.reg,t1); \
+}
+ /* multsum_f64_sse2_unroll4: stores in *dest the sum of the products of n corresponding doubles read from src1 and src2. sstr1 and sstr2 are the per-element offsets handed to OIL_GET and OIL_INCREMENT (presumably byte strides; confirm against the liboil headers). */
+ /* Two partial sums are kept in the lanes of one __m128d (elements 0,2,4,... in the low lane and 1,3,5,... in the high lane) and are added together only at the end, so the summation order differs from a plain left-to-right loop. */
+static void
+multsum_f64_sse2_unroll4(double *dest,
+    const double *src1, int sstr1,
+    const double *src2, int sstr2,
+    int n)
+{
+  __m128d t1, t2; /* NOTE(review): both are passed to _mm_loadl_pd before being assigned. Only the high half comes from the unassigned value and _mm_loadh_pd replaces it in the next statement, but compilers may warn. */
+  union { /* lets the two lanes of the accumulator be read back as plain doubles */
+    __m128d reg;
+    double vals[2];
+  } sum;
+  int i = 0; /* number of elements consumed so far; src1 and src2 are advanced in step with it */
+
+  sum.reg = _mm_setzero_pd();
+  while (i < n-3) { /* main loop: four elements per iteration, as two pairs */
+    MULTSUM_SSE2_STRIDED(0);
+    MULTSUM_SSE2_STRIDED(2);
+
+    OIL_INCREMENT(src1, 4*sstr1);
+    OIL_INCREMENT(src2, 4*sstr2);
+    i += 4;
+  }
+  while (i < n-1) { /* one more pair if at least two elements remain */
+    MULTSUM_SSE2_STRIDED(0);
+
+    OIL_INCREMENT(src1, 2*sstr1);
+    OIL_INCREMENT(src2, 2*sstr2);
+    i+=2;
+  }
+  *dest = sum.vals[0] + sum.vals[1]; /* combine the two lane sums */
+  if (i < n) { /* a single element is left over (i is always even here, so this is the odd-n case) */
+    *dest += (OIL_GET(src1,0,double)*OIL_GET(src2,0,double));
+  }
+}
+OIL_DEFINE_IMPL_FULL (multsum_f64_sse2_unroll4, multsum_f64, OIL_IMPL_FLAG_SSE2); /* presumably registers the function above as an SSE2-flagged implementation of multsum_f64; confirm against liboil's liboilfunction.h */