Index: liboil/sse/multsum_sse.c =================================================================== RCS file: /cvs/liboil/liboil/liboil/sse/multsum_sse.c,v retrieving revision 1.2 diff -u -r1.2 multsum_sse.c --- liboil/sse/multsum_sse.c 23 May 2006 04:00:30 -0000 1.2 +++ liboil/sse/multsum_sse.c 31 May 2006 18:09:17 -0000 @@ -5,64 +5,309 @@ #include #include -#define MULTSUM_SSE2_NSTRIDED(i) { \ - t1 = _mm_load_pd(&OIL_GET(src1, i, double)); \ - t2 = _mm_load_pd(&OIL_GET(src2, i, double)); \ - t1 = _mm_mul_pd(t1,t2); \ - sum.reg = _mm_add_pd(sum.reg,t1); \ +#define MULTSUM_F32_SSE2_NSTRIDED_U(i) \ + t1 = _mm_load_ps(src1+i); \ + t2 = _mm_loadu_ps(src2+i); \ + t1 = _mm_mul_ps(t1,t2); \ + sum = _mm_add_ps(sum,t1); + +#define MULTSUM_F32_SSE2_NSTRIDED(i) \ + t1 = _mm_loadu_ps(src1+i); \ + t2 = _mm_loadu_ps(src2+i); \ + t1 = _mm_mul_ps(t1,t2); \ + sum = _mm_add_ps(sum,t1); + + +static void +multsum_f32_ns_sse(float *dest, + const float *src1, + const float *src2, + int n) +{ + float tmp[4]; + __m128 t1, t2, sum; + + sum = _mm_setzero_ps(); + + *dest = 0; + while ((((int)src1)&15) && 0 < n) { + *dest += *src1++ * *src2++; + n--; + } + + while (3 < n) { + MULTSUM_F32_SSE2_NSTRIDED_U(0); + + src1 += 4; + src2 += 4; + n -= 4; + } + _mm_storeu_ps(tmp,sum); + *dest += tmp[0] + tmp[1] + tmp[2] + tmp[3]; + while (0 < n) { + *dest += *src1++ * *src2++; + n--; + } } -#define MULTSUM_SSE2_NSTRIDEDP(i) { \ - t1 = _mm_load_pd(&OIL_GET(src1, i*sstr1, double)); \ - t2 = _mm_loadl_pd(t2, &OIL_GET(src2, i*sstr2, double)); \ - t2 = _mm_loadh_pd(t2, &OIL_GET(src2, (i+1)*sstr2, double)); \ +OIL_DEFINE_IMPL_FULL(multsum_f32_ns_sse, multsum_f32_ns, OIL_IMPL_FLAG_SSE); + +#define MULTSUM_SSE2_NSTRIDED(i) \ + t1 = _mm_load_pd(src1+i); \ + t2 = _mm_loadu_pd(src2+i); \ t1 = _mm_mul_pd(t1,t2); \ - sum.reg = _mm_add_pd(sum.reg,t1); \ + sum = _mm_add_pd(sum,t1); + +static void +multsum_f64_ns_sse2_unroll2(double *dest, + const double *src1, + const double *src2, + int n) +{ + double tmp; + __m128d t1, t2, sum; + + sum = _mm_setzero_pd(); + + *dest = 0; + while ((((int)src1)&15) && 0 < n) { + *dest += *src1++ * *src2++; + n--; + } + + while (5 < n) { + MULTSUM_SSE2_NSTRIDED(0); + MULTSUM_SSE2_NSTRIDED(2); + MULTSUM_SSE2_NSTRIDED(4); + + src1+=6; + src2+=6; + n -= 6; + } + while (3 < n) { + MULTSUM_SSE2_NSTRIDED(0); + MULTSUM_SSE2_NSTRIDED(2); + + src1+=4; + src2+=4; + n -= 4; + } + while (1 < n) { + MULTSUM_SSE2_NSTRIDED(0); + + src1+=2; + src2+=2; + n -= 2; + } + _mm_storel_pd(&tmp,sum); + *dest += tmp; + _mm_storeh_pd(&tmp,sum); + *dest += tmp; + while (0 < n) { + *dest += *src1++ * *src2++; + n--; + } +} +OIL_DEFINE_IMPL_FULL(multsum_f64_ns_sse2_unroll2, multsum_f64_ns, OIL_IMPL_FLAG_SSE2); + +static void +multsum_f64_ns_sse2_unroll(double *dest, + const double *src1, + const double *src2, + int n) +{ + double tmp; + __m128d t1, t2, sum; + + sum = _mm_setzero_pd(); + + *dest = 0; + while ((((int)src1)&15) && 0 < n) { + *dest += *src1++ * *src2++; + n--; + } + + while (3 < n) { + MULTSUM_SSE2_NSTRIDED(0); + MULTSUM_SSE2_NSTRIDED(2); + + src1+=4; + src2+=4; + n -= 4; + } + while (1 < n) { + MULTSUM_SSE2_NSTRIDED(0); + + src1+=2; + src2+=2; + n -= 2; + } + _mm_storel_pd(&tmp,sum); + *dest += tmp; + _mm_storeh_pd(&tmp,sum); + *dest += tmp; + while (0 < n) { + *dest += *src1++ * *src2++; + n--; + } +} +OIL_DEFINE_IMPL_FULL(multsum_f64_ns_sse2_unroll, multsum_f64_ns, OIL_IMPL_FLAG_SSE2); + +static void +multsum_f64_ns_sse2(double *dest, + const double *src1, + const double *src2, + int n) +{ + double tmp; + __m128d t1, t2, sum; + + sum = _mm_setzero_pd(); + + *dest = 0; + while ((((int)src1)&15) && 0 < n) { + *dest += *src1++ * *src2++; + n--; + } + while (1 < n) { + MULTSUM_SSE2_NSTRIDED(0); + + src1+=2; + src2+=2; + n -= 2; + } + _mm_storel_pd(&tmp,sum); + *dest += tmp; + _mm_storeh_pd(&tmp,sum); + *dest += tmp; + while (0 < n) { + *dest += *src1++ * *src2++; + n--; + } } -#define MULTSUM_SSE2_STRIDED(i) { \ +OIL_DEFINE_IMPL_FULL(multsum_f64_ns_sse2, multsum_f64_ns, OIL_IMPL_FLAG_SSE2); + +#define MULTSUM_SSE2_STRIDED(i) \ t1 = _mm_loadl_pd(t1, &OIL_GET(src1, i*sstr1, double)); \ t1 = _mm_loadh_pd(t1, &OIL_GET(src1, (i+1)*sstr1, double)); \ t2 = _mm_loadl_pd(t2, &OIL_GET(src2, i*sstr2, double)); \ t2 = _mm_loadh_pd(t2, &OIL_GET(src2, (i+1)*sstr2, double)); \ t1 = _mm_mul_pd(t1,t2); \ - sum.reg = _mm_add_pd(sum.reg,t1); \ -} + sum = _mm_add_pd(sum,t1); -#ifdef ENABLE_BROKEN_IMPLS static void -multsum_f64_sse2_unroll4(double *dest, +multsum_f64_sse2_unrolla(double *dest, const double *src1, int sstr1, const double *src2, int sstr2, int n) { - __m128d t1, t2; - union { - __m128d reg; - double vals[2]; - } sum; - int i = 0; + double tmp; + __m128d t1, t2, sum; - sum.reg = _mm_setzero_pd(); - while (i < n-3) { + sum = _mm_setzero_pd(); + while (3 < n) { MULTSUM_SSE2_STRIDED(0); MULTSUM_SSE2_STRIDED(2); OIL_INCREMENT(src1, 4*sstr1); OIL_INCREMENT(src2, 4*sstr2); - i += 4; + n -= 4; } - while (i < n-1) { + while (1 < n) { MULTSUM_SSE2_STRIDED(0); OIL_INCREMENT(src1, 2*sstr1); OIL_INCREMENT(src2, 2*sstr2); - i+=2; + n -= 2; + } + _mm_storel_pd(dest,sum); + _mm_storeh_pd(&tmp,sum); + *dest += tmp; + while (0 < n) { + *dest += *src1 * *src2; + OIL_INCREMENT(src1, sstr1); + OIL_INCREMENT(src2, sstr2); + n--; + } +} +OIL_DEFINE_IMPL_FULL(multsum_f64_sse2_unrolla, multsum_f64, OIL_IMPL_FLAG_SSE2); + +static void +multsum_f64_sse2_unrollb(double *dest, + const double *src1, int sstr1, + const double *src2, int sstr2, + int n) +{ + double tmp; + __m128d t1, t2, t3, t4, sum1, sum2; + + sum1 = _mm_setzero_pd(); + sum2 = _mm_setzero_pd(); + while (3 < n) { + t1 = _mm_loadl_pd(t1, &OIL_GET(src1, 0, double)); + t1 = _mm_loadh_pd(t1, &OIL_GET(src1, sstr1, double)); + t3 = _mm_loadl_pd(t3, &OIL_GET(src1, 2*sstr1, double)); + t3 = _mm_loadh_pd(t3, &OIL_GET(src1, 3*sstr1, double)); + + t2 = _mm_loadl_pd(t2, &OIL_GET(src2, 0, double)); + t2 = _mm_loadh_pd(t2, &OIL_GET(src2, sstr2, double)); + t4 = _mm_loadl_pd(t4, &OIL_GET(src2, 2*sstr2, double)); + t4 = _mm_loadh_pd(t4, &OIL_GET(src2, 3*sstr2, double)); + + t1 = _mm_mul_pd(t1,t2); + t3 = _mm_mul_pd(t3,t4); + sum1 = _mm_add_pd(sum1,t1); + sum2 = _mm_add_pd(sum2,t3); + + OIL_INCREMENT(src1, 4*sstr1); + OIL_INCREMENT(src2, 4*sstr2); + n -= 4; } - *dest = sum.vals[0] + sum.vals[1]; - if (i < n) { - *dest += (OIL_GET(src1,0,double)*OIL_GET(src2,0,double)); + sum1 = _mm_add_pd(sum1,sum2); + _mm_storel_pd(dest,sum1); + _mm_storeh_pd(&tmp,sum1); + *dest += tmp; + while (0 < n) { + *dest += *src1 * *src2; + OIL_INCREMENT(src1, sstr1); + OIL_INCREMENT(src2, sstr2); + n--; } } -OIL_DEFINE_IMPL_FULL (multsum_f64_sse2_unroll4, multsum_f64, OIL_IMPL_FLAG_SSE2); -#endif +OIL_DEFINE_IMPL_FULL(multsum_f64_sse2_unrollb, multsum_f64, OIL_IMPL_FLAG_SSE2); + +static void +multsum_f64_sse2(double *dest, + const double *src1, int sstr1, + const double *src2, int sstr2, + int n) +{ + double tmp; + __m128d t1, t2, sum1; + + sum1 = _mm_setzero_pd(); + while (1 < n) { + t1 = _mm_loadl_pd(t1, &OIL_GET(src1, 0, double)); + t1 = _mm_loadh_pd(t1, &OIL_GET(src1, sstr1, double)); + + t2 = _mm_loadl_pd(t2, &OIL_GET(src2, 0, double)); + t2 = _mm_loadh_pd(t2, &OIL_GET(src2, sstr2, double)); + + t1 = _mm_mul_pd(t1,t2); + sum1 = _mm_add_pd(sum1,t1); + OIL_INCREMENT(src1, 2*sstr1); + OIL_INCREMENT(src2, 2*sstr2); + n -= 2; + } + _mm_storel_pd(dest,sum1); + _mm_storeh_pd(&tmp,sum1); + *dest += tmp; + while (0 < n) { + *dest += *src1 * *src2; + OIL_INCREMENT(src1, sstr1); + OIL_INCREMENT(src2, sstr2); + n--; + } +} +OIL_DEFINE_IMPL_FULL(multsum_f64_sse2, multsum_f64, OIL_IMPL_FLAG_SSE2); Index: liboil/ref/multsum.c =================================================================== RCS file: /cvs/liboil/liboil/liboil/ref/multsum.c,v retrieving revision 1.3 diff -u -r1.3 multsum.c --- liboil/ref/multsum.c 16 Dec 2005 07:45:29 -0000 1.3 +++ liboil/ref/multsum.c 31 May 2006 18:09:18 -0000 @@ -34,7 +34,39 @@ #include +#define MULTSUM_NS_DEFINE_REF(type) \ +OIL_DEFINE_CLASS (multsum_ ## type ## _ns, \ + "oil_type_" #type " *dest, " \ + "oil_type_" #type " *src1, " \ + "oil_type_" #type " *src2, " \ + "int n"); \ +static void multsum_ ## type ## _ns_ref( \ + oil_type_ ## type *dest, \ + oil_type_ ## type *src1, \ + oil_type_ ## type *src2, \ + int n) \ +{ \ + int i; \ + double sum = 0; \ + double errsum = 0; \ + for(i=0;i