diff --git a/liboil/liboil-stdint.h b/liboil/liboil-stdint.h index 720851d..5826bb6 100644 --- a/liboil/liboil-stdint.h +++ b/liboil/liboil-stdint.h @@ -1,8 +1,8 @@ #ifndef _LIBOIL_LIBOIL_LIBOIL_STDINT_H #define _LIBOIL_LIBOIL_LIBOIL_STDINT_H 1 #ifndef _GENERATED_STDINT_H -#define _GENERATED_STDINT_H "liboil 0.3.13" -/* generated using gnu compiler gcc (GCC) 4.1.3 20070929 (prerelease) (Ubuntu 4.1.2-16ubuntu2) */ +#define _GENERATED_STDINT_H "liboil 0.3.14" +/* generated using gnu compiler gcc (GCC) 4.2.3 (Ubuntu 4.2.3-2ubuntu7) */ #define _STDINT_HAVE_STDINT_H 1 #include #endif diff --git a/liboil/sse/clamp_sse.c b/liboil/sse/clamp_sse.c index 5d34c6a..06c8ae7 100644 --- a/liboil/sse/clamp_sse.c +++ b/liboil/sse/clamp_sse.c @@ -32,7 +32,10 @@ #include #include -static void +/* TODO: If we have gcc 4.2 or above, do this. Otherwise, disable all SSE use */ +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) + +SSE_FUNCTION static void clamp_u8_sse (uint8_t *dest, uint8_t *src1, int n, uint8_t *src2_1, uint8_t *src3_1) { @@ -71,7 +74,7 @@ clamp_u8_sse (uint8_t *dest, uint8_t *src1, int n, uint8_t *src2_1, } OIL_DEFINE_IMPL_FULL (clamp_u8_sse, clamp_u8, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void clamp_s16_sse (int16_t *dest, int16_t *src1, int n, int16_t *src2_1, int16_t *src3_1) { @@ -110,7 +113,7 @@ clamp_s16_sse (int16_t *dest, int16_t *src1, int n, int16_t *src2_1, } OIL_DEFINE_IMPL_FULL (clamp_s16_sse, clamp_s16, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void clamp_f32_sse (float *dest, const float *src1, int n, const float *src2_1, const float *src3_1) { @@ -149,7 +152,7 @@ clamp_f32_sse (float *dest, const float *src1, int n, const float *src2_1, } OIL_DEFINE_IMPL_FULL (clamp_f32_sse, clamp_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void clamp_f64_sse (double *dest, const double *src1, int n, const double *src2_1, const double *src3_1) { @@ -189,7 +192,7 @@ clamp_f64_sse (double *dest, const double *src1, int n, const double *src2_1, OIL_DEFINE_IMPL_FULL (clamp_f64_sse, clamp_f64, OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void clamplow_u8_sse (uint8_t *dest, const uint8_t *src1, int n, const uint8_t *src2_1) { @@ -221,7 +224,7 @@ clamplow_u8_sse (uint8_t *dest, const uint8_t *src1, int n, } OIL_DEFINE_IMPL_FULL (clamplow_u8_sse, clamplow_u8, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void clamplow_s16_sse (int16_t *dest, const int16_t *src1, int n, const int16_t *src2_1) { @@ -253,7 +256,7 @@ clamplow_s16_sse (int16_t *dest, const int16_t *src1, int n, } OIL_DEFINE_IMPL_FULL (clamplow_s16_sse, clamplow_s16, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void clamplow_f32_sse (float *dest, const float *src1, int n, const float *src2_1) { __m128 xmm1; @@ -284,7 +287,7 @@ clamplow_f32_sse (float *dest, const float *src1, int n, const float *src2_1) } OIL_DEFINE_IMPL_FULL (clamplow_f32_sse, clamplow_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1) { __m128d xmm1; @@ -316,7 +319,7 @@ clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1) OIL_DEFINE_IMPL_FULL (clamplow_f64_sse, clamplow_f64, OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n, const uint8_t *src2_1) { @@ -348,7 +351,7 @@ clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n, } OIL_DEFINE_IMPL_FULL (clamphigh_u8_sse, clamphigh_u8, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void clamphigh_s16_sse (int16_t *dest, const int16_t *src1, int n, const int16_t *src2_1) { @@ -380,7 +383,7 @@ clamphigh_s16_sse (int16_t *dest, const int16_t *src1, int n, } OIL_DEFINE_IMPL_FULL (clamphigh_s16_sse, clamphigh_s16, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1) { __m128 xmm1; @@ -411,7 +414,7 @@ clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1) } OIL_DEFINE_IMPL_FULL (clamphigh_f32_sse, clamphigh_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1) { __m128d xmm1; diff --git a/liboil/sse/composite_sse.c b/liboil/sse/composite_sse.c index 307fd17..ce749cf 100644 --- a/liboil/sse/composite_sse.c +++ b/liboil/sse/composite_sse.c @@ -32,9 +32,11 @@ #include #include +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) + #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s)) -static void +SSE_FUNCTION static void composite_add_argb_sse (uint32_t *dest, const uint32_t *src, int n) { /* Initial operations to align the destination pointer */ @@ -67,7 +69,7 @@ composite_add_argb_sse (uint32_t *dest, const uint32_t *src, int n) OIL_DEFINE_IMPL_FULL (composite_add_argb_sse, composite_add_argb, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_add_argb_const_src_sse (uint32_t *dest, const uint32_t *src_1, int n) { __m128i s; @@ -103,7 +105,7 @@ composite_add_argb_const_src_sse (uint32_t *dest, const uint32_t *src_1, int n) OIL_DEFINE_IMPL_FULL (composite_add_argb_const_src_sse, composite_add_argb_const_src, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_add_u8_sse (uint8_t *dest, const uint8_t *src, int n) { /* Initial operations to align the destination pointer */ @@ -131,7 +133,7 @@ composite_add_u8_sse (uint8_t *dest, const uint8_t *src, int n) OIL_DEFINE_IMPL_FULL (composite_add_u8_sse, composite_add_u8, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_add_u8_const_src_sse (uint8_t *dest, const uint8_t *src_1, int n) { __m128i s; diff --git a/liboil/sse/composite_sse_2pix.c b/liboil/sse/composite_sse_2pix.c index 13f2cf4..2d19475 100644 --- a/liboil/sse/composite_sse_2pix.c +++ b/liboil/sse/composite_sse_2pix.c @@ -32,7 +32,7 @@ #include #include -#include "sse_wrapper.h" +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) /* non-SSE2 compositing support */ #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) @@ -63,7 +63,7 @@ static const struct _SSEData { /* Shuffles the given value such that the alpha for each pixel appears in each * channel of the pixel. */ -static inline __m128i +SSE_FUNCTION static inline __m128i argb_A_sse2(__m128i a) { a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3)); @@ -74,7 +74,7 @@ argb_A_sse2(__m128i a) /* Multiplies the pixel data in a channel-by-channel by b, and divides the * result by 255, with rounding. */ -static inline __m128i +SSE_FUNCTION static inline __m128i muldiv_255_sse2(__m128i a, __m128i b) { __m128i ret; @@ -88,14 +88,14 @@ muldiv_255_sse2(__m128i a, __m128i b) return ret; } -static inline __m128i +SSE_FUNCTION static inline __m128i negate_argb_sse2(__m128i a) { return _mm_xor_si128(a, MC(8x00ff)); } /* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */ -static inline __m128i +SSE_FUNCTION static inline __m128i load_argb_sse2(const uint32_t *src) { __m128i pix; @@ -105,7 +105,7 @@ load_argb_sse2(const uint32_t *src) return pix; } -static inline __m128i +SSE_FUNCTION static inline __m128i set1_argb_sse2(uint32_t src) { __m128i pix; @@ -115,32 +115,33 @@ set1_argb_sse2(uint32_t src) return pix; } -static inline __m128i +SSE_FUNCTION static inline __m128i load_u8_mask(const uint8_t *m) { return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1])); } -static inline __m128i +SSE_FUNCTION static inline __m128i set1_u8_mask(uint8_t m) { return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128()); } /* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */ -static void +SSE_FUNCTION static void store_argb_sse2(uint32_t *dest, __m128i pix) { pix = _mm_packus_epi16(pix, pix); _mm_storel_epi64((__m128i *)dest, pix); } -static __m128i over_argb_sse2(__m128i dest, __m128i src, __m128i srca) +SSE_FUNCTION static __m128i +over_argb_sse2(__m128i dest, __m128i src, __m128i srca) { return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca))); } -static void +SSE_FUNCTION static void composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -167,7 +168,7 @@ composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -196,7 +197,7 @@ OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix, composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2); #ifdef SSE_ALIGN -static void +SSE_FUNCTION static void composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -225,7 +226,7 @@ OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix, composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2); #endif -static void +SSE_FUNCTION static void composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n) { for (; n >= 2; n -= 2) { @@ -251,7 +252,7 @@ composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n) OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, int n) { @@ -281,7 +282,7 @@ composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix, composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -318,7 +319,7 @@ composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -357,7 +358,7 @@ composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix, composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -396,7 +397,7 @@ composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix, composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n) { /* Initial operations to align the destination pointer */ diff --git a/liboil/sse/composite_sse_4pix.c b/liboil/sse/composite_sse_4pix.c index 6e03ff9..12f4d2b 100644 --- a/liboil/sse/composite_sse_4pix.c +++ b/liboil/sse/composite_sse_4pix.c @@ -32,9 +32,7 @@ #include #include -#include "sse_wrapper.h" - - +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) #ifdef ENABLE_BROKEN_IMPLS @@ -67,7 +65,7 @@ static const struct _SSEData { /* Shuffles the given value such that the alpha for each pixel appears in each * channel of the pixel. */ -static inline __m128i +SSE_FUNCTION static inline __m128i argb_A_sse2(__m128i a) { #if 0 @@ -92,7 +90,7 @@ argb_A_sse2(__m128i a) /* Multiplies the unpacked 16-bits-per-channel pixel data in a * channel-by-channel by b, and divides the result by 255, with rounding. */ -static inline __m128i +SSE_FUNCTION static inline __m128i inner_muldiv_255_sse2(__m128i a, __m128i b) { __m128i ret; @@ -106,7 +104,7 @@ inner_muldiv_255_sse2(__m128i a, __m128i b) return ret; } -static inline __m128i +SSE_FUNCTION static inline __m128i muldiv_255_sse2(__m128i a, __m128i b) { __m128i alow, blow, ahigh, bhigh, low, high; @@ -120,25 +118,25 @@ muldiv_255_sse2(__m128i a, __m128i b) return _mm_packus_epi16(low, high); } -static inline __m128i +SSE_FUNCTION static inline __m128i negate_argb_sse2(__m128i a) { return _mm_xor_si128(a, MC(16xff)); } -static inline __m128i +SSE_FUNCTION static inline __m128i load_argb_sse2(const uint32_t *src) { return _mm_loadu_si128((__m128i *)src); } -static inline __m128i +SSE_FUNCTION static inline __m128i set1_argb_sse2(uint32_t src) { return _mm_set1_epi32(src); } -static inline __m128i +SSE_FUNCTION static inline __m128i load_u8_mask(const uint8_t *m) { __m128i a; @@ -148,24 +146,25 @@ load_u8_mask(const uint8_t *m) return a; } -static inline __m128i +SSE_FUNCTION static inline __m128i set1_u8_mask(uint8_t m) { return _mm_set1_epi8(m); } -static void +SSE_FUNCTION static void store_argb_sse2(uint32_t *dest, __m128i pix) { _mm_store_si128((__m128i *)dest, pix); } -static __m128i over_argb_sse2(__m128i dest, __m128i src, __m128i srca) +SSE_FUNCTION static __m128i +over_argb_sse2(__m128i dest, __m128i src, __m128i srca) { return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca))); } -static void +SSE_FUNCTION static void composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -202,7 +201,7 @@ composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask, OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -239,7 +238,7 @@ composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse, composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -276,7 +275,7 @@ composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse, composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n) { for (; ((long)dest & 15) && (n > 0); n--) { @@ -311,7 +310,7 @@ composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n) OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n) { __m128i s, sa; @@ -348,7 +347,7 @@ composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n) OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse, composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -401,7 +400,7 @@ composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -456,7 +455,7 @@ composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse, composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int n) { @@ -511,7 +510,7 @@ composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src, OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse, composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n) { /* Initial operations to align the destination pointer */ diff --git a/liboil/sse/copy_sse.c b/liboil/sse/copy_sse.c index b695bc4..5509eae 100644 --- a/liboil/sse/copy_sse.c +++ b/liboil/sse/copy_sse.c @@ -31,7 +31,9 @@ #include #include -static void +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) + +SSE_FUNCTION static void copy_u8_sse (uint8_t *dest, const uint8_t *src, int n) { for (; ((long)dest & 15) && (n > 0); n--) { @@ -48,7 +50,7 @@ copy_u8_sse (uint8_t *dest, const uint8_t *src, int n) } OIL_DEFINE_IMPL_FULL (copy_u8_sse, copy_u8, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void copy_u8_sse_unroll2 (uint8_t *dest, const uint8_t *src, int n) { for (; ((long)dest & 15) && (n > 0); n--) { diff --git a/liboil/sse/math_sse.c b/liboil/sse/math_sse.c index e5d238d..0b70b42 100644 --- a/liboil/sse/math_sse.c +++ b/liboil/sse/math_sse.c @@ -32,7 +32,9 @@ #include #include -static void +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) + +SSE_FUNCTION static void add_f32_sse (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -55,7 +57,7 @@ add_f32_sse (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void add_f64_sse2 (double *dest, double *src1, double *src2, int n) { __m128d xmm0, xmm1; @@ -80,7 +82,7 @@ add_f64_sse2 (double *dest, double *src1, double *src2, int n) } OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n) { __m128d xmm0, xmm1; @@ -120,7 +122,7 @@ add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n) } OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void subtract_f32_sse (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -143,7 +145,7 @@ subtract_f32_sse (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void multiply_f32_sse (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -166,7 +168,7 @@ multiply_f32_sse (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void divide_f32_sse (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -189,7 +191,7 @@ divide_f32_sse (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void minimum_f32_sse (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -216,7 +218,7 @@ minimum_f32_sse (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void maximum_f32_sse (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -243,7 +245,7 @@ maximum_f32_sse (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void inverse_f32_sse (float *dest, float *src1, int n) { /* Initial operations to align the destination pointer */ @@ -268,7 +270,7 @@ inverse_f32_sse (float *dest, float *src1, int n) } OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void negative_f32_sse (float *dest, float *src1, int n) { /* Initial operations to align the destination pointer */ @@ -290,7 +292,7 @@ negative_f32_sse (float *dest, float *src1, int n) } OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n) { __m128 xmm1; @@ -314,7 +316,7 @@ scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n) } OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n) { __m128 xmm1; @@ -338,7 +340,7 @@ scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n) } OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n) { __m128d xmm1; diff --git a/liboil/sse/math_sse_unroll2.c b/liboil/sse/math_sse_unroll2.c index 51dca09..cd4f55f 100644 --- a/liboil/sse/math_sse_unroll2.c +++ b/liboil/sse/math_sse_unroll2.c @@ -32,7 +32,9 @@ #include #include -static void +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) + +SSE_FUNCTION static void add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -59,7 +61,7 @@ add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (add_f32_sse_unroll2, add_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -86,7 +88,7 @@ subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (subtract_f32_sse_unroll2, subtract_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -113,7 +115,7 @@ multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (multiply_f32_sse_unroll2, multiply_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -140,7 +142,7 @@ divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (divide_f32_sse_unroll2, divide_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -171,7 +173,7 @@ minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (minimum_f32_sse_unroll2, minimum_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) { /* Initial operations to align the destination pointer */ @@ -202,7 +204,7 @@ maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n) } OIL_DEFINE_IMPL_FULL (maximum_f32_sse_unroll2, maximum_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void inverse_f32_sse_unroll2 (float *dest, float *src1, int n) { /* Initial operations to align the destination pointer */ @@ -231,7 +233,7 @@ inverse_f32_sse_unroll2 (float *dest, float *src1, int n) } OIL_DEFINE_IMPL_FULL (inverse_f32_sse_unroll2, inverse_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void negative_f32_sse_unroll2 (float *dest, float *src1, int n) { /* Initial operations to align the destination pointer */ @@ -257,7 +259,7 @@ negative_f32_sse_unroll2 (float *dest, float *src1, int n) } OIL_DEFINE_IMPL_FULL (negative_f32_sse_unroll2, negative_f32, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n) { __m128 xmm1; @@ -284,7 +286,7 @@ scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n) } OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n) { __m128 xmm1; @@ -311,7 +313,7 @@ scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n) } OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE); -static void +SSE_FUNCTION static void scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n) { __m128d xmm1; diff --git a/liboil/sse/multsum_sse.c b/liboil/sse/multsum_sse.c index 37238a5..382dd60 100644 --- a/liboil/sse/multsum_sse.c +++ b/liboil/sse/multsum_sse.c @@ -5,6 +5,8 @@ #include #include +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) + #define MULTSUM_SSE2_NSTRIDED(i) { \ t1 = _mm_load_pd(&OIL_GET(src1, i, double)); \ t2 = _mm_load_pd(&OIL_GET(src2, i, double)); \ @@ -29,7 +31,7 @@ #ifdef ENABLE_BROKEN_IMPLS -static void +SSE_FUNCTION static void multsum_f64_sse2_unroll4(double *dest, const double *src1, int sstr1, const double *src2, int sstr2, diff --git a/liboil/sse/sad8x8_sse.c b/liboil/sse/sad8x8_sse.c index 8795200..3c7615c 100644 --- a/liboil/sse/sad8x8_sse.c +++ b/liboil/sse/sad8x8_sse.c @@ -31,7 +31,7 @@ #include #include -#include "sse_wrapper.h" +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) #ifdef ENABLE_BROKEN_IMPLS union m128_int { @@ -40,7 +40,7 @@ union m128_int { uint16_t s[8]; }; -static void +SSE_FUNCTION static void sad8x8_u8_sse (uint32_t *dest, uint8_t *src1, int sstr1, uint8_t *src2, int sstr2) { diff --git a/liboil/sse/splat_sse.c b/liboil/sse/splat_sse.c index 14593a6..a6c0be7 100644 --- a/liboil/sse/splat_sse.c +++ b/liboil/sse/splat_sse.c @@ -31,7 +31,9 @@ #include #include -static void +#define SSE_FUNCTION __attribute__((force_align_arg_pointer)) + +SSE_FUNCTION static void splat_u32_ns_sse (uint32_t *dest, const uint32_t *param, int n) { __m128i v; @@ -51,7 +53,7 @@ splat_u32_ns_sse (uint32_t *dest, const uint32_t *param, int n) } OIL_DEFINE_IMPL_FULL (splat_u32_ns_sse, splat_u32_ns, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void splat_u32_ns_sse_unroll2 (uint32_t *dest, const uint32_t *param, int n) { __m128i v; @@ -76,7 +78,7 @@ splat_u32_ns_sse_unroll2 (uint32_t *dest, const uint32_t *param, int n) } OIL_DEFINE_IMPL_FULL (splat_u32_ns_sse_unroll2, splat_u32_ns, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void splat_u8_ns_sse (uint8_t *dest, const uint8_t *param, int n) { __m128i v; @@ -96,7 +98,7 @@ splat_u8_ns_sse (uint8_t *dest, const uint8_t *param, int n) } OIL_DEFINE_IMPL_FULL (splat_u8_ns_sse, splat_u8_ns, OIL_IMPL_FLAG_SSE2); -static void +SSE_FUNCTION static void splat_u8_ns_sse_unroll2 (uint8_t *dest, const uint8_t *param, int n) { __m128i v;