Index: programs/Xserver/fb/Imakefile
===================================================================
RCS file: /cvs/xorg/xc/programs/Xserver/fb/Imakefile,v
retrieving revision 1.7
diff -u -r1.7 Imakefile
--- programs/Xserver/fb/Imakefile	6 Aug 2004 23:42:10 -0000	1.7
+++ programs/Xserver/fb/Imakefile	24 Dec 2004 02:33:45 -0000
@@ -3,13 +3,21 @@
 XCOMM
 XCOMM Id: Imakefile,v 1.1 1999/11/02 03:54:44 keithp Exp $

-#if defined(i386Architecture) && defined(HasGcc34) && HasGcc34
+#if defined(HasGcc34) && HasGcc34
 MMXOPTIONS= -mmmx -Winline --param inline-unit-growth=10000 \
-            --param large-function-growth=10000 -DUSE_GCC34_MMX
+            --param large-function-growth=10000 -DUSE_MMX
+SSEOPTIONS= $(MMXOPTIONS) -msse -DUSE_SSE

+#if defined(i386Architecture)
 SpecialCObjectRule(fbmmx,fbmmx.c,$(MMXOPTIONS))
+#elif defined(AMD64Architecture)
+SpecialCObjectRule(fbmmx,fbmmx.c,$(SSEOPTIONS))
+#endif
+
+#if defined(i386Architecture) || defined(AMD64Architecture)
 SpecialCObjectRule(fbpict,fbpict.c,$(MMXOPTIONS))
 SpecialCObjectRule(fbfill,fbfill.c,$(MMXOPTIONS))
+#endif

 #endif

Index: programs/Xserver/fb/fbfill.c
===================================================================
RCS file: /cvs/xorg/xc/programs/Xserver/fb/fbfill.c,v
retrieving revision 1.4
diff -u -r1.4 fbfill.c
--- programs/Xserver/fb/fbfill.c	4 Dec 2004 00:42:50 -0000	1.4
+++ programs/Xserver/fb/fbfill.c	24 Dec 2004 02:33:45 -0000
@@ -44,7 +44,7 @@
     switch (pGC->fillStyle) {
     case FillSolid:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
 	if (!pPriv->and && fbHaveMMX())
 	    if (fbSolidFillmmx (pDrawable, x, y, width, height, pPriv->xor))
 		return;
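The fbmmx.c changes that follow are mostly mechanical: each gcc-specific
__builtin_ia32_* call and mode(V8QI)-style vector typedef becomes the
equivalent __m64 intrinsic from <mmintrin.h>, which gcc accepts on both
i386 and AMD64. A minimal standalone sketch of the pattern (not part of
the patch; file name and test values are invented):

/* mmx-demo.c -- build with e.g. `gcc -mmmx mmx-demo.c`; any gcc >= 3.4
 * or other compiler shipping mmintrin.h should do. */
#include <stdio.h>
#include <mmintrin.h>

int
main (void)
{
    /* Old style, gcc/i386 only:
     *     typedef int Vector8x8 __attribute__ ((mode(V8QI)));
     *     r = __builtin_ia32_paddusb (a, b);
     * New style, portable across i386 and AMD64:                      */
    __m64 a = _mm_cvtsi32_si64 (0x60606060);   /* four 0x60 bytes      */
    __m64 b = _mm_cvtsi32_si64 (0xc0c0c0c0);   /* four 0xc0 bytes      */
    unsigned int r = _mm_cvtsi64_si32 (_mm_adds_pu8 (a, b));

    _mm_empty ();                  /* leave MMX state, like _mm_empty()
                                    * replacing emms() in the patch    */
    printf ("%08x\n", r);          /* ffffffff: 0x60 + 0xc0 saturates  */
    return 0;
}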
Index: programs/Xserver/fb/fbmmx.c
===================================================================
RCS file: /cvs/xorg/xc/programs/Xserver/fb/fbmmx.c,v
retrieving revision 1.2
diff -u -r1.2 fbmmx.c
--- programs/Xserver/fb/fbmmx.c	4 Dec 2004 00:42:50 -0000	1.2
+++ programs/Xserver/fb/fbmmx.c	24 Dec 2004 02:33:45 -0000
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2004 Red Hat, Inc.
+ * Copyright © 2004 Nicholas Miell
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -19,13 +20,20 @@
  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  *
  * Author: Søren Sandmann (sandmann@redhat.com)
- *
+ * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
+ *
  * Based on work by Owen Taylor
  */
+
+#ifdef USE_MMX
+
 #include "fb.h"
+#include "fbmmx.h"
+
+#include <mmintrin.h>
+#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */

-#ifdef USE_GCC34_MMX
 #ifdef RENDER

@@ -33,11 +41,6 @@
 #include "mipict.h"
 #include "fbpict.h"

-typedef int Vector1x64 __attribute__ ((mode(DI)));
-typedef int Vector2x32 __attribute__ ((mode(V2SI)));
-typedef int Vector4x16 __attribute__ ((mode(V4HI)));
-typedef int Vector8x8 __attribute__ ((mode(V8QI)));
-
 typedef unsigned long long ullong;

 #define noVERBOSE
@@ -50,7 +53,6 @@

 typedef struct
 {
-    ullong mmx_zero;
     ullong mmx_4x00ff;
     ullong mmx_4x0080;
     ullong mmx_565_rgb;
@@ -70,7 +72,6 @@

 static const MMXData c =
 {
-    .mmx_zero =			0x0000000000000000ULL,
     .mmx_4x00ff =		0x00ff00ff00ff00ffULL,
     .mmx_4x0080 =		0x0080008000800080ULL,
     .mmx_565_rgb =		0x000001f0003f001fULL,
@@ -88,121 +89,112 @@
     .mmx_000000000000ffff =	0x000000000000ffffULL,
 };

-static __inline__ Vector1x64
-shift (Vector1x64 v, int s)
+#define MC(x) ((__m64) c.mmx_##x)
+
+static __inline__ __m64
+shift (__m64 v, int s)
 {
-    if (s > 0)
-	return __builtin_ia32_psllq (v, s);
-    else if (s < 0)
-	return __builtin_ia32_psrlq (v, -s);
-    else
-	return v;
+    if (s > 0)
+	return _mm_slli_si64 (v, s);
+    else if (s < 0)
+	return _mm_srli_si64 (v, -s);
+    else
+	return v;
 }

-static __inline__ Vector4x16
-negate (Vector4x16 mask)
+static __inline__ __m64
+negate (__m64 mask)
 {
-    return (Vector4x16)__builtin_ia32_pxor (
-	(Vector1x64)mask,
-	(Vector1x64)c.mmx_4x00ff);
+    return _mm_xor_si64 (mask, MC(4x00ff));
 }

-static __inline__ Vector4x16
-pix_multiply (Vector4x16 a, Vector4x16 b)
+static __inline__ __m64
+pix_multiply (__m64 a, __m64 b)
 {
-    Vector4x16 res;
+    __m64 res;

-    res = __builtin_ia32_pmullw (a, b);
-    res = __builtin_ia32_paddw (res, (Vector4x16)c.mmx_4x0080);
-    res = __builtin_ia32_psrlw (res, 8);
+    res = _mm_mullo_pi16 (a, b);
+    res = _mm_add_pi16 (res, MC(4x0080));
+    res = _mm_srli_pi16 (res, 8);

     return res;
 }

-#if 0
+#ifdef USE_SSE
 #define HAVE_PSHUFW
 #endif

 #ifdef HAVE_PSHUFW

-static __inline__ Vector4x16
-expand_alpha (Vector4x16 pixel)
+static __inline__ __m64
+expand_alpha (__m64 pixel)
 {
-    Vector4x16 result;
-    __asm__ ("pshufw $0xFF, %1, %0\n\t" : "=y" (result) : "y" (pixel));
-    return result;
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
 }

-static __inline__ Vector4x16
-expand_alpha_rev (Vector4x16 pixel)
+static __inline__ __m64
+expand_alpha_rev (__m64 pixel)
 {
-    Vector4x16 result;
-    __asm__ ("pshufw $0x00, %1, %0\n\t" : "=y" (result) : "y" (pixel));
-    return result;
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
 }

-static __inline__ Vector4x16
-invert_colors (Vector4x16 pixel)
+static __inline__ __m64
+invert_colors (__m64 pixel)
 {
-    Vector4x16 result;
-
-    /* 0xC6 = 11000110 */
-    /*         3 0 1 2 */
-
-    __asm__ ("pshufw $0xC6, %1, %0\n\t" : "=y" (result) : "y" (pixel));
-
-    return result;
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
 }

 #else

-static __inline__ Vector4x16
-expand_alpha (Vector4x16 pixel)
+static __inline__ __m64
+expand_alpha (__m64 pixel)
 {
-    Vector1x64 t1, t2;
+    __m64 t1, t2;

-    t1 = shift ((Vector1x64)pixel, -48);
+    t1 = shift (pixel, -48);
     t2 = shift (t1, 16);
-    t1 = __builtin_ia32_por (t1, t2);
+    t1 = _mm_or_si64 (t1, t2);
     t2 = shift (t1, 32);
-    t1 = __builtin_ia32_por (t1, t2);
+    t1 = _mm_or_si64 (t1, t2);

-    return (Vector4x16)t1;
+    return t1;
 }

-static __inline__ Vector4x16
-expand_alpha_rev (Vector4x16 pixel)
+static __inline__ __m64
+expand_alpha_rev (__m64 pixel)
 {
-    Vector1x64 t1, t2;
+    __m64 t1, t2;

-    t1 = shift ((Vector1x64)pixel, 48);
+    /* move alpha to low 16 bits and zero the rest */
+    t1 = shift (pixel,  48);
     t1 = shift (t1, -48);
+
     t2 = shift (t1, 16);
-    t1 = __builtin_ia32_por (t1, t2);
+    t1 = _mm_or_si64 (t1, t2);
     t2 = shift (t1, 32);
-    t1 = __builtin_ia32_por (t1, t2);
+    t1 = _mm_or_si64 (t1, t2);

-    return (Vector4x16)t1;
+    return t1;
 }

-static __inline__ Vector4x16
-invert_colors (Vector4x16 pixel)
+static __inline__ __m64
+invert_colors (__m64 pixel)
 {
-    Vector1x64 x, y, z;
+    __m64 x, y, z;

-    x = y = z = (Vector1x64)pixel;
+    x = y = z = pixel;

-    x = __builtin_ia32_pand (x, (Vector1x64)c.mmx_ffff0000ffff0000);
-    y = __builtin_ia32_pand (y, (Vector1x64)c.mmx_000000000000ffff);
-    z = __builtin_ia32_pand (z, (Vector1x64)c.mmx_0000ffff00000000);
+    x = _mm_and_si64 (x, MC(ffff0000ffff0000));
+    y = _mm_and_si64 (y, MC(000000000000ffff));
+    z = _mm_and_si64 (z, MC(0000ffff00000000));

     y = shift (y, 32);
     z = shift (z, -32);

-    x = __builtin_ia32_por (x, y);
-    x = __builtin_ia32_por (x, z);
+    x = _mm_or_si64 (x, y);
+    x = _mm_or_si64 (x, z);

-    return (Vector4x16)x;
+    return x;
 }

 #endif
@@ -210,74 +202,69 @@
 /* Notes about writing mmx code
  *
  * give memory operands as the second operand. If you give it as the
- * first, gcc will first load it into a register, then use that register
+ * first, gcc will first load it into a register, then use that
+ * register
  *
  * ie. use
  *
- *     __builtin_pmullw (x, mmx_constant[8]);
+ *     _mm_mullo_pi16 (x, mmx_constant);
  *
  * not
  *
- *     __builtin_pmullw (mmx_constant[8], x);
+ *     _mm_mullo_pi16 (mmx_constant, x);
  *
- * Also try to minimize dependencies. Ie. when you need a value, try to calculate
- * it from a value that was calculated as early as possible.
+ * Also try to minimize dependencies. i.e. when you need a value, try
+ * to calculate it from a value that was calculated as early as
+ * possible.
  */

-static __inline__ Vector4x16
-over (Vector4x16 src, Vector4x16 srca, Vector4x16 dest)
+static __inline__ __m64
+over (__m64 src, __m64 srca, __m64 dest)
 {
-    return (Vector4x16)__builtin_ia32_paddusb ((Vector8x8)src, (Vector8x8)pix_multiply(dest, negate(srca)));
+    return  _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
 }

-static __inline__ Vector4x16
-over_rev_non_pre (Vector4x16 src, Vector4x16 dest)
+static __inline__ __m64
+over_rev_non_pre (__m64 src, __m64 dest)
 {
-    Vector4x16 srca = expand_alpha (src);
-    Vector4x16 srcfaaa = (Vector4x16)__builtin_ia32_por((Vector1x64)srca, (Vector1x64)c.mmx_full_alpha);
+    __m64 srca = expand_alpha (src);
+    __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));

     return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
 }

-static __inline__ Vector4x16
-in (Vector4x16 src,
-    Vector4x16 mask)
+static __inline__ __m64
+in (__m64 src,
+    __m64 mask)
 {
     return pix_multiply (src, mask);
 }

-static __inline__ Vector4x16
-in_over (Vector4x16 src,
-	 Vector4x16 srca,
-	 Vector4x16 mask,
-	 Vector4x16 dest)
+static __inline__ __m64
+in_over (__m64 src,
+	 __m64 srca,
+	 __m64 mask,
+	 __m64 dest)
 {
     return over(in(src, mask), pix_multiply(srca, mask), dest);
 }

-static __inline__ Vector8x8
-cvt32to64 (CARD32 v)
-{
-    ullong r = v;
-    return (Vector8x8)r;
-}
-
-static __inline__ Vector4x16
+static __inline__ __m64
 load8888 (CARD32 v)
 {
-    return (Vector4x16)__builtin_ia32_punpcklbw (cvt32to64 (v),
-						 (Vector8x8)c.mmx_zero);
+    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
 }

-static __inline__ Vector8x8
-pack8888 (Vector4x16 lo, Vector4x16 hi)
+static __inline__ __m64
+pack8888 (__m64 lo, __m64 hi)
 {
-    Vector8x8 r;
-    r = __builtin_ia32_packuswb ((Vector4x16)lo, (Vector4x16)hi);
+    __m64 r;
+    r = _mm_packs_pu16 (lo, hi);
     return r;
 }

-/* Expand 16 bits positioned at @pos (0-3) of a mmx register into 00RR00GG00BB
+/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
+   00RR00GG00BB

 --- Expanding 565 in the low word ---

@@ -286,71 +273,65 @@
 m = m * (008404100840);
 m = m >> 8;

-Note the trick here - the top word is shifted by another nibble to avoid
-it bumping into the middle word
+Note the trick here - the top word is shifted by another nibble to
+avoid it bumping into the middle word
 */

-static __inline__ Vector4x16
-expand565 (Vector4x16 pixel, int pos)
+static __inline__ __m64
+expand565 (__m64 pixel, int pos)
 {
-    Vector1x64 p = (Vector1x64)pixel;
+    __m64 p = pixel;

     /* move pixel to low 16 bit and zero the rest */
     p = shift (shift (p, (3 - pos) * 16), -48);

-    Vector1x64 t1 = shift (p, 36 - 11);
-    Vector1x64 t2 = shift (p, 16 - 5);
+    __m64 t1 = shift (p, 36 - 11);
+    __m64 t2 = shift (p, 16 - 5);

-    p = __builtin_ia32_por (t1, p);
-    p = __builtin_ia32_por (t2, p);
-    p = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_rgb);
+    p = _mm_or_si64 (t1, p);
+    p = _mm_or_si64 (t2, p);
+    p = _mm_and_si64 (p, MC(565_rgb));

-    pixel = __builtin_ia32_pmullw ((Vector4x16)p, (Vector4x16)c.mmx_565_unpack_multiplier);
-    return __builtin_ia32_psrlw (pixel, 8);
+    pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
+    return _mm_srli_pi16 (pixel, 8);
 }

-static __inline__ Vector4x16
-expand8888 (Vector4x16 in, int pos)
+static __inline__ __m64
+expand8888 (__m64 in, int pos)
 {
     if (pos == 0)
-	return (Vector4x16)__builtin_ia32_punpcklbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
+	return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
     else
-	return (Vector4x16)__builtin_ia32_punpckhbw ((Vector8x8)in, (Vector8x8)c.mmx_zero);
+	return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
 }
-static __inline__ Vector4x16
-pack565 (Vector4x16 pixel, Vector4x16 target, int pos)
+static __inline__ __m64
+pack565 (__m64 pixel, __m64 target, int pos)
 {
-    Vector1x64 p = (Vector1x64)pixel;
-    Vector1x64 t = (Vector1x64)target;
-    Vector1x64 r, g, b;
+    __m64 p = pixel;
+    __m64 t = target;
+    __m64 r, g, b;

-    r = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_r);
-    g = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_g);
-    b = __builtin_ia32_pand (p, (Vector1x64)c.mmx_565_b);
+    r = _mm_and_si64 (p, MC(565_r));
+    g = _mm_and_si64 (p, MC(565_g));
+    b = _mm_and_si64 (p, MC(565_b));

     r = shift (r, - (32 - 8) + pos * 16);
     g = shift (g, - (16 - 3) + pos * 16);
     b = shift (b, - (0 + 3) + pos * 16);

     if (pos == 0)
-	t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_0);
+	t = _mm_and_si64 (t, MC(mask_0));
     else if (pos == 1)
-	t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_1);
+	t = _mm_and_si64 (t, MC(mask_1));
     else if (pos == 2)
-	t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_2);
+	t = _mm_and_si64 (t, MC(mask_2));
     else if (pos == 3)
-	t = __builtin_ia32_pand (t, (Vector1x64)c.mmx_mask_3);
+	t = _mm_and_si64 (t, MC(mask_3));

-    p = __builtin_ia32_por (r, t);
-    p = __builtin_ia32_por (g, p);
+    p = _mm_or_si64 (r, t);
+    p = _mm_or_si64 (g, p);

-    return (Vector4x16)__builtin_ia32_por (b, p);
-}
-
-static __inline__ void
-emms (void)
-{
-    __asm__ __volatile__ ("emms");
+    return _mm_or_si64 (b, p);
 }

 void
@@ -371,7 +352,7 @@
     CARD32	*dstLine, *dst;
     CARD16	w;
     FbStride	dstStride;
-    Vector4x16	vsrc, vsrca;
+    __m64	vsrc, vsrca;

     CHECKPOINT();

@@ -395,7 +376,7 @@

	while (w && (unsigned long)dst & 7)
	{
-	    *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
+	    *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64());

	    w--;
	    dst++;
@@ -403,15 +384,15 @@

	while (w >= 2)
	{
-	    Vector4x16 vdest;
-	    Vector4x16 dest0, dest1;
+	    __m64 vdest;
+	    __m64 dest0, dest1;

-	    vdest = *(Vector4x16 *)dst;
+	    vdest = *(__m64 *)dst;

	    dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
	    dest1 = over(vsrc, vsrca, expand8888(vdest, 1));

-	    *(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
+	    *(__m64 *)dst = pack8888(dest0, dest1);

	    dst += 2;
	    w -= 2;
@@ -421,14 +402,14 @@

	while (w)
	{
-	    *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), (Vector4x16)c.mmx_zero);
+	    *dst = (ullong) pack8888(over(vsrc, vsrca, load8888(*dst)), _mm_setzero_si64());

	    w--;
	    dst++;
	}
     }

-    emms();
+    _mm_empty();
 }

 void
@@ -449,7 +430,7 @@
     CARD16	*dstLine, *dst;
     CARD16	w;
     FbStride	dstStride;
-    Vector4x16	vsrc, vsrca;
+    __m64	vsrc, vsrca;

     CHECKPOINT();

@@ -474,7 +455,7 @@
	while (w && (unsigned long)dst & 7)
	{
	    ullong d = *dst;
-	    Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+	    __m64 vdest = expand565 ((__m64)d, 0);
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
	    *dst = (ullong)vdest;

@@ -484,16 +465,16 @@

	while (w >= 4)
	{
-	    Vector4x16 vdest;
+	    __m64 vdest;

-	    vdest = *(Vector4x16 *)dst;
+	    vdest = *(__m64 *)dst;

	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);

-	    *(Vector8x8 *)dst = (Vector8x8)vdest;
+	    *(__m64 *)dst = vdest;

	    dst += 4;
	    w -= 4;
@@ -504,7 +485,7 @@
	while (w)
	{
	    ullong d = *dst;
-	    Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+	    __m64 vdest = expand565 ((__m64)d, 0);
	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
	    *dst = (ullong)vdest;
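For reference, the expand565() used above implements the multiply trick
described in the comment block earlier in the file. A standalone scalar
model (illustration only, not patch code; the helper name and the test
value are invented) that can be checked against the constants
mmx_565_rgb and mmx_565_unpack_multiplier:

#include <stdio.h>
#include <stdint.h>

/* Scalar model of expand565(): make three staggered copies of the 16-bit
 * pixel, mask so r/g/b land in separate 16-bit lanes, then one multiply
 * per lane performs the classic (x << m) | (x >> n) bit replication and
 * >> 8 aligns the result as 00RR00GG00BB. Every product fits in 16 bits,
 * which is why a single _mm_mullo_pi16 works in the SIMD version. */
static uint64_t
expand565_scalar (uint16_t pix)
{
    uint64_t p = pix;

    p = p | (p << 11) | (p << 25);  /* red lands a nibble high in lane 2 */
    p &= 0x000001f0003f001fULL;     /* == c.mmx_565_rgb                  */

    uint64_t r = ((p >> 32) & 0xffff) * 0x0084; /* (r<<4)*0x84 = r<<11|r<<6 */
    uint64_t g = ((p >> 16) & 0xffff) * 0x0410; /*  g * 0x410 = g<<10|g<<4  */
    uint64_t b = ( p        & 0xffff) * 0x0840; /*  b * 0x840 = b<<11|b<<6  */

    return ((r >> 8) << 32) | ((g >> 8) << 16) | (b >> 8);
}

int
main (void)
{
    /* 0xf800 is pure red in 565; expands to 0x000000ff00000000 */
    printf ("%016llx\n", (unsigned long long) expand565_scalar (0xf800));
    return 0;
}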
@@ -513,7 +494,7 @@
	}
     }

-    emms();
+    _mm_empty();
 }

 void
@@ -534,7 +515,7 @@
     CARD32	*dstLine;
     CARD32	*maskLine;
     FbStride	dstStride, maskStride;
-    Vector4x16	vsrc, vsrca;
+    __m64	vsrc, vsrca;

     CHECKPOINT();

@@ -562,9 +543,9 @@

	    if (m)
	    {
-		Vector4x16 vdest = load8888(*q);
+		__m64 vdest = load8888(*q);
		vdest = in_over(vsrc, vsrca, load8888(m), vdest);
-		*q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+		*q = (ullong)pack8888(vdest, _mm_setzero_si64());
	    }

	    twidth--;
@@ -580,15 +561,15 @@

	    if (m0 | m1)
	    {
-		Vector4x16 dest0, dest1;
-		Vector4x16 vdest = *(Vector4x16 *)q;
+		__m64 dest0, dest1;
+		__m64 vdest = *(__m64 *)q;

		dest0 = in_over(vsrc, vsrca, load8888(m0),
				expand8888 (vdest, 0));
		dest1 = in_over(vsrc, vsrca, load8888(m1),
				expand8888 (vdest, 1));

-		*(Vector8x8 *)q = (Vector8x8)pack8888(dest0, dest1);
+		*(__m64 *)q = pack8888(dest0, dest1);
	    }

	    p += 2;
@@ -602,9 +583,9 @@

	    if (m)
	    {
-		Vector4x16 vdest = load8888(*q);
+		__m64 vdest = load8888(*q);
		vdest = in_over(vsrc, vsrca, load8888(m), vdest);
-		*q = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+		*q = (ullong)pack8888(vdest, _mm_setzero_si64());
	    }

	    twidth--;
@@ -616,7 +597,7 @@
	maskLine += maskStride;
     }

-    emms();
+    _mm_empty();
 }

 void
@@ -638,7 +619,7 @@
     CARD8	*maskLine, *mask;
     FbStride	dstStride, maskStride;
     CARD16	w;
-    Vector4x16	vsrc, vsrca;
+    __m64	vsrc, vsrca;
     ullong	srcsrc;

     CHECKPOINT();

@@ -673,8 +654,8 @@

	    if (m)
	    {
-		Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), load8888(*dst));
-		*dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
+		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
	    }

	    w--;
@@ -696,15 +677,15 @@
	    }
	    else if (m0 | m1)
	    {
-		Vector4x16 vdest;
-		Vector4x16 dest0, dest1;
+		__m64 vdest;
+		__m64 dest0, dest1;

-		vdest = *(Vector4x16 *)dst;
+		vdest = *(__m64 *)dst;

-		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m0), expand8888(vdest, 0));
-		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m1), expand8888(vdest, 1));
+		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
+		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));

-		*(Vector8x8 *)dst = (Vector8x8)pack8888(dest0, dest1);
+		*(__m64 *)dst = pack8888(dest0, dest1);
	    }

	    mask += 2;
@@ -720,9 +701,9 @@

	    if (m)
	    {
-		Vector4x16 vdest = load8888(*dst);
-		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), vdest);
-		*dst = (ullong)pack8888(vdest, (Vector4x16)c.mmx_zero);
+		__m64 vdest = load8888(*dst);
+		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
+		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
	    }

	    w--;
@@ -731,7 +712,7 @@
	}
     }

-    emms();
+    _mm_empty();
 }


 void
@@ -754,7 +735,7 @@
     CARD8	*maskLine, *mask;
     FbStride	dstStride, maskStride;
     CARD16	w;
-    Vector4x16	vsrc, vsrca;
+    __m64	vsrc, vsrca;
     unsigned long long	srcsrcsrcsrc, src16;

     CHECKPOINT();

@@ -771,7 +752,7 @@
     vsrc = load8888 (src);
     vsrca = expand_alpha (vsrc);

-    src16 = (ullong)pack565(vsrc, (Vector4x16)c.mmx_zero, 0);
+    src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);

     srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
	(ullong)src16 << 16 | (ullong)src16;
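The hunks that follow feed an 8-bit mask value straight into
expand_alpha_rev() as an __m64. A standalone check of that broadcast
(illustration only, not patch code; it re-derives the non-pshufw version
of the helper so the snippet is self-contained):

#include <stdio.h>
#include <mmintrin.h>

/* Same algorithm as the non-USE_SSE expand_alpha_rev() above: isolate
 * the low 16 bits, then replicate them into all four lanes. */
static __m64
expand_alpha_rev_demo (__m64 pixel)
{
    __m64 t1, t2;
    t1 = _mm_srli_si64 (_mm_slli_si64 (pixel, 48), 48);
    t2 = _mm_slli_si64 (t1, 16);
    t1 = _mm_or_si64 (t1, t2);
    t2 = _mm_slli_si64 (t1, 32);
    return _mm_or_si64 (t1, t2);
}

int
main (void)
{
    /* a CARD8 mask value 0x8f placed in an __m64 ends up replicated
     * across all four 16-bit lanes, ready for pix_multiply() */
    __m64 v = expand_alpha_rev_demo (_mm_cvtsi32_si64 (0x8f));
    unsigned int lo = (unsigned int) _mm_cvtsi64_si32 (v);
    unsigned int hi = (unsigned int) _mm_cvtsi64_si32 (_mm_srli_si64 (v, 32));

    _mm_empty ();
    printf ("%08x%08x\n", hi, lo);   /* prints 008f008f008f008f */
    return 0;
}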
@@ -793,9 +774,9 @@
	    if (m)
	    {
		ullong d = *dst;
-		Vector4x16 vd = (Vector4x16)d;
-		Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
-		*dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
+		__m64 vd = (__m64)d;
+		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
+		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
	    }

	    w--;
@@ -819,21 +800,21 @@
	    }
	    else if (m0 | m1 | m2 | m3)
	    {
-		Vector4x16 vdest;
-		Vector4x16 vm0, vm1, vm2, vm3;
+		__m64 vdest;
+		__m64 vm0, vm1, vm2, vm3;

-		vdest = *(Vector4x16 *)dst;
+		vdest = *(__m64 *)dst;

-		vm0 = (Vector4x16)m0;
+		vm0 = (__m64)m0;
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
-		vm1 = (Vector4x16)m1;
+		vm1 = (__m64)m1;
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
-		vm2 = (Vector4x16)m2;
+		vm2 = (__m64)m2;
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
-		vm3 = (Vector4x16)m3;
+		vm3 = (__m64)m3;
		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);

-		*(Vector4x16 *)dst = vdest;
+		*(__m64 *)dst = vdest;
	    }

	    w -= 4;
@@ -850,9 +831,9 @@
	    if (m)
	    {
		ullong d = *dst;
-		Vector4x16 vd = (Vector4x16)d;
-		Vector4x16 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((Vector4x16)m), expand565(vd, 0));
-		*dst = (ullong)pack565(vdest, (Vector4x16)c.mmx_zero, 0);
+		__m64 vd = (__m64)d;
+		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
+		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
	    }

	    w--;
@@ -861,7 +842,7 @@
	}
     }

-    emms();
+    _mm_empty();
 }

 void
@@ -902,9 +883,9 @@

	while (w && (unsigned long)dst & 7)
	{
-	    Vector4x16 vsrc = load8888 (*src);
+	    __m64 vsrc = load8888 (*src);
	    ullong d = *dst;
-	    Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+	    __m64 vdest = expand565 ((__m64)d, 0);

	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

@@ -934,24 +915,24 @@

	    if ((a0 & a1 & a2 & a3) == 0xFF)
	    {
-		Vector4x16 vdest;
-		vdest = pack565(invert_colors(load8888(s0)), (Vector4x16)c.mmx_zero, 0);
+		__m64 vdest;
+		vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);

-		*(Vector4x16 *)dst = vdest;
+		*(__m64 *)dst = vdest;
	    }
	    else if (a0 | a1 | a2 | a3)
	    {
-		Vector4x16 vdest = *(Vector4x16 *)dst;
+		__m64 vdest = *(__m64 *)dst;

		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
		vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);

-		*(Vector4x16 *)dst = vdest;
+		*(__m64 *)dst = vdest;
	    }

	    w -= 4;
@@ -963,9 +944,9 @@

	while (w)
	{
-	    Vector4x16 vsrc = load8888 (*src);
+	    __m64 vsrc = load8888 (*src);
	    ullong d = *dst;
-	    Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+	    __m64 vdest = expand565 ((__m64)d, 0);

	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

@@ -977,7 +958,7 @@
	}
     }

-    emms();
+    _mm_empty();
 }

 /* "888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */

 void
@@ -1018,10 +999,10 @@

	while (w && (unsigned long)dst & 7)
	{
-	    Vector4x16 s = load8888 (*src);
-	    Vector4x16 d = load8888 (*dst);
+	    __m64 s = load8888 (*src);
+	    __m64 d = load8888 (*dst);

-	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
+	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());

	    w--;
	    dst++;
@@ -1032,7 +1013,7 @@
	{
	    ullong s0, s1;
	    unsigned char a0, a1;
-	    Vector4x16 d0, d1;
+	    __m64 d0, d1;

	    s0 = *src;
	    s1 = *(src + 1);
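The per-channel math in over_rev_non_pre(), used throughout these
"RevNP" (non-premultiplied) paths, is easier to read in scalar form. A
rough scalar model of one color channel (illustration only; the SIMD
version additionally swaps R and B via invert_colors() for these BGR
formats, which is omitted here):

#include <stdio.h>

/* The source is NOT premultiplied, so multiply it by its own alpha
 * first, using the same (x*a + 0x80) >> 8 approximation of x*a/255
 * that pix_multiply() uses, then blend with the usual OVER operator;
 * the final add saturates like _mm_adds_pu8. negate() is an XOR with
 * 0x00ff per lane, i.e. 255 - alpha. */
static unsigned
over_rev_non_pre_channel (unsigned s, unsigned sa, unsigned d)
{
    unsigned premult = (s * sa + 0x80) >> 8;
    unsigned res = premult + ((d * (255 - sa) + 0x80) >> 8);
    return res > 255 ? 255 : res;
}

int
main (void)
{
    /* half-transparent white over black comes out near mid grey */
    printf ("%u\n", over_rev_non_pre_channel (0xff, 0x80, 0x00));
    return 0;
}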
@@ -1045,16 +1026,16 @@
		d0 = invert_colors(load8888(s0));
		d1 = invert_colors(load8888(s1));

-		*(Vector8x8 *)dst = pack8888 (d0, d1);
+		*(__m64 *)dst = pack8888 (d0, d1);
	    }
	    else if (a0 | a1)
	    {
-		Vector4x16 vdest = *(Vector4x16 *)dst;
+		__m64 vdest = *(__m64 *)dst;

		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));

-		*(Vector8x8 *)dst = pack8888 (d0, d1);
+		*(__m64 *)dst = pack8888 (d0, d1);
	    }

	    w -= 2;
@@ -1064,10 +1045,10 @@

	while (w)
	{
-	    Vector4x16 s = load8888 (*src);
-	    Vector4x16 d = load8888 (*dst);
+	    __m64 s = load8888 (*src);
+	    __m64 d = load8888 (*dst);

-	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), (Vector4x16)c.mmx_zero);
+	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());

	    w--;
	    dst++;
@@ -1075,7 +1056,7 @@
	}
     }

-    emms();
+    _mm_empty();
 }

 void
@@ -1096,7 +1077,7 @@
     CARD16	*dstLine;
     CARD32	*maskLine;
     FbStride	dstStride, maskStride;
-    Vector4x16	vsrc, vsrca;
+    __m64	vsrc, vsrca;

     CHECKPOINT();

@@ -1125,7 +1106,7 @@
	    if (m)
	    {
		ullong d = *q;
-		Vector4x16 vdest = expand565 ((Vector4x16)d, 0);
+		__m64 vdest = expand565 ((__m64)d, 0);
		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
		*q = (ullong)vdest;
	    }
@@ -1146,14 +1127,14 @@

	    if ((m0 | m1 | m2 | m3))
	    {
-		Vector4x16 vdest = *(Vector4x16 *)q;
+		__m64 vdest = *(__m64 *)q;

		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);

-		*(Vector4x16 *)q = vdest;
+		*(__m64 *)q = vdest;
	    }
	    twidth -= 4;
	    p += 4;
@@ -1168,7 +1149,7 @@
	    if (m)
	    {
		ullong d = *q;
-		Vector4x16 vdest = expand565((Vector4x16)d, 0);
+		__m64 vdest = expand565((__m64)d, 0);
		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
		*q = (ullong)vdest;
	    }
@@ -1182,7 +1163,7 @@
	dstLine += dstStride;
     }

-    emms ();
+    _mm_empty ();
 }

 void
@@ -1234,13 +1215,7 @@

	while (w >= 8)
	{
-	    __asm__ __volatile__ (
-		"movq	(%0),	%%mm2\n\t"
-		"movq	(%1),	%%mm3\n\t"
-		"paddusb	%%mm2,	%%mm3\n\t"
-		"movq	%%mm3,	(%1)\n\t"
-		: /* no output */ : "r" (src), "r" (dst));
-
+	    *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
	    dst += 8;
	    src += 8;
	    w -= 8;
@@ -1260,7 +1235,7 @@
	}
     }

-    emms();
+    _mm_empty();
 }

 void
@@ -1297,13 +1272,8 @@

	while (w && (unsigned long)dst & 7)
	{
-	    __asm__ __volatile__ (
-		"movd	%0,	%%mm2\n\t"
-		"movd	%1,	%%mm3\n\t"
-		"paddusb	%%mm2,	%%mm3\n\t"
-		"movd	%%mm3,	%1\n\t"
-		: /* no output */ : "m" (*src), "m" (*dst));
-
+	    *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
+						 _mm_cvtsi32_si64(*dst)));
	    dst++;
	    src++;
	    w--;
@@ -1311,13 +1281,7 @@

	while (w >= 2)
	{
-	    __asm__ __volatile__ (
-		"movq	(%0),	%%mm2\n\t"
-		"movq	(%1),	%%mm3\n\t"
-		"paddusb	%%mm2,	%%mm3\n\t"
-		"movq	%%mm3,	(%1)\n\t"
-		: /* no output */ : "r" (src), "r" (dst));
-
+	    *(ullong*)dst = (ullong) _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
	    dst += 2;
	    src += 2;
	    w -= 2;
@@ -1325,16 +1289,13 @@

	if (w)
	{
-	    __asm__ __volatile__ (
-		"movd	%0,	%%mm2\n\t"
-		"movd	%1,	%%mm3\n\t"
-		"paddusb	%%mm2,	%%mm3\n\t"
-		"movd	%%mm3,	%1\n\t"
-		: /* no output */ : "m" (*src), "m" (*dst));
+	    *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
+						 _mm_cvtsi32_si64(*dst)));
+
	}
     }

-    emms();
+    _mm_empty();
 }

 #define GetStart(drw,x,y,type,stride,line,bpp) {\
@@ -1358,7 +1319,7 @@
     FbStride	stride;
     int		bpp;
     ullong	fill;
-    Vector8x8	vfill;
+    __m64	vfill;
     CARD32	byte_width;
     CARD8	*byte_line;
     FbBits      *bits;
@@ -1390,7 +1351,7 @@
     }

     fill = ((ullong)xor << 32) | xor;
-    vfill = (Vector8x8)fill;
+    vfill = (__m64)fill;

     while (height--)
     {
@@ -1406,7 +1367,7 @@
	    d += 2;
	}

-	while (w >= 4 && ((unsigned int)d & 7))
+	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(CARD32 *)d = xor;

@@ -1416,18 +1377,15 @@

	while (w >= 64)
	{
-	    __asm__ __volatile (
-		"movq	%0,	(%1)\n\t"
-		"movq	%0,	8(%1)\n\t"
-		"movq	%0,	16(%1)\n\t"
-		"movq	%0,	24(%1)\n\t"
-		"movq	%0,	32(%1)\n\t"
-		"movq	%0,	40(%1)\n\t"
-		"movq	%0,	48(%1)\n\t"
-		"movq	%0,	56(%1)\n\t"
-		: /* no output */
-		: "y" (vfill), "r" (d)
-		: "memory");
+	    *(__m64*) (d +  0) = vfill;
+	    *(__m64*) (d +  8) = vfill;
+	    *(__m64*) (d + 16) = vfill;
+	    *(__m64*) (d + 24) = vfill;
+	    *(__m64*) (d + 32) = vfill;
+	    *(__m64*) (d + 40) = vfill;
+	    *(__m64*) (d + 48) = vfill;
+	    *(__m64*) (d + 56) = vfill;
+
	    w -= 64;
	    d += 64;
	}
@@ -1446,10 +1404,11 @@
	}
     }

-    emms();
+    _mm_empty();
     return TRUE;
 }

+#ifndef __amd64__
 Bool
 fbHaveMMX (void)
 {
@@ -1508,7 +1467,8 @@

     return mmx_present;
 }
+#endif /* __amd64__ */

 #endif /* RENDER */

-#endif /* USE_GCC34_MMX */
+#endif /* USE_MMX */

Index: programs/Xserver/fb/fbmmx.h
===================================================================
RCS file: /cvs/xorg/xc/programs/Xserver/fb/fbmmx.h,v
retrieving revision 1.2
diff -u -r1.2 fbmmx.h
--- programs/Xserver/fb/fbmmx.h	4 Dec 2004 00:42:50 -0000	1.2
+++ programs/Xserver/fb/fbmmx.h	24 Dec 2004 02:33:45 -0000
@@ -22,13 +22,19 @@
  *
  * Based on work by Owen Taylor
  */
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
+
+#ifndef __amd64__
 Bool fbHaveMMX(void);
 #else
-#define fbHaveMMX FALSE
+#define fbHaveMMX() TRUE
+#endif
+
+#else
+#define fbHaveMMX() FALSE
 #endif

-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
 void fbCompositeSolidMask_nx8888x0565Cmmx (CARD8      op,
					   PicturePtr pSrc,
@@ -157,4 +163,4 @@
			  int	     height,
			  FbBits     xor);

-#endif /* USE_GCC34_MMX */
+#endif /* USE_MMX */
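The fbmmx.h change above also makes the fallback definition
function-like: the old object-like `#define fbHaveMMX FALSE` would have
expanded any unguarded `fbHaveMMX()` call site to the invalid token
sequence `FALSE()`. With the new form every call site parses, and on
AMD64 (where MMX is architecturally guaranteed) the constant lets the
compiler fold the runtime check away entirely. A compile-time sketch of
the dispatch pattern (standalone, not patch code; TRUE/FALSE are modeled
with plain ints for self-containment):

/* fbhavemmx-demo.c -- builds and runs as-is; compiling with -DUSE_MMX
 * on i386 would additionally need a real fbHaveMMX() definition. */
#include <stdio.h>

#ifdef USE_MMX
# ifdef __amd64__
#  define fbHaveMMX() 1          /* every AMD64 CPU implements MMX */
# else
int fbHaveMMX (void);            /* i386: runtime CPUID probe      */
# endif
#else
# define fbHaveMMX() 0           /* function-like, so call sites   */
#endif                           /* still parse when MMX is off    */

int
main (void)
{
    if (fbHaveMMX ())
	printf ("MMX path\n");
    else
	printf ("generic path\n"); /* branch folds away when the
				    * macro expands to a constant  */
    return 0;
}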
Index: programs/Xserver/fb/fbpict.c
===================================================================
RCS file: /cvs/xorg/xc/programs/Xserver/fb/fbpict.c,v
retrieving revision 1.6
diff -u -r1.6 fbpict.c
--- programs/Xserver/fb/fbpict.c	4 Dec 2004 00:42:50 -0000	1.6
+++ programs/Xserver/fb/fbpict.c	24 Dec 2004 02:33:45 -0000
@@ -877,7 +877,7 @@
	switch (pDst->format) {
	case PICT_r5g6b5:
	case PICT_b5g6r5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
	    if (fbHaveMMX())
		func = fbCompositeSolidMask_nx8x0565mmx;
	    else
@@ -892,7 +892,7 @@
	case PICT_x8r8g8b8:
	case PICT_a8b8g8r8:
	case PICT_x8b8g8r8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
	    if (fbHaveMMX())
		func = fbCompositeSolidMask_nx8x8888mmx;
	    else
@@ -906,7 +906,7 @@
	    switch (pDst->format) {
	    case PICT_a8r8g8b8:
	    case PICT_x8r8g8b8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSolidMask_nx8888x8888Cmmx;
		else
@@ -914,7 +914,7 @@
		    func = fbCompositeSolidMask_nx8888x8888C;
		break;
	    case PICT_r5g6b5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSolidMask_nx8888x0565Cmmx;
		else
@@ -929,7 +929,7 @@
	    switch (pDst->format) {
	    case PICT_a8b8g8r8:
	    case PICT_x8b8g8r8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSolidMask_nx8888x8888Cmmx;
		else
@@ -937,7 +937,7 @@
		    func = fbCompositeSolidMask_nx8888x8888C;
		break;
	    case PICT_b5g6r5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSolidMask_nx8888x0565Cmmx;
		else
@@ -978,13 +978,13 @@
	    switch (pDst->format) {
	    case PICT_a8r8g8b8:
	    case PICT_x8r8g8b8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSrc_8888RevNPx8888mmx;
 #endif
		break;
	    case PICT_r5g6b5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSrc_8888RevNPx0565mmx;
 #endif
@@ -1000,13 +1000,13 @@
	    switch (pDst->format) {
	    case PICT_a8b8g8r8:
	    case PICT_x8b8g8r8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSrc_8888RevNPx8888mmx;
 #endif
		break;
	    case PICT_r5g6b5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSrc_8888RevNPx0565mmx;
 #endif
@@ -1032,7 +1032,7 @@
	    switch (pDst->format) {
	    case PICT_a8r8g8b8:
	    case PICT_x8r8g8b8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		{
		    srcRepeat = FALSE;
@@ -1041,7 +1041,7 @@
 #endif
		break;
	    case PICT_r5g6b5:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		{
		    srcRepeat = FALSE;
@@ -1109,7 +1109,7 @@
	case PICT_a8r8g8b8:
	    switch (pDst->format) {
	    case PICT_a8r8g8b8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSrcAdd_8888x8888mmx;
		else
@@ -1121,7 +1121,7 @@
	case PICT_a8b8g8r8:
	    switch (pDst->format) {
	    case PICT_a8b8g8r8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSrcAdd_8888x8888mmx;
		else
@@ -1133,7 +1133,7 @@
	case PICT_a8:
	    switch (pDst->format) {
	    case PICT_a8:
-#ifdef USE_GCC34_MMX
+#ifdef USE_MMX
		if (fbHaveMMX())
		    func = fbCompositeSrcAdd_8000x8000mmx;
		else