Index: Imakefile
===================================================================
RCS file: /cvs/xorg/xc/programs/Xserver/fb/Imakefile,v
retrieving revision 1.9
diff -u -p -r1.9 Imakefile
--- Imakefile	19 Jan 2005 21:56:07 -0000	1.9
+++ Imakefile	22 Jan 2005 21:11:45 -0000
@@ -4,17 +4,23 @@ XCOMM
 XCOMM Id: Imakefile,v 1.1 1999/11/02 03:54:44 keithp Exp $
 
 #if defined(HasGcc34) && HasGcc34
-MMXOPTIONS= -mmmx -Winline --param inline-unit-growth=10000 \
-	    --param large-function-growth=10000 -DUSE_MMX
+INLINEOPTIONS = -Winline --param inline-unit-growth=10000 \
+		--param max-inline-insns-single=10000 \
+		--param large-function-growth=10000
+#if defined(AlphaArchitecture)
+MMXOPTIONS= $(INLINEOPTIONS) -DUSE_MMX
+#else
+MMXOPTIONS= $(INLINEOPTIONS) -mmmx -DUSE_MMX
+#endif
 SSEOPTIONS= $(MMXOPTIONS) -msse -DUSE_SSE
 
-#if defined(i386Architecture)
+#if defined(i386Architecture) || defined(AlphaArchitecture)
 SpecialCObjectRule(fbmmx,fbmmx.c,$(MMXOPTIONS))
 #elif defined(AMD64Architecture)
 SpecialCObjectRule(fbmmx,fbmmx.c,$(SSEOPTIONS))
 #endif
 
-#if defined(i386Architecture) || defined(AMD64Architecture)
+#if defined(i386Architecture) || defined(AMD64Architecture) || defined(AlphaArchitecture)
 SpecialCObjectRule(fbpict,fbpict.c,$(MMXOPTIONS))
 SpecialCObjectRule(fbfill,fbfill.c,$(MMXOPTIONS))
 SpecialCObjectRule(fbcopy,fbcopy.c,$(MMXOPTIONS))
Index: alpha_mmintrin.h
===================================================================
RCS file: alpha_mmintrin.h
diff -N alpha_mmintrin.h
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ alpha_mmintrin.h	22 Jan 2005 21:11:45 -0000
@@ -0,0 +1,229 @@
+/*
+ * Replacement for <mmintrin.h> on Alpha: a subset of the _mm_* MMX
+ * intrinsics implemented with 64-bit integer arithmetic and Alpha
+ * instructions, reached through compiler builtins or inline assembly.
+ */
+#ifndef ALPHA_MMINTRIN_H
+#define ALPHA_MMINTRIN_H
+
+#if (defined(__GNUC__) \
+     && (__GNUC__ >= 4 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4)) \
+     && __alpha_max__)
+#define __amask __builtin_alpha_amask
+#define __extwl __builtin_alpha_extwl
+#define __inswl __builtin_alpha_inswl
+#define __maxsw4 __builtin_alpha_maxsw4
+#define __minub8 __builtin_alpha_minub8
+#define __minuw4 __builtin_alpha_minuw4
+#define __pkwb __builtin_alpha_pkwb
+#define __unpkbw __builtin_alpha_unpkbw
+#define __zapnot __builtin_alpha_zapnot
+#elif defined(__GNUC__)
+#define __amask(a) ({ uint64_t __r; asm (" amask %1,%0" : "=r" (__r) : "rI" (a)); __r; })
+#define __extwl(a, b) ({ uint64_t __r; asm (" extwl %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define __inswl(a, b) ({ uint64_t __r; asm (" inswl %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#define __maxsw4(a, b) ({ uint64_t __r; asm (".arch ev6; maxsw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define __minub8(a, b) ({ uint64_t __r; asm (".arch ev6; minub8 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define __minuw4(a, b) ({ uint64_t __r; asm (".arch ev6; minuw4 %r1,%2,%0" : "=r" (__r) : "%rJ" (a), "rI" (b)); __r; })
+#define __pkwb(a) ({ uint64_t __r; asm (".arch ev6; pkwb %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define __unpkbw(a) ({ uint64_t __r; asm (".arch ev6; unpkbw %r1,%0" : "=r" (__r) : "rJ" (a)); __r; })
+#define __zapnot(a, b) ({ uint64_t __r; asm (" zapnot %r1,%2,%0" : "=r" (__r) : "rJ" (a), "rI" (b)); __r; })
+#elif defined(__DECC)
+#define __amask(a) asm ("amask %a0,%v0", a)
+#define __extwl(a, b) asm ("extwl %a0,%a1,%v0", a, b)
+#define __inswl(a, b) asm ("inswl %a0,%a1,%v0", a, b)
+#define __maxsw4(a, b) asm ("maxsw4 %a0,%a1,%v0", a, b)
+#define __minub8(a, b) asm ("minub8 %a0,%a1,%v0", a, b)
+#define __minuw4(a, b) asm ("minuw4 %a0,%a1,%v0", a, b)
+#define __pkwb(a) asm ("pkwb %a0,%v0", a)
+#define __unpkbw(a) asm ("unpkbw %a0,%v0", a)
+#define __zapnot(a, b) asm ("zapnot %a0,%a1,%v0", a, b)
+#else
+#error "unsupported compiler"
+#endif
+
+typedef unsigned long __m64;
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2 (paddw).  */
+static __inline __m64
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+    const __m64 __msb = 0x8000800080008000;
+    __m64 __signs = (__m1 ^ __m2) & __msb;
+
+    /* Add the low 15 bits of every lane with the top bits cleared, so a
+       carry can never cross into the neighbouring lane, then restore the
+       top bits: sum bit 15 = carry-in ^ m1 bit 15 ^ m2 bit 15.  */
+    __m1 = (__m1 & ~__msb) + (__m2 & ~__msb);
+    return __m1 ^ __signs;
+}
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
+   saturated arithmetic (paddusb).  */
+static __inline __m64
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+    return __m1 + __minub8(__m2, ~__m1);
+}
+
+/* Bit-wise AND the 64-bit values in M1 and M2 (pand).  */
+static __inline __m64
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+    return __m1 & __m2;
+}
+
+/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
+static __inline __m64
+_mm_cvtsi32_si64 (int __i)
+{
+    return (unsigned int) __i;
+}
+
+/* Convert the lower 32 bits of the __m64 object into an integer.  */
+static __inline int
+_mm_cvtsi64_si32 (__m64 __i)
+{
+    return __i;
+}
+
+/* Empty the multimedia state (emms).  */
+static __inline void
+_mm_empty (void)
+{
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
+   the low 16 bits of the results (pmullw).  */
+static __inline __m64
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+    __m64 __t0, __t2, __t4, __t6;
+
+    __t0 = (unsigned int) __m1 * (unsigned int) __m2;
+    __t2 = (unsigned int) __extwl(__m1, 2) * (unsigned int) __extwl(__m2, 2);
+    __t4 = (unsigned int) __extwl(__m1, 4) * (unsigned int) __extwl(__m2, 4);
+    __t6 = (unsigned int) __extwl(__m1, 6) * (unsigned int) __extwl(__m2, 6);
+
+    __t0 = __inswl(__t0, 0);
+    __t2 = __inswl(__t2, 2);
+    __t4 = __inswl(__t4, 4);
+    __t6 = __inswl(__t6, 6);
+
+    return __t0 | __t2 | __t4 | __t6;
+}
+
+/* Bit-wise inclusive OR the 64-bit values in M1 and M2 (por).  */
+static __inline __m64
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+    return __m1 | __m2;
+}
+
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+   the result, and the four 16-bit values from M2 into the upper four 8-bit
+   values of the result, all with unsigned saturation (packuswb).  */
+static __inline __m64
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+    __m1 = __maxsw4(__m1, 0);
+    __m2 = __maxsw4(__m2, 0);
+    __m1 = __minuw4(__m1, 0x00ff00ff00ff00ff);
+    __m2 = __minuw4(__m2, 0x00ff00ff00ff00ff);
+    __m1 = __pkwb(__m1);
+    __m2 = __pkwb(__m2);
+
+    return __m1 | (__m2 << 32);
+}
+
+/* Creates a 64-bit zero.  */
+static __inline __m64
+_mm_setzero_si64 (void)
+{
+    return 0;
+}
+
+#if 0
+/* Return a combination of the four 16-bit values in A.  The selector
+   must be an immediate.  */
+#define _mm_shuffle_pi16(A, N) \
+  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
+
+/* Create a selector for use with the SHUFPS instruction.  */
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+static __inline __m64
+_mm_shuffle_pi16 (__m64 __A, int __N)
+{
+    __m64 __t0, __t1, __t2, __t3;
+
+    __t0 = __extwl(__A, (__N >> 0) & 3);
+    __t1 = __extwl(__A, (__N >> 2) & 3);
+    __t2 = __extwl(__A, (__N >> 4) & 3);
+    __t3 = __extwl(__A, (__N >> 6) & 3);
+
+    return __t0 | (__t1 << 16) | (__t2 << 32) | (__t3 << 48);
+}
+#define HAVE_PSHUFW
+#endif
+
+/* Shift the 64-bit value in M left by COUNT bits, shifting in zeros; a
+   count of 64 or more yields zero (psllq).  */
+static __inline __m64
+_mm_slli_si64 (__m64 __m, __m64 __count)
+{
+    return __count < 64 ? __m << __count : 0;
+}
+
+/* Shift each of the four 16-bit values in M right by COUNT bits, shifting
+   in zeros; a count of 16 or more yields zero (psrlw).  */
+static __inline __m64
+_mm_srli_pi16 (__m64 __m, __m64 __count)
+{
+    __m64 __mask = 0xffff0000ffff0000;
+
+    if (__count > 15)
+	return 0;
+    /* After the XOR, __mask covers the top COUNT bits of every lane, i.e.
+       the bits that the 64-bit shift dragged in from the lane above.  */
+    __mask = __mask ^ (__mask >> __count);
+    return (__m >> __count) & ~__mask;
+}
+
+/* Shift the 64-bit value in M right by COUNT bits, shifting in zeros; a
+   count outside 0..63 yields zero (psrlq).  */
+static __inline __m64
+_mm_srli_si64 (__m64 __m, int __count)
+{
+    /* The unsigned compare also rejects negative counts, for which the
+       shift would be undefined.  */
+    return (unsigned int) __count < 64 ? __m >> __count : 0;
+}
+
+/* Interleave the four 8-bit values from the low half of M1 with the four
+   8-bit values from the low half of M2 (punpcklbw).  */
+static __inline __m64
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+    __m1 = __unpkbw(__m1);
+    __m2 = __unpkbw(__m2);
+    return __m1 | (__m2 << 8);
+}
+
+/* Interleave the four 8-bit values from the high half of M1 with the four
+   8-bit values from the high half of M2 (punpckhbw).  */
+static __inline __m64
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+    return _mm_unpacklo_pi8 (__m1 >> 32, __m2 >> 32);
+}
+
+/* Bit-wise exclusive OR the 64-bit values in M1 and M2 (pxor).  */
+static __inline __m64
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+    return __m1 ^ __m2;
+}
+
+#endif /* ALPHA_MMINTRIN_H */
Index: fbmmx.c
===================================================================
RCS file: /cvs/xorg/xc/programs/Xserver/fb/fbmmx.c,v
retrieving revision 1.5
diff -u -p -r1.5 fbmmx.c
--- fbmmx.c	16 Jan 2005 01:59:23 -0000	1.5
+++ fbmmx.c	22 Jan 2005 21:11:49 -0000
@@ -31,7 +31,11 @@
 #include "fb.h"
 #include "fbmmx.h"
 
+#ifdef __alpha__
+#include "alpha_mmintrin.h"
+#else
 #include <mmintrin.h>
+#endif
 
 #ifdef USE_SSE
 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
@@ -53,6 +57,27 @@ typedef unsigned long long ullong;
 #define CHECKPOINT()
 #endif
 
+#ifdef __alpha__
+
+#define mmx_4x00ff 0x00ff00ff00ff00ff
+#define mmx_4x0080 0x0080008000800080
+#define mmx_565_rgb 0x000001f0003f001f
+#define mmx_565_r 0x000000f800000000
+#define mmx_565_g 0x0000000000fc0000
+#define mmx_565_b 0x00000000000000f8
+#define mmx_mask_0 0xffffffffffff0000
+#define mmx_mask_1 0xffffffff0000ffff
+#define mmx_mask_2 0xffff0000ffffffff
+#define mmx_mask_3 0x0000ffffffffffff
+#define mmx_full_alpha 0x00ff000000000000
+#define mmx_565_unpack_multiplier 0x0000008404100840
+#define mmx_ffff0000ffff0000 0xffff0000ffff0000
+#define mmx_0000ffff00000000 0x0000ffff00000000
+#define mmx_000000000000ffff 0x000000000000ffff
+
+#define MC(x) ((__m64) mmx_##x)
+
+#else
 typedef struct
 {
     ullong mmx_4x00ff;
@@ -92,6 +117,7 @@ static const MMXData c =
 };
 
 #define MC(x) ((__m64) c.mmx_##x)
+#endif
 
 static __inline__ __m64
 shift (__m64 v, int s)
@@ -1682,7 +1708,13 @@ fbCompositeCopyAreammx (CARD8 op,
 		   width, height);
 }
 
-#ifndef __amd64__
+#if defined(__alpha__) && !defined(__alpha_max__)
+Bool
+fbHaveMMX (void)
+{
+    return __amask(1 << 8) == 0;
+}
+#elif !defined(__amd64__) && !defined(__alpha__)
 Bool
 fbHaveMMX (void)
 {
Index: fbmmx.h
===================================================================
RCS file: /cvs/xorg/xc/programs/Xserver/fb/fbmmx.h,v
retrieving revision 1.4
diff -u -p -r1.4 fbmmx.h
--- fbmmx.h	13 Jan 2005 20:49:21 -0000	1.4
+++ fbmmx.h	22 Jan 2005 21:11:49 -0000
@@ -24,10 +24,10 @@
  */
 
 #ifdef USE_MMX
-#ifndef __amd64__
-Bool fbHaveMMX(void);
-#else
+#if defined(__amd64__) || defined(__alpha_max__)
 #define fbHaveMMX() TRUE
+#else
+Bool fbHaveMMX(void);
 #endif
 
 #else