public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, i386]: Committed: Fix PR target/22152
@ 2008-03-08  7:01 Uros Bizjak
  2008-03-08  9:41 ` Paolo Bonzini
  2008-03-08 10:21 ` Richard Guenther
  0 siblings, 2 replies; 5+ messages in thread
From: Uros Bizjak @ 2008-03-08  7:01 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 9004 bytes --]

Hello!

Attached patch substantially improves handling of MMX DImode values. It 
turns out that in order to reliably separate "native" and MMX DImode 
values, a new mode is needed. Since we are using vector registers, the 
natural choice is V1DImode, vector mode with one DImode element. After 
this change, V1DI mode is considered to be native MMX mode (native MMX 
modes were successfully separated out from non-vector modes by the 
patch, committed a couple of weeks ago).

Patch rewrites all MMX DImode patterns into V1DImode, also updating 
mmintrin.h for the new/changed builtins on the way. The patch also 
rewrites MMX shift patterns in the same way as SSE shift patterns were 
changed some time ago.

The benefits of the patch can be seen from the original testcase from PR 
target/22152. For a slightly changed testcase (to avoid uninitialized 
variables), non-patched gcc creates barely recognizable code:

unsigned_add3:
        pushl   %ebp
        pxor    %mm1, %mm1
        movl    %esp, %ebp
        pushl   %edi
        pushl   %esi
        pushl   %ebx
        xorl    %ebx, %ebx
        subl    $84, %esp
        movl    12(%ebp), %edi
        movl    8(%ebp), %eax
        movl    20(%ebp), %esi
        movq    (%eax), %mm0
        cmpl    $1, %esi
        movq    %mm0, -56(%ebp)
        movl    (%edi), %eax
        movl    4(%edi), %edx
        movq    -56(%ebp), %mm0
        movl    %eax, -48(%ebp)
        movl    %edx, -44(%ebp)
        pcmpeqb %mm0, %mm0
        movq    -56(%ebp), %mm3
        psubq   %mm0, %mm1
        movl    $1, %eax
        movq    %mm1, %mm2
        movq    -56(%ebp), %mm0
        movq    -48(%ebp), %mm1
        paddq   %mm0, %mm1
        psrlq   $1, %mm0
        movq    %mm1, -72(%ebp)
        movq    -48(%ebp), %mm1
        pand    (%edi), %mm3
        psrlq   $1, %mm1
        pand    %mm2, %mm3
        paddq   %mm1, %mm0
        movl    -72(%ebp), %edx
        paddq   %mm3, %mm0
        movl    -68(%ebp), %ecx
        psrlq   $63, %mm0
        movq    %mm0, -32(%ebp)
        jbe     .L3
        .p2align 4,,10
        .p2align 3
.L6:
        movl    16(%ebp), %ebx
        movl    %edx, -8(%ebx,%eax,8)
        movl    %ecx, -4(%ebx,%eax,8)
        movl    8(%ebp), %edx
        movq    (%edx,%eax,8), %mm0
        movq    %mm0, -88(%ebp)
        pand    (%edi,%eax,8), %mm0
        movl    -88(%ebp), %edx
        pand    %mm2, %mm0
        movl    %edx, -40(%ebp)
        movl    -84(%ebp), %ecx
        movl    %ecx, -36(%ebp)
        movq    -40(%ebp), %mm1
        movq    -40(%ebp), %mm3
        paddq   (%edi,%eax,8), %mm1
        paddq   %mm0, %mm3
        paddq   -32(%ebp), %mm1
        movq    -32(%ebp), %mm0
        movq    %mm1, -96(%ebp)
        pand    %mm3, %mm0
        incl    %eax
        paddq   %mm0, %mm3
        movl    -96(%ebp), %edx
        movl    -92(%ebp), %ecx
        movq    %mm3, -32(%ebp)
        cmpl    %eax, %esi
        ja      .L6
        leal    -8(,%esi,8), %ebx
.L3:
        movl    16(%ebp), %eax
        movl    %edx, (%eax,%ebx)
        movl    %ecx, 4(%eax,%ebx)
        movq    -32(%ebp), %mm0
        addl    $84, %esp
        popl    %ebx
        popl    %esi
        popl    %edi
        leave
        ret

The difference with the patch is noticeable (please note the code in the 
loop):

unsigned_add3:
        pushl   %ebp
        pxor    %mm0, %mm0
        movl    %esp, %ebp
        xorl    %eax, %eax
        pushl   %edi
        movl    20(%ebp), %edx
        pushl   %esi
        leal    -1(%edx), %ecx
        pushl   %ebx
        movl    8(%ebp), %esi
        movl    12(%ebp), %ebx
        movq    (%esi), %mm2
        movq    (%ebx), %mm4
        movq    %mm2, %mm1
        movq    %mm4, %mm3
        pcmpeqb %mm2, %mm1
        psrlq   $1, %mm3
        psubq   %mm1, %mm0
        xorl    %edx, %edx
        movq    %mm0, %mm6
        movq    %mm2, %mm1
        movq    %mm2, %mm0
        paddq   %mm4, %mm1
        pand    %mm4, %mm2
        psrlq   $1, %mm0
        pand    %mm6, %mm2
        paddq   %mm3, %mm0
        movl    16(%ebp), %edi
        paddq   %mm2, %mm0
        movq    %mm1, %mm5
        psrlq   $63, %mm0
        cmpl    $1, 20(%ebp)
        movq    %mm0, %mm3
        jbe     .L3
        .align 16
.L6:
        movq    %mm5, (%edi,%eax,8)
        movq    8(%esi,%eax,8), %mm0
        movq    8(%ebx,%eax,8), %mm2
        movq    %mm0, %mm1
        incl    %eax
        pand    %mm2, %mm1
        cmpl    %ecx, %eax
        pand    %mm6, %mm1
        paddq   %mm0, %mm2
        paddq   %mm1, %mm0
        paddq   %mm3, %mm2
        movq    %mm3, %mm1
        movq    %mm2, %mm5
        pand    %mm0, %mm1
        paddq   %mm1, %mm0
        movq    %mm0, %mm3
        jne     .L6
        movl    20(%ebp), %eax
        leal    -8(,%eax,8), %edx
.L3:
        movq    %mm5, (%edi,%edx)
        movq    %mm3, %mm0
        popl    %ebx
        popl    %esi
        popl    %edi
        leave
        ret

(FWIW, the result is 50% shorter object code).

Fortunately, none of the changed builtins were documented in the 
documentation, so IMO we are free to change arguments to builtin 
functions, whereas intrinsic functions were not changed at all.

2008-03-07  Uros Bizjak  <ubizjak@gmail.com>

        PR target/22152
        * config/i386/i386-modes.def (V1DI): New vector mode.
        * config/i386/i386.h (VALID_MMX_REG_MODE): Add V1DImode.
        * config/i386/mmx.md (MMXMODEI8): New mode iterator.
        (MMXMODE248): Ditto.
        (MMXMODE): Add V1DI mode.
        (mmxvecsize): Change DI mode to V1DI mode.
        ("mov<mode>"): Use MMXMODEI8 mode iterator.
        ("*mov<mode>_internal_rex64"): Ditto.
        ("*mov<mode>_internal"): Ditto.
        ("mmx_add<mode>3"): Ditto.  Handle V1DImode for TARGET_SSE2.
        ("mmx_sub<mode>3"): Ditto.
        ("mmx_adddi3"): Remove insn pattern.
        ("mmx_subdi3"): Ditto.
        ("mmx_ashr<mode>3"): Use SImode and "yN" constraint for operand 2.
        ("mmx_lshr<mode>3"): Ditto. Use MMXMODE248 mode iterator.
        ("mmx_ashl<mode>3"): Ditto.
        ("mmx_lshrdi3"): Remove insn pattern.
        ("mmx_ashldi3"): Ditto.
        * config/i386/i386.c (classify_argument): Handle V1DImode.
        (function_arg_advance_32): Ditto.
        (function_arg_32): Ditto.
        (struct builtin_description) [IX86_BUILTIN_PADDQ]: Use
        mmx_addv1di3 insn pattern.
        [IX86_BUILTIN_PSUBQ]: Use mmx_subv1di3 insn pattern.
        [IX86_BUILTIN_PSLL?, IX86_BUILTIN_PSRL?, IX86_BUILTIN_PSRA?,
        IX86_BUILTIN_PSLL?I, IX86_BUILTIN_PSRL?I, IX86_BUILTIN_PSRA?I,
        IX86_BUILTIN_PSLL?I128, IX86_BUILTIN_PSRL?I128, 
IX86_BUILTIN_PSRA?I128]:
        Remove definitions.
        (V1DI_type_node): New node.
        (v1di_ftype_v1di_int): Ditto.
        (v1di_ftype_v1di_v1di): Ditto.
        (v2si_ftype_v2si_si): Ditto.
        (v4hi_ftype_v4hi_di): Remove node.
        (v2si_ftype_v2si_di): Ditto.
        (ix86_init_mmx_sse_builtins): Handle V1DImode.
        (__builtin_ia32_psll?, __builtin_ia32_psrl?, __builtin_ia32_psra?):
        Redefine builtins using def_builtin_const with *_ftype_*_int node.
        (__builtin_ia32_psll?i, __builtin_ia32_psrl?i, 
__builtin_ia32_psra?i):
        Add new builtins using def_builtin_const.
        (ix86_expand_builtin) [IX86_BUILTIN_PSLL?, IX86_BUILTIN_PSRL?,
        IX86_BUILTIN_PSRA?, IX86_BUILTIN_PSLL?I, IX86_BUILTIN_PSRL?I,
        IX86_BUILTIN_PSRA?I]: Handle builtin definitions.
        * config/i386/mmintrin.h (__v1di): New typedef.
        (_mm_add_si64): Cast arguments to __v1di type.
        (_mm_sub_si64): Ditto.
        (_mm_sll_pi16): Cast __count to __v4hi type.
        (_mm_sll_pi32): Cast __count to __v2si type.
        (_mm_sll_si64): Cast arguments to __v1di type.
        (_mm_srl_pi16): Cast __count to __v4hi type.
        (_mm_srl_pi32): Cast __count to __v2si type.
        (_mm_srl_si64): Cast arguments to __v1di type.
        (_mm_sra_pi16): Cast __count to __v4hi type.
        (_mm_sra_pi32): Cast __count to __v2si type.
        (_mm_slli_pi16): Use __builtin_ia32_psllwi.
        (_mm_slli_pi32): Use __builtin_ia32_pslldi.
        (_mm_slli_si64): Use __builtin_ia32_psllqi. Cast __m to __v1di type.
        (_mm_srli_pi16): Use __builtin_ia32_psrlwi.
        (_mm_srli_pi32): Use __builtin_ia32_psrldi.
        (_mm_srli_si64): Use __builtin_ia32_psrlqi. Cast __m to __v1di type.
        (_mm_srai_pi16): Use __builtin_ia32_psrawi.
        (_mm_srai_pi32): Use __builtin_ia32_psradi.
        * config/i386/i386.md (UNSPEC_NOP): Remove unspec definition.
        * doc/extend.texi (X86 Built-in Functions) [__builtin_ia32_psll?,
        __builtin_ia32_psrl?, __builtin_ia32_psra?, __builtin_ia32_psll?i,
        __builtin_ia32_psrl?i, __builtin_ia32_psra?i]: Add new builtins.

Patch was bootstrapped and regression tested on i686-pc-linux-gnu and 
x86_64-pc-linux-gnu {,-m32}. The testcase will be committed in a 
separate commit, as I have to clean it a bit.

The patch is committed to SVN.

Uros.


[-- Attachment #2: p.diff.txt --]
[-- Type: text/plain, Size: 30224 bytes --]

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 133009)
+++ config/i386/i386.h	(working copy)
@@ -1120,8 +1120,9 @@ do {									\
   ((MODE) == V2SFmode || (MODE) == SFmode)
 
 #define VALID_MMX_REG_MODE(MODE)					\
-    ((MODE) == DImode || (MODE) == V8QImode || (MODE) == V4HImode	\
-     || (MODE) == V2SImode || (MODE) == SImode)
+  ((MODE == V1DImode) || (MODE) == DImode				\
+   || (MODE) == V2SImode || (MODE) == SImode				\
+   || (MODE) == V4HImode || (MODE) == V8QImode)
 
 /* ??? No autovectorization into MMX or 3DNOW until we can reliably
    place emms and femms instructions.  */
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 133009)
+++ config/i386/i386.md	(working copy)
@@ -95,7 +95,6 @@
    (UNSPEC_RCP			45)
    (UNSPEC_RSQRT		46)
    (UNSPEC_SFENCE		47)
-   (UNSPEC_NOP			48)	; prevents combiner cleverness
    (UNSPEC_PFRCP		49)
    (UNSPEC_PFRCPIT1		40)
    (UNSPEC_PFRCPIT2		41)
Index: config/i386/mmx.md
===================================================================
--- config/i386/mmx.md	(revision 133009)
+++ config/i386/mmx.md	(working copy)
@@ -32,16 +32,18 @@
 
 ;; 8 byte integral modes handled by MMX (and by extension, SSE)
 (define_mode_iterator MMXMODEI [V8QI V4HI V2SI])
+(define_mode_iterator MMXMODEI8 [V8QI V4HI V2SI V1DI])
 
 ;; All 8-byte vector modes handled by MMX
-(define_mode_iterator MMXMODE [V8QI V4HI V2SI V2SF])
+(define_mode_iterator MMXMODE [V8QI V4HI V2SI V1DI V2SF])
 
 ;; Mix-n-match
 (define_mode_iterator MMXMODE12 [V8QI V4HI])
 (define_mode_iterator MMXMODE24 [V4HI V2SI])
+(define_mode_iterator MMXMODE248 [V4HI V2SI V1DI])
 
 ;; Mapping from integer vector mode to mnemonic suffix
-(define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (DI "q")])
+(define_mode_attr mmxvecsize [(V8QI "b") (V4HI "w") (V2SI "d") (V1DI "q")])
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
@@ -53,8 +55,8 @@
 ;; This is essential for maintaining stable calling conventions.
 
 (define_expand "mov<mode>"
-  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand" "")
-	(match_operand:MMXMODEI 1 "nonimmediate_operand" ""))]
+  [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand" "")
+	(match_operand:MMXMODEI8 1 "nonimmediate_operand" ""))]
   "TARGET_MMX"
 {
   ix86_expand_vector_move (<MODE>mode, operands);
@@ -62,9 +64,9 @@
 })
 
 (define_insn "*mov<mode>_internal_rex64"
-  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand"
+  [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand"
 				"=rm,r,!y,!y ,m ,!y,Y2,x,x ,m,r,x")
-	(match_operand:MMXMODEI 1 "vector_move_operand"
+	(match_operand:MMXMODEI8 1 "vector_move_operand"
 				"Cr ,m,C ,!ym,!y,Y2,!y,C,xm,x,x,r"))]
   "TARGET_64BIT && TARGET_MMX
    && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
@@ -86,9 +88,9 @@
    (set_attr "mode" "DI")])
 
 (define_insn "*mov<mode>_internal"
-  [(set (match_operand:MMXMODEI 0 "nonimmediate_operand"
+  [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand"
 			"=!y,!y ,m ,!y ,*Y2,*Y2,*Y2 ,m  ,*x,*x,*x,m ,?r ,?m")
-	(match_operand:MMXMODEI 1 "vector_move_operand"
+	(match_operand:MMXMODEI8 1 "vector_move_operand"
 			"C  ,!ym,!y,*Y2,!y ,C  ,*Y2m,*Y2,C ,*x,m ,*x,irm,r"))]
   "TARGET_MMX
    && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
@@ -557,26 +559,16 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 (define_insn "mmx_add<mode>3"
-  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
-        (plus:MMXMODEI
-	  (match_operand:MMXMODEI 1 "nonimmediate_operand" "%0")
-	  (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))]
-  "TARGET_MMX && ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
+  [(set (match_operand:MMXMODEI8 0 "register_operand" "=y")
+        (plus:MMXMODEI8
+	  (match_operand:MMXMODEI8 1 "nonimmediate_operand" "%0")
+	  (match_operand:MMXMODEI8 2 "nonimmediate_operand" "ym")))]
+  "(TARGET_MMX || (TARGET_SSE2 && <MODE>mode == V1DImode))
+   && ix86_binary_operator_ok (PLUS, <MODE>mode, operands)"
   "padd<mmxvecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "mmxadd")
    (set_attr "mode" "DI")])
 
-(define_insn "mmx_adddi3"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-        (unspec:DI
-	 [(plus:DI (match_operand:DI 1 "nonimmediate_operand" "%0")
-		   (match_operand:DI 2 "nonimmediate_operand" "ym"))]
-	 UNSPEC_NOP))]
-  "TARGET_SSE2 && ix86_binary_operator_ok (PLUS, DImode, operands)"
-  "paddq\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxadd")
-   (set_attr "mode" "DI")])
-
 (define_insn "mmx_ssadd<mode>3"
   [(set (match_operand:MMXMODE12 0 "register_operand" "=y")
         (ss_plus:MMXMODE12
@@ -598,26 +590,15 @@
    (set_attr "mode" "DI")])
 
 (define_insn "mmx_sub<mode>3"
-  [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
-        (minus:MMXMODEI
-	  (match_operand:MMXMODEI 1 "register_operand" "0")
-	  (match_operand:MMXMODEI 2 "nonimmediate_operand" "ym")))]
-  "TARGET_MMX"
+  [(set (match_operand:MMXMODEI8 0 "register_operand" "=y")
+        (minus:MMXMODEI8
+	  (match_operand:MMXMODEI8 1 "register_operand" "0")
+	  (match_operand:MMXMODEI8 2 "nonimmediate_operand" "ym")))]
+  "(TARGET_MMX || (TARGET_SSE2 && <MODE>mode == V1DImode))"
   "psub<mmxvecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "mmxadd")
    (set_attr "mode" "DI")])
 
-(define_insn "mmx_subdi3"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-        (unspec:DI
-	 [(minus:DI (match_operand:DI 1 "register_operand" "0")
-		    (match_operand:DI 2 "nonimmediate_operand" "ym"))]
-	 UNSPEC_NOP))]
-  "TARGET_SSE2"
-  "psubq\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxadd")
-   (set_attr "mode" "DI")])
-
 (define_insn "mmx_sssub<mode>3"
   [(set (match_operand:MMXMODE12 0 "register_operand" "=y")
         (ss_minus:MMXMODE12
@@ -778,54 +759,32 @@
   [(set (match_operand:MMXMODE24 0 "register_operand" "=y")
         (ashiftrt:MMXMODE24
 	  (match_operand:MMXMODE24 1 "register_operand" "0")
-	  (match_operand:DI 2 "nonmemory_operand" "yi")))]
+	  (match_operand:SI 2 "nonmemory_operand" "yN")))]
   "TARGET_MMX"
   "psra<mmxvecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "mmxshft")
    (set_attr "mode" "DI")])
 
 (define_insn "mmx_lshr<mode>3"
-  [(set (match_operand:MMXMODE24 0 "register_operand" "=y")
-        (lshiftrt:MMXMODE24
-	  (match_operand:MMXMODE24 1 "register_operand" "0")
-	  (match_operand:DI 2 "nonmemory_operand" "yi")))]
+  [(set (match_operand:MMXMODE248 0 "register_operand" "=y")
+        (lshiftrt:MMXMODE248
+	  (match_operand:MMXMODE248 1 "register_operand" "0")
+	  (match_operand:SI 2 "nonmemory_operand" "yN")))]
   "TARGET_MMX"
   "psrl<mmxvecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "mmxshft")
    (set_attr "mode" "DI")])
 
-(define_insn "mmx_lshrdi3"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-        (unspec:DI
-	  [(lshiftrt:DI (match_operand:DI 1 "register_operand" "0")
-		       (match_operand:DI 2 "nonmemory_operand" "yi"))]
-	  UNSPEC_NOP))]
-  "TARGET_MMX"
-  "psrlq\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxshft")
-   (set_attr "mode" "DI")])
-
 (define_insn "mmx_ashl<mode>3"
-  [(set (match_operand:MMXMODE24 0 "register_operand" "=y")
-        (ashift:MMXMODE24
-	  (match_operand:MMXMODE24 1 "register_operand" "0")
-	  (match_operand:DI 2 "nonmemory_operand" "yi")))]
+  [(set (match_operand:MMXMODE248 0 "register_operand" "=y")
+        (ashift:MMXMODE248
+	  (match_operand:MMXMODE248 1 "register_operand" "0")
+	  (match_operand:SI 2 "nonmemory_operand" "yN")))]
   "TARGET_MMX"
   "psll<mmxvecsize>\t{%2, %0|%0, %2}"
   [(set_attr "type" "mmxshft")
    (set_attr "mode" "DI")])
 
-(define_insn "mmx_ashldi3"
-  [(set (match_operand:DI 0 "register_operand" "=y")
-        (unspec:DI
-	 [(ashift:DI (match_operand:DI 1 "register_operand" "0")
-		     (match_operand:DI 2 "nonmemory_operand" "yi"))]
-	 UNSPEC_NOP))]
-  "TARGET_MMX"
-  "psllq\t{%2, %0|%0, %2}"
-  [(set_attr "type" "mmxshft")
-   (set_attr "mode" "DI")])
-
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;
 ;; Parallel integral comparisons
Index: config/i386/mmintrin.h
===================================================================
--- config/i386/mmintrin.h	(revision 133009)
+++ config/i386/mmintrin.h	(working copy)
@@ -42,6 +42,7 @@ typedef int __m64 __attribute__ ((__vect
 typedef int __v2si __attribute__ ((__vector_size__ (8)));
 typedef short __v4hi __attribute__ ((__vector_size__ (8)));
 typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef long long __v1di __attribute__ ((__vector_size__ (8)));
 
 /* Empty the multimedia state.  */
 static __inline void __attribute__((__always_inline__, __artificial__))
@@ -309,7 +310,7 @@ _m_paddd (__m64 __m1, __m64 __m2)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_add_si64 (__m64 __m1, __m64 __m2)
 {
-  return (__m64) __builtin_ia32_paddq ((long long)__m1, (long long)__m2);
+  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
 }
 #endif
 
@@ -413,7 +414,7 @@ _m_psubd (__m64 __m1, __m64 __m2)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_sub_si64 (__m64 __m1, __m64 __m2)
 {
-  return (__m64) __builtin_ia32_psubq ((long long)__m1, (long long)__m2);
+  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
 }
 #endif
 
@@ -520,7 +521,7 @@ _m_pmullw (__m64 __m1, __m64 __m2)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_sll_pi16 (__m64 __m, __m64 __count)
 {
-  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -532,7 +533,7 @@ _m_psllw (__m64 __m, __m64 __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_slli_pi16 (__m64 __m, int __count)
 {
-  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, __count);
+  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -545,7 +546,7 @@ _m_psllwi (__m64 __m, int __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_sll_pi32 (__m64 __m, __m64 __count)
 {
-  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (long long)__count);
+  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -557,7 +558,7 @@ _m_pslld (__m64 __m, __m64 __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_slli_pi32 (__m64 __m, int __count)
 {
-  return (__m64) __builtin_ia32_pslld ((__v2si)__m, __count);
+  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -570,7 +571,7 @@ _m_pslldi (__m64 __m, int __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_sll_si64 (__m64 __m, __m64 __count)
 {
-  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -582,7 +583,7 @@ _m_psllq (__m64 __m, __m64 __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_slli_si64 (__m64 __m, int __count)
 {
-  return (__m64) __builtin_ia32_psllq ((long long)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -595,7 +596,7 @@ _m_psllqi (__m64 __m, int __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_sra_pi16 (__m64 __m, __m64 __count)
 {
-  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -607,7 +608,7 @@ _m_psraw (__m64 __m, __m64 __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_srai_pi16 (__m64 __m, int __count)
 {
-  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, __count);
+  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -620,7 +621,7 @@ _m_psrawi (__m64 __m, int __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_sra_pi32 (__m64 __m, __m64 __count)
 {
-  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -632,7 +633,7 @@ _m_psrad (__m64 __m, __m64 __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_srai_pi32 (__m64 __m, int __count)
 {
-  return (__m64) __builtin_ia32_psrad ((__v2si)__m, __count);
+  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -645,7 +646,7 @@ _m_psradi (__m64 __m, int __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_srl_pi16 (__m64 __m, __m64 __count)
 {
-  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -657,7 +658,7 @@ _m_psrlw (__m64 __m, __m64 __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_srli_pi16 (__m64 __m, int __count)
 {
-  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, __count);
+  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -670,7 +671,7 @@ _m_psrlwi (__m64 __m, int __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_srl_pi32 (__m64 __m, __m64 __count)
 {
-  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -682,7 +683,7 @@ _m_psrld (__m64 __m, __m64 __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_srli_pi32 (__m64 __m, int __count)
 {
-  return (__m64) __builtin_ia32_psrld ((__v2si)__m, __count);
+  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -695,7 +696,7 @@ _m_psrldi (__m64 __m, int __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_srl_si64 (__m64 __m, __m64 __count)
 {
-  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
@@ -707,7 +708,7 @@ _m_psrlq (__m64 __m, __m64 __count)
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
 _mm_srli_si64 (__m64 __m, int __count)
 {
-  return (__m64) __builtin_ia32_psrlq ((long long)__m, (long long)__count);
+  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
 }
 
 static __inline __m64 __attribute__((__always_inline__, __artificial__))
Index: config/i386/i386-modes.def
===================================================================
--- config/i386/i386-modes.def	(revision 133009)
+++ config/i386/i386-modes.def	(working copy)
@@ -79,6 +79,7 @@ VECTOR_MODES (INT, 8);        /*       V
 VECTOR_MODES (INT, 16);       /* V16QI V8HI V4SI V2DI */
 VECTOR_MODES (FLOAT, 8);      /*            V4HF V2SF */
 VECTOR_MODES (FLOAT, 16);     /*       V8HF V4SF V2DF */
+VECTOR_MODE (INT, DI, 1);     /*                 V1DI */
 VECTOR_MODE (INT, QI, 2);     /*                 V2QI */
 VECTOR_MODE (INT, DI, 4);     /*                 V4DI */
 VECTOR_MODE (INT, SI, 8);     /*                 V8SI */
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 133009)
+++ config/i386/i386.c	(working copy)
@@ -3838,7 +3838,7 @@ classify_argument (enum machine_mode mod
     }
 
   /* for V1xx modes, just use the base mode */
-  if (VECTOR_MODE_P (mode)
+  if (VECTOR_MODE_P (mode) && mode != V1DImode
       && GET_MODE_SIZE (GET_MODE_INNER (mode)) == bytes)
     mode = GET_MODE_INNER (mode);
 
@@ -3910,6 +3910,7 @@ classify_argument (enum machine_mode mod
       classes[0] = X86_64_SSE_CLASS;
       classes[1] = X86_64_SSEUP_CLASS;
       return 2;
+    case V1DImode:
     case V2SFmode:
     case V2SImode:
     case V4HImode:
@@ -4211,6 +4212,7 @@ function_arg_advance_32 (CUMULATIVE_ARGS
     case V4HImode:
     case V2SImode:
     case V2SFmode:
+    case V1DImode:
       if (!type || !AGGREGATE_TYPE_P (type))
 	{
 	  cum->mmx_words += words;
@@ -4374,6 +4376,7 @@ function_arg_32 (CUMULATIVE_ARGS *cum, e
     case V4HImode:
     case V2SImode:
     case V2SFmode:
+    case V1DImode:
       if (!type || !AGGREGATE_TYPE_P (type))
 	{
 	  if (!TARGET_MMX && !warnedmmx && cum->warn_mmx)
@@ -17955,11 +17958,11 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv8qi3, "__builtin_ia32_paddb", IX86_BUILTIN_PADDB, UNKNOWN, 0 },
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv4hi3, "__builtin_ia32_paddw", IX86_BUILTIN_PADDW, UNKNOWN, 0 },
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_addv2si3, "__builtin_ia32_paddd", IX86_BUILTIN_PADDD, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_adddi3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_addv1di3, "__builtin_ia32_paddq", IX86_BUILTIN_PADDQ, UNKNOWN, 0 },
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv8qi3, "__builtin_ia32_psubb", IX86_BUILTIN_PSUBB, UNKNOWN, 0 },
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv4hi3, "__builtin_ia32_psubw", IX86_BUILTIN_PSUBW, UNKNOWN, 0 },
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_subv2si3, "__builtin_ia32_psubd", IX86_BUILTIN_PSUBD, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subdi3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, 0 },
+  { OPTION_MASK_ISA_SSE2, CODE_FOR_mmx_subv1di3, "__builtin_ia32_psubq", IX86_BUILTIN_PSUBQ, UNKNOWN, 0 },
 
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv8qi3, "__builtin_ia32_paddsb", IX86_BUILTIN_PADDSB, UNKNOWN, 0 },
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ssaddv4hi3, "__builtin_ia32_paddsw", IX86_BUILTIN_PADDSW, UNKNOWN, 0 },
@@ -18010,25 +18013,6 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvtsi2ss, 0, IX86_BUILTIN_CVTSI2SS, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvtsi2ssq, 0, IX86_BUILTIN_CVTSI642SS, UNKNOWN, 0 },
 
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLW, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv4hi3, 0, IX86_BUILTIN_PSLLWI, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLD, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashlv2si3, 0, IX86_BUILTIN_PSLLDI, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQ, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashldi3, 0, IX86_BUILTIN_PSLLQI, UNKNOWN, 0 },
-
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLW, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv4hi3, 0, IX86_BUILTIN_PSRLWI, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLD, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrv2si3, 0, IX86_BUILTIN_PSRLDI, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQ, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_lshrdi3, 0, IX86_BUILTIN_PSRLQI, UNKNOWN, 0 },
-
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAW, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv4hi3, 0, IX86_BUILTIN_PSRAWI, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRAD, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_ashrv2si3, 0, IX86_BUILTIN_PSRADI, UNKNOWN, 0 },
-
   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_psadbw, 0, IX86_BUILTIN_PSADBW, UNKNOWN, 0 },
   { OPTION_MASK_ISA_MMX, CODE_FOR_mmx_pmaddwd, 0, IX86_BUILTIN_PMADDWD, UNKNOWN, 0 },
 
@@ -18140,17 +18124,6 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulsidi3, 0, IX86_BUILTIN_PMULUDQ, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_umulv2siv2di3, 0, IX86_BUILTIN_PMULUDQ128, UNKNOWN, 0 },
 
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv8hi3, 0, IX86_BUILTIN_PSLLWI128, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv4si3, 0, IX86_BUILTIN_PSLLDI128, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashlv2di3, 0, IX86_BUILTIN_PSLLQI128, UNKNOWN, 0 },
-
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv8hi3, 0, IX86_BUILTIN_PSRLWI128, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv4si3, 0, IX86_BUILTIN_PSRLDI128, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_lshrv2di3, 0, IX86_BUILTIN_PSRLQI128, UNKNOWN, 0 },
-
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv8hi3, 0, IX86_BUILTIN_PSRAWI128, UNKNOWN, 0 },
-  { OPTION_MASK_ISA_SSE2, CODE_FOR_ashrv4si3, 0, IX86_BUILTIN_PSRADI128, UNKNOWN, 0 },
-
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmaddwd, 0, IX86_BUILTIN_PMADDWD128, UNKNOWN, 0 },
 
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_cvtsi2sd, 0, IX86_BUILTIN_CVTSI2SD, UNKNOWN, 0 },
@@ -18212,6 +18185,7 @@ static const struct builtin_description 
 
 static const struct builtin_description bdesc_1arg[] =
 {
+  /* SSE */
   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, CODE_FOR_mmx_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_movmskps, 0, IX86_BUILTIN_MOVMSKPS, UNKNOWN, 0 },
 
@@ -18228,6 +18202,7 @@ static const struct builtin_description 
   { OPTION_MASK_ISA_SSE, CODE_FOR_sse_cvttss2si, 0, IX86_BUILTIN_CVTTSS2SI, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_64BIT, CODE_FOR_sse_cvttss2siq, 0, IX86_BUILTIN_CVTTSS2SI64, UNKNOWN, 0 },
 
+  /* SSE2 */
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, UNKNOWN, 0 },
   { OPTION_MASK_ISA_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, UNKNOWN, 0 },
 
@@ -18585,6 +18560,8 @@ ix86_init_mmx_sse_builtins (void)
 
   tree V16QI_type_node = build_vector_type_for_mode (char_type_node, V16QImode);
   tree V2SI_type_node = build_vector_type_for_mode (intSI_type_node, V2SImode);
+  tree V1DI_type_node
+    = build_vector_type_for_mode (long_long_integer_type_node, V1DImode);
   tree V2SF_type_node = build_vector_type_for_mode (float_type_node, V2SFmode);
   tree V2DI_type_node
     = build_vector_type_for_mode (long_long_integer_type_node, V2DImode);
@@ -18649,14 +18626,13 @@ ix86_init_mmx_sse_builtins (void)
   tree v4hi_ftype_v4hi_int
     = build_function_type_list (V4HI_type_node,
 				V4HI_type_node, integer_type_node, NULL_TREE);
-  tree v4hi_ftype_v4hi_di
-    = build_function_type_list (V4HI_type_node,
-				V4HI_type_node, long_long_unsigned_type_node,
-				NULL_TREE);
-  tree v2si_ftype_v2si_di
+  tree v2si_ftype_v2si_int
     = build_function_type_list (V2SI_type_node,
-				V2SI_type_node, long_long_unsigned_type_node,
-				NULL_TREE);
+				V2SI_type_node, integer_type_node, NULL_TREE);
+  tree v1di_ftype_v1di_int
+    = build_function_type_list (V1DI_type_node,
+				V1DI_type_node, integer_type_node, NULL_TREE);
+
   tree void_ftype_void
     = build_function_type (void_type_node, void_list_node);
   tree void_ftype_unsigned
@@ -18723,10 +18699,9 @@ ix86_init_mmx_sse_builtins (void)
   tree v2si_ftype_v2si_v2si
     = build_function_type_list (V2SI_type_node,
 				V2SI_type_node, V2SI_type_node, NULL_TREE);
-  tree di_ftype_di_di
-    = build_function_type_list (long_long_unsigned_type_node,
-				long_long_unsigned_type_node,
-				long_long_unsigned_type_node, NULL_TREE);
+  tree v1di_ftype_v1di_v1di
+    = build_function_type_list (V1DI_type_node,
+				V1DI_type_node, V1DI_type_node, NULL_TREE);
 
   tree di_ftype_di_di_int
     = build_function_type_list (long_long_unsigned_type_node,
@@ -19182,8 +19157,8 @@ ix86_init_mmx_sse_builtins (void)
 	case V2SImode:
 	  type = v2si_ftype_v2si_v2si;
 	  break;
-	case DImode:
-	  type = di_ftype_di_di;
+	case V1DImode:
+	  type = v1di_ftype_v1di_v1di;
 	  break;
 
 	default:
@@ -19275,16 +19250,25 @@ ix86_init_mmx_sse_builtins (void)
 
   /* Add the remaining MMX insns with somewhat more complicated types.  */
   def_builtin (OPTION_MASK_ISA_MMX, "__builtin_ia32_emms", void_ftype_void, IX86_BUILTIN_EMMS);
-  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSLLW);
-  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_di, IX86_BUILTIN_PSLLD);
-  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllq", di_ftype_di_di, IX86_BUILTIN_PSLLQ);
-
-  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRLW);
-  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_di, IX86_BUILTIN_PSRLD);
-  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlq", di_ftype_di_di, IX86_BUILTIN_PSRLQ);
 
-  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_di, IX86_BUILTIN_PSRAW);
-  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_di, IX86_BUILTIN_PSRAD);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllwi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSLLWI);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslldi", v2si_ftype_v2si_int, IX86_BUILTIN_PSLLDI);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllqi", v1di_ftype_v1di_int, IX86_BUILTIN_PSLLQI);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSLLW);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pslld", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSLLD);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psllq", v1di_ftype_v1di_v1di, IX86_BUILTIN_PSLLQ);
+
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlwi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSRLWI);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrldi", v2si_ftype_v2si_int, IX86_BUILTIN_PSRLDI);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlqi", v1di_ftype_v1di_int, IX86_BUILTIN_PSRLQI);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSRLW);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrld", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSRLD);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrlq", v1di_ftype_v1di_v1di, IX86_BUILTIN_PSRLQ);
+
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrawi", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSRAWI);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psradi", v2si_ftype_v2si_int, IX86_BUILTIN_PSRADI);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psraw", v4hi_ftype_v4hi_v4hi, IX86_BUILTIN_PSRAW);
+  def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_psrad", v2si_ftype_v2si_v2si, IX86_BUILTIN_PSRAD);
 
   def_builtin_const (OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A, "__builtin_ia32_pshufw", v4hi_ftype_v4hi_int, IX86_BUILTIN_PSHUFW);
   def_builtin_const (OPTION_MASK_ISA_MMX, "__builtin_ia32_pmaddwd", v2si_ftype_v4hi_v4hi, IX86_BUILTIN_PMADDWD);
@@ -20829,6 +20813,39 @@ ix86_expand_builtin (tree exp, rtx targe
       emit_insn (pat);
       return target;
 
+    case IX86_BUILTIN_PSLLW:
+    case IX86_BUILTIN_PSLLWI:
+      icode = CODE_FOR_mmx_ashlv4hi3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSLLD:
+    case IX86_BUILTIN_PSLLDI:
+      icode = CODE_FOR_mmx_ashlv2si3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSLLQ:
+    case IX86_BUILTIN_PSLLQI:
+      icode = CODE_FOR_mmx_ashlv1di3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRAW:
+    case IX86_BUILTIN_PSRAWI:
+      icode = CODE_FOR_mmx_ashrv4hi3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRAD:
+    case IX86_BUILTIN_PSRADI:
+      icode = CODE_FOR_mmx_ashrv2si3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRLW:
+    case IX86_BUILTIN_PSRLWI:
+      icode = CODE_FOR_mmx_lshrv4hi3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRLD:
+    case IX86_BUILTIN_PSRLDI:
+      icode = CODE_FOR_mmx_lshrv2si3;
+      goto do_pshift;
+    case IX86_BUILTIN_PSRLQ:
+    case IX86_BUILTIN_PSRLQI:
+      icode = CODE_FOR_mmx_lshrv1di3;
+      goto do_pshift;
+
     case IX86_BUILTIN_PSLLW128:
     case IX86_BUILTIN_PSLLWI128:
       icode = CODE_FOR_ashlv8hi3;
Index: doc/extend.texi
===================================================================
--- doc/extend.texi	(revision 133009)
+++ doc/extend.texi	(working copy)
@@ -7476,6 +7476,24 @@ v2si __builtin_ia32_punpckldq (v2si, v2s
 v8qi __builtin_ia32_packsswb (v4hi, v4hi)
 v4hi __builtin_ia32_packssdw (v2si, v2si)
 v8qi __builtin_ia32_packuswb (v4hi, v4hi)
+
+v8hi __builtin_ia32_psllw (v4hi, v4hi)
+v4si __builtin_ia32_pslld (v2si, v2si)
+v2di __builtin_ia32_psllq (v1di, v1di)
+v8hi __builtin_ia32_psrlw (v8hi, v2si)
+v4si __builtin_ia32_psrld (v4si, v2si)
+v2di __builtin_ia32_psrlq (v1di, v1di)
+v8hi __builtin_ia32_psraw (v4hi, v4hi)
+v4si __builtin_ia32_psrad (v2si, v2si)
+v8hi __builtin_ia32_psllwi (v4hi, int)
+v4si __builtin_ia32_pslldi (v2si, int)
+v2di __builtin_ia32_psllqi (v1di, int)
+v8hi __builtin_ia32_psrlwi (v4hi, int)
+v4si __builtin_ia32_psrldi (v2si, int)
+v2di __builtin_ia32_psrlqi (v1di, int)
+v8hi __builtin_ia32_psrawi (v4hi, int)
+v4si __builtin_ia32_psradi (v2si, int)
+
 @end smallexample
 
 The following built-in functions are made available either with


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, i386]: Committed: Fix PR target/22152
  2008-03-08  7:01 [PATCH, i386]: Committed: Fix PR target/22152 Uros Bizjak
@ 2008-03-08  9:41 ` Paolo Bonzini
  2008-03-13 17:50   ` Uros Bizjak
  2008-03-08 10:21 ` Richard Guenther
  1 sibling, 1 reply; 5+ messages in thread
From: Paolo Bonzini @ 2008-03-08  9:41 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches

> +v2di __builtin_ia32_psrlqi (v1di, int)

Something wrong in the return types?

Paolo

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, i386]: Committed: Fix PR target/22152
  2008-03-08  7:01 [PATCH, i386]: Committed: Fix PR target/22152 Uros Bizjak
  2008-03-08  9:41 ` Paolo Bonzini
@ 2008-03-08 10:21 ` Richard Guenther
  2008-03-08 12:44   ` Uros Bizjak
  1 sibling, 1 reply; 5+ messages in thread
From: Richard Guenther @ 2008-03-08 10:21 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches

On Fri, Mar 7, 2008 at 7:13 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> Hello!
>
>  Attached patch substantially improves handling of MMX DImode values. It
>  turns out, that in order to reliably separate "native" and MMX DImode
>  values, a new mode is needed. Since we are using vector registers, the
>  natural choice is V1DImode, vector mode with one DImode element. After
>  this change, V1DI mode is considered to be native MMX mode (native MMX
>  modes were successfully separated out from non-vector modes by the
>  patch, committed a couple of weeks ago).
>
>  Patch rewrites all MMX DImode patterns into V1DImode, also updating
>  mmintrin.h for the new/changed builtins on the way. The patch also
>  rewrites MMX shift patterns in the same way as SSE shift patterns were
>  changed some time ago.
>
>  The benefits of the patch can be seen from the original testcase from PR
>  target/22152. For a slightly changed testcase (to avoid uninitialized
>  variables), non-patched gcc creates barely recognizable code:

I realize this may be hard, but with all the many tweaking patches for SSE, MMX,
etc. how do we make sure to not regress in cases we fixed earlier?  So, may I
suggest you try to enter at least "something" into the testsuite?  For example
scan-assembler-not ".L6.*ebp.*.L6" (no stack operations between the .L6 loop
entry and the backedge)?  Maybe a little bit fragile, but at least
some confidence
would be there (and some testcases to eventually manually inspect) that
we won't regress again?

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, i386]: Committed: Fix PR target/22152
  2008-03-08 10:21 ` Richard Guenther
@ 2008-03-08 12:44   ` Uros Bizjak
  0 siblings, 0 replies; 5+ messages in thread
From: Uros Bizjak @ 2008-03-08 12:44 UTC (permalink / raw)
  To: Richard Guenther; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 3433 bytes --]

Richard Guenther wrote:

> I realize this may be hard, but with all the many tweaking patches for SSE, MMX,
> etc. how do we make sure to not regress in cases we fixed earlier?  So, may I
> suggest you try to enter at least "something" into the testsuite?  For example
> scan-assembler-not ".L6.*ebp.*.L6" (no stack operations between the .L6 loop
> entry and the backedge)?  Maybe a little bit fragile, but at least
> some confidence
> would be there (and some testcases to eventually manually inspect) that
> we won't regress again?
>   

Heh, I _did_ say that "... The testcase will be committed in a separate 
commit, as I have to clean it a bit."

Attached to this message, please find a couple of testcases, derived 
from PR 22152:

- pr22152.c is a compile-time test that checks whether a (long long) value 
stays inside an MMX register. Due to the cast to (long long) in 
mmintrin.h, the value was dragged to and from memory (-O2 -m32 -msse2):

.L3:
        movl    (%ebx,%eax,8), %esi
        movl    4(%ebx,%eax,8), %edi
        movl    %esi, -24(%ebp)
        movl    %edi, -20(%ebp)
        movq    -24(%ebp), %mm0
        paddq   (%ecx,%eax,8), %mm0
        addl    $1, %eax
        cmpl    %eax, %edx
        movq    %mm0, -24(%ebp)
        movq    -24(%ebp), %mm0
        ja      .L3

 The situation is now much better:

.L3:
        movq    (%ebx,%eax,8), %mm0
        paddq   (%ecx,%eax,8), %mm0
        addl    $1, %eax
        cmpl    %eax, %edx
        ja      .L3

- sse2-mmx.c
This is a runtime test, based on the large testcase from the PR. The 
test should add two huge numbers together using MMX operations, but 
unfortunately, it doesn't work correctly (its carry-propagation logic is 
fatally flawed). The attached test fixes this logic, so it can be used to 
increase the runtime coverage of SSE2-based MMX operations.

FWIW, the loop from the testcase is now:

.L3:
        movq    (%esi,%eax,8), %mm2     #* a, _a.37
        movq    (%ebx,%eax,8), %mm3     #* b, _b
        movq    %mm2, %mm0      # D.2452, tmp94
        paddq   %mm3, %mm0      # D.2451, tmp94
        movq    %mm2, %mm1      # _a.37, D.2452
        movq    %mm3, %mm4      # _b, D.2451
        paddq   %mm5, %mm0      # carry, tmp94
        psrlq   $1, %mm1        #, D.2452
        movq    %mm0, (%ecx,%eax,8)     # tmp94,* result
        movq    %mm2, %mm0      # _a.37, tmp96
        pxor    %mm3, %mm0      # _b, tmp96
        pand    %mm5, %mm0      # carry, tmp96
        pand    %mm3, %mm2      # _b, _a.37
        por     %mm0, %mm2      # tmp96, _a.37
        psrlq   $1, %mm4        #, D.2451
        pand    %mm6, %mm2      # one.38, _a.37
        paddq   %mm4, %mm1      # D.2451, D.2452
        paddq   %mm2, %mm1      # _a.37, D.2452
        addl    $1, %eax        #, i
        psrlq   $63, %mm1       #, D.2452
        cmpl    %eax, %edx      # i, count
        movq    %mm1, %mm5      # D.2452, carry
        ja      .L3     #,
.L2:

Other than that, previous changes to MMX patterns are covered by 
pr22076.c, pr34256.c. In addition, all vecinit-N.c tests check that no 
MMX register is used in vector initialization code (we had some problems 
with this in the past).

2008-03-08  Uros Bizjak  <ubizjak@gmail.com>

        PR target/22152
        * gcc.target/i386/pr22152.c: New test.
        * gcc.target/i386/sse2-mmx.c: Ditto.


These new tests were checked on x86_64-linux-gnu {,-m32} and are 
committed to mainline.

Uros.


[-- Attachment #2: p.diff.txt --]
[-- Type: text/plain, Size: 2429 bytes --]

Index: gcc.target/i386/sse2-mmx.c
===================================================================
--- gcc.target/i386/sse2-mmx.c	(revision 0)
+++ gcc.target/i386/sse2-mmx.c	(revision 0)
@@ -0,0 +1,75 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -msse2" } */
+
+#include "sse2-check.h"
+
+#include <mmintrin.h>
+
+#define N 4
+
+unsigned long long a[N], b[N], result[N];
+
+unsigned long long check[N] =
+  { 0x101010101010100full,
+    0x1010101010101010ull,
+    0x1010101010101010ull,
+    0x1010101010101010ull };
+
+__m64
+unsigned_add3 (const __m64 * a, const __m64 * b,
+	       __m64 * result, unsigned int count)
+{
+  __m64 _a, _b, one, sum, carry, onesCarry;
+
+  unsigned int i;
+
+  one = _mm_cmpeq_pi8 (_a, _a);
+  one = _mm_sub_si64 (_mm_xor_si64 (one, one), one);
+
+  carry = _mm_xor_si64 (one, one);
+
+  for (i = 0; i < count; i++)
+    {
+      _a = a[i];
+      _b = b[i];
+
+      sum = _mm_add_si64 (_a, _b);
+      sum = _mm_add_si64 (sum, carry);
+
+      result[i] = sum;
+
+      onesCarry = _mm_and_si64 (_mm_xor_si64 (_a, _b), carry);
+      onesCarry = _mm_or_si64 (_mm_and_si64 (_a, _b), onesCarry);
+      onesCarry = _mm_and_si64 (onesCarry, one);
+
+      _a = _mm_srli_si64 (_a, 1);
+      _b = _mm_srli_si64 (_b, 1);
+
+      carry = _mm_add_si64 (_mm_add_si64 (_a, _b), onesCarry);
+      carry = _mm_srli_si64 (carry, 63);
+    }
+
+  _mm_empty ();
+  return carry;
+}
+
+void __attribute__((noinline))
+sse2_test (void)
+{
+  unsigned long long carry;
+  int i;
+
+  /* Really long numbers.  */
+  a[3] = a[2] = a[1] = a[0] = 0xd3d3d3d3d3d3d3d3ull;
+  b[3] = b[2] = b[1] = b[0] = 0x3c3c3c3c3c3c3c3cull;
+
+  carry = (unsigned long long) unsigned_add3
+    ((__m64 *)a, (__m64 *)b, (__m64 *)result, N);
+
+  if (carry != 1)
+    abort ();
+
+  for (i = 0; i < N; i++)
+    if (result [i] != check[i])
+      abort ();
+}
Index: gcc.target/i386/pr22152.c
===================================================================
--- gcc.target/i386/pr22152.c	(revision 0)
+++ gcc.target/i386/pr22152.c	(revision 0)
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+
+#include <mmintrin.h>
+
+__m64
+unsigned_add3 (const __m64 * a, const __m64 * b, unsigned long count)
+{
+  __m64 sum;
+  unsigned int i;
+
+  for (i = 1; i < count; i++)
+    sum = _mm_add_si64 (a[i], b[i]);
+
+  return sum;
+}
+
+/* { dg-final { scan-assembler-times "movq\[ \\t\]+.*%mm" 1 } } */

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH, i386]: Committed: Fix PR target/22152
  2008-03-08  9:41 ` Paolo Bonzini
@ 2008-03-13 17:50   ` Uros Bizjak
  0 siblings, 0 replies; 5+ messages in thread
From: Uros Bizjak @ 2008-03-13 17:50 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 591 bytes --]

Paolo Bonzini wrote:

>> +v2di __builtin_ia32_psrlqi (v1di, int)
>
> Something wrong in the return types?

Fixed by a follow-up documentation patch that brings the documentation in line with reality:

2008-03-13  Uros Bizjak  <ubizjak@gmail.com>

        * doc/extend.texi (X86 Built-in Functions) [__builtin_ia32_psll?,
        __builtin_ia32_psrl?, __builtin_ia32_psra?, __builtin_ia32_psll?i,
        __builtin_ia32_psrl?i, __builtin_ia32_psra?i, 
__builtin_ia32_psll?128,
        __builtin_ia32_psrl?128, __builtin_ia32_psra?128]: Fix the mode of
        input arguments and the mode of return value.

Thanks,
Uros.



[-- Attachment #2: p.diff.txt --]
[-- Type: text/plain, Size: 2525 bytes --]

Index: doc/extend.texi
===================================================================
--- doc/extend.texi	(revision 133170)
+++ doc/extend.texi	(working copy)
@@ -7477,22 +7477,22 @@
 v4hi __builtin_ia32_packssdw (v2si, v2si)
 v8qi __builtin_ia32_packuswb (v4hi, v4hi)
 
-v8hi __builtin_ia32_psllw (v4hi, v4hi)
-v4si __builtin_ia32_pslld (v2si, v2si)
-v2di __builtin_ia32_psllq (v1di, v1di)
-v8hi __builtin_ia32_psrlw (v8hi, v2si)
-v4si __builtin_ia32_psrld (v4si, v2si)
-v2di __builtin_ia32_psrlq (v1di, v1di)
-v8hi __builtin_ia32_psraw (v4hi, v4hi)
-v4si __builtin_ia32_psrad (v2si, v2si)
-v8hi __builtin_ia32_psllwi (v4hi, int)
-v4si __builtin_ia32_pslldi (v2si, int)
-v2di __builtin_ia32_psllqi (v1di, int)
-v8hi __builtin_ia32_psrlwi (v4hi, int)
-v4si __builtin_ia32_psrldi (v2si, int)
-v2di __builtin_ia32_psrlqi (v1di, int)
-v8hi __builtin_ia32_psrawi (v4hi, int)
-v4si __builtin_ia32_psradi (v2si, int)
+v4hi __builtin_ia32_psllw (v4hi, v4hi)
+v2si __builtin_ia32_pslld (v2si, v2si)
+v1di __builtin_ia32_psllq (v1di, v1di)
+v4hi __builtin_ia32_psrlw (v4hi, v4hi)
+v2si __builtin_ia32_psrld (v2si, v2si)
+v1di __builtin_ia32_psrlq (v1di, v1di)
+v4hi __builtin_ia32_psraw (v4hi, v4hi)
+v2si __builtin_ia32_psrad (v2si, v2si)
+v4hi __builtin_ia32_psllwi (v4hi, int)
+v2si __builtin_ia32_pslldi (v2si, int)
+v1di __builtin_ia32_psllqi (v1di, int)
+v4hi __builtin_ia32_psrlwi (v4hi, int)
+v2si __builtin_ia32_psrldi (v2si, int)
+v1di __builtin_ia32_psrlqi (v1di, int)
+v4hi __builtin_ia32_psrawi (v4hi, int)
+v2si __builtin_ia32_psradi (v2si, int)
 
 @end smallexample
 
@@ -7755,14 +7755,14 @@
 void __builtin_ia32_storedqu (char *, v16qi)
 unsigned long long __builtin_ia32_pmuludq (v2si, v2si)
 v2di __builtin_ia32_pmuludq128 (v4si, v4si)
-v8hi __builtin_ia32_psllw128 (v8hi, v2di)
-v4si __builtin_ia32_pslld128 (v4si, v2di)
-v2di __builtin_ia32_psllq128 (v4si, v2di)
-v8hi __builtin_ia32_psrlw128 (v8hi, v2di)
-v4si __builtin_ia32_psrld128 (v4si, v2di)
+v8hi __builtin_ia32_psllw128 (v8hi, v8hi)
+v4si __builtin_ia32_pslld128 (v4si, v4si)
+v2di __builtin_ia32_psllq128 (v2di, v2di)
+v8hi __builtin_ia32_psrlw128 (v8hi, v8hi)
+v4si __builtin_ia32_psrld128 (v4si, v4si)
 v2di __builtin_ia32_psrlq128 (v2di, v2di)
-v8hi __builtin_ia32_psraw128 (v8hi, v2di)
-v4si __builtin_ia32_psrad128 (v4si, v2di)
+v8hi __builtin_ia32_psraw128 (v8hi, v8hi)
+v4si __builtin_ia32_psrad128 (v4si, v4si)
 v2di __builtin_ia32_pslldqi128 (v2di, int)
 v8hi __builtin_ia32_psllwi128 (v8hi, int)
 v4si __builtin_ia32_pslldi128 (v4si, int)

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2008-03-13 17:50 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-03-08  7:01 [PATCH, i386]: Committed: Fix PR target/22152 Uros Bizjak
2008-03-08  9:41 ` Paolo Bonzini
2008-03-13 17:50   ` Uros Bizjak
2008-03-08 10:21 ` Richard Guenther
2008-03-08 12:44   ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).