public inbox for gcc-patches@gcc.gnu.org
* [i386] Replace builtins with vector extensions
@ 2014-04-11 20:10 Marc Glisse
  2014-04-28 11:39 ` Marc Glisse
  0 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2014-04-11 20:10 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: TEXT/PLAIN, Size: 890 bytes --]

Hello,

the previous discussion on this topic took place before we added all those
#pragma target directives in *mmintrin.h:

http://gcc.gnu.org/ml/gcc-patches/2013-04/msg00374.html

I believe that removes a large part of the arguments against it. Note that
I have only converted a few of the more obvious intrinsics; I am waiting to
see whether this patch is accepted before doing more.

Bootstrap+testsuite on x86_64-linux-gnu.
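
For readers less familiar with GCC's vector extensions, here is a minimal
standalone sketch (illustrative only; the type and function names are made
up for the example, and it is not part of the patch) of the three idioms
the new intrinsic bodies rely on: element-wise operators, subscripting,
and compound-literal construction.

typedef float v4sf __attribute__ ((__vector_size__ (16)));
typedef double v2df __attribute__ ((__vector_size__ (16)));

static v4sf
add4 (v4sf a, v4sf b)
{
  /* Element-wise addition; GCC can compile this to the same addps the
     builtin produced.  */
  return a + b;
}

static float
first (v4sf a)
{
  /* Subscripting extracts one element, much like
     __builtin_ia32_vec_ext_v4sf.  */
  return a[0];
}

static v2df
pair (double lo, double hi)
{
  /* Element-wise construction, as the new _mm_loadl_pd body does.  */
  return __extension__ (v2df){ lo, hi };
}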

2014-04-11  Marc Glisse  <marc.glisse@inria.fr>

 	* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
 	_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
 	instead of builtins.
 	* config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
 	_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
 	_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64,
 	_mm_loadh_pd, _mm_loadl_pd): Likewise.
 	(_mm_sqrt_sd): Fix comment.

-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 9773 bytes --]

Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h	(revision 209323)
+++ gcc/config/i386/emmintrin.h	(working copy)
@@ -161,40 +161,40 @@ _mm_store_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_pd (double *__P, __m128d __A)
 {
   __builtin_ia32_storeupd (__P, __A);
 }
 
 /* Stores the lower DPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
+  *__P = __A[0];
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsd_f64 (__m128d __A)
 {
-  return __builtin_ia32_vec_ext_v2df (__A, 0);
+  return __A[0];
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_pd (double *__P, __m128d __A)
 {
   _mm_store_sd (__P, __A);
 }
 
 /* Stores the upper DPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeh_pd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
+  *__P = __A[1];
 }
 
 /* Store the lower DPFP value across two words.
    The address must be 16-byte aligned.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store1_pd (double *__P, __m128d __A)
 {
   _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
 }
 
@@ -215,86 +215,86 @@ extern __inline int __attribute__((__gnu
 _mm_cvtsi128_si32 (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
 }
 
 #ifdef __x86_64__
 /* Intel intrinsic.  */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64 (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return __A[0];
 }
 
 /* Microsoft intrinsic.  */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return __A[0];
 }
 #endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+  return __A + __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+  return __A - __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+  return __A * __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+  return __A / __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
-/* Return pair {sqrt (A[0), B[1]}.  */
+/* Return pair {sqrt (B[0]), A[1]}.  */
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
@@ -708,27 +708,27 @@ _mm_store_si128 (__m128i *__P, __m128i _
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_si128 (__m128i *__P, __m128i __B)
 {
   __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_epi64 (__m128i *__P, __m128i __B)
 {
-  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  *(long long *)__P = __B[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movepi64_pi64 (__m128i __B)
 {
-  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  return (__m64) __B[0];
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movpi64_epi64 (__m64 __A)
 {
   return _mm_set_epi64 ((__m64)0LL, __A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_epi64 (__m128i __A)
@@ -915,27 +915,27 @@ _mm_unpackhi_pd (__m128d __A, __m128d __
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+  return __extension__ (__m128d){ __A[0], __B[0] };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+  return __extension__ (__m128d){ __B[0], __A[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movemask_pd (__m128d __A)
 {
   return __builtin_ia32_movmskpd ((__v2df)__A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_packs_epi16 (__m128i __A, __m128i __B)
Index: gcc/config/i386/xmmintrin.h
===================================================================
--- gcc/config/i386/xmmintrin.h	(revision 209323)
+++ gcc/config/i386/xmmintrin.h	(working copy)
@@ -173,39 +173,39 @@ extern __inline __m128 __attribute__((__
 _mm_max_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform the respective operation on the four SPFP values in A and B.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+  return __A / __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_rcp_ps (__m128 __A)
@@ -950,27 +950,27 @@ _mm_set_ps (const float __Z, const float
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 {
   return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
 }
 
 /* Stores the lower SPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  *__P = __A[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtss_f32 (__m128 __A)
 {
-  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  return __A[0];
 }
 
 /* Store four SPFP values.  The address must be 16-byte aligned.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ps (float *__P, __m128 __A)
 {
   *(__v4sf *)__P = (__v4sf)__A;
 }
 
 /* Store four SPFP values.  The address need not be 16-byte aligned.  */

* [i386] Replace builtins with vector extensions
@ 2013-04-07 18:03 Marc Glisse
  2013-04-07 19:38 ` Marc Glisse
                   ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Marc Glisse @ 2013-04-07 18:03 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1508 bytes --]

Hello,

the attached patch is very incomplete (it passes bootstrap+testsuite on 
x86_64-linux-gnu), but it raises a number of questions that I'd like to 
settle before continuing.

* Is there any chance of a patch in this direction being accepted?

* May I remove the builtins (from i386.c and the doc) when they become unused?

* Do we want to keep the casts even when they do not seem strictly
necessary? For _mm_add_ps, for instance, we can write:
 	return __A + __B;
or:
 	return (__m128) ((__v4sf)__A + (__v4sf)__B);
Note that for _mm_add_epi8, for instance, we do need the casts (see the
sketch after this list).

* For integer operations like _mm_add_epi16, should I use the unsigned
typedefs to make it clear that overflow is well defined? (The patch still
has the signed version.)

* Any better name than __v4su for the unsigned version of __v4si?

* Other comments?
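
To make the cast and signedness questions above concrete, here is a hedged
sketch of the two integer cases, assuming the typedefs from the patched
emmintrin.h are in scope. The first function mirrors the _mm_add_epi8 hunk
in the attached patch; the second (including its name) is only a possible
answer to the signedness question, not what the patch currently does.

/* __m128i is a vector of long long, so byte-wise addition needs casts
   to select the element type; this mirrors the patch's _mm_add_epi8.  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
my_add_epi8 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v16qi)__A + (__v16qi)__B);
}

/* Hypothetical unsigned variant of _mm_add_epi16: with the new __v8hu
   typedef, wraparound is well defined at the C level (the attached patch
   still uses the signed __v8hi here).  */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
my_add_epi16 (__m128i __A, __m128i __B)
{
  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}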


2013-04-07  Marc Glisse  <marc.glisse@inria.fr>

 	* emmintrin.h (__v2du, __v4su, __v8hu): New typedefs.
 	(_mm_add_pd, _mm_sub_pd, _mm_mul_pd, _mm_div_pd,
 	_mm_cmpeq_pd, _mm_cmplt_pd, _mm_cmple_pd, _mm_cmpgt_pd, _mm_cmpge_pd,
 	_mm_cmpneq_pd, _mm_add_epi8, _mm_add_epi16, _mm_add_epi32,
 	_mm_add_epi64, _mm_slli_epi16, _mm_slli_epi32, _mm_slli_epi64,
 	_mm_srai_epi16, _mm_srai_epi32, _mm_srli_epi16, _mm_srli_epi32,
 	_mm_srli_epi64): Replace builtins with vector extensions.
 	* xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps, _mm_div_ps,
 	_mm_cmpeq_ps, _mm_cmplt_ps, _mm_cmple_ps, _mm_cmpgt_ps, _mm_cmpge_ps,
 	_mm_cmpneq_ps): Likewise.


-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 15088 bytes --]

Index: config/i386/xmmintrin.h
===================================================================
--- config/i386/xmmintrin.h	(revision 197549)
+++ config/i386/xmmintrin.h	(working copy)
@@ -147,39 +147,39 @@ extern __inline __m128 __attribute__((__
 _mm_max_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform the respective operation on the four SPFP values in A and B.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+  return __A / __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_rcp_ps (__m128 __A)
@@ -323,51 +323,51 @@ _mm_cmpunord_ss (__m128 __A, __m128 __B)
   return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform a comparison on the four SPFP values of A and B.  For each
    element, if the comparison is true, place a mask of all ones in the
    result, otherwise a mask of zeros.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A == __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmplt_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A < __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmple_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A <= __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpgt_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A > __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpge_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A >= __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpneq_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A != __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnle_ps (__m128 __A, __m128 __B)
Index: config/i386/emmintrin.h
===================================================================
--- config/i386/emmintrin.h	(revision 197549)
+++ config/i386/emmintrin.h	(working copy)
@@ -30,22 +30,25 @@
 #ifndef __SSE2__
 # error "SSE2 instruction set not enabled"
 #else
 
 /* We need definitions from the SSE header files*/
 #include <xmmintrin.h>
 
 /* SSE2 */
 typedef double __v2df __attribute__ ((__vector_size__ (16)));
 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
 typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
 typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
 typedef char __v16qi __attribute__ ((__vector_size__ (16)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
  (((fp1) << 1) | (fp0))
@@ -219,72 +222,72 @@ _mm_cvtsi128_si64 (__m128i __A)
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
 }
 #endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+  return __A + __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+  return __A - __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+  return __A * __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+  return __A / __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
-/* Return pair {sqrt (A[0), B[1]}.  */
+/* Return pair {sqrt (A[0]), B[1]}.  */
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
@@ -329,51 +332,51 @@ _mm_or_pd (__m128d __A, __m128d __B)
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_xor_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A == __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmplt_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A < __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmple_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A <= __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpgt_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A > __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpge_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A >= __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpneq_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A != __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnle_pd (__m128d __A, __m128d __B)
@@ -981,39 +984,39 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi8 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+  return (__m128i) ((__v16qi)__A + (__v16qi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi16 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+  return (__m128i) ((__v8hi)__A + (__v8hi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi32 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+  return (__m128i) ((__v4si)__A + (__v4si)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+  return (__m128i) ((__v2di)__A + (__v2di)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epi16 (__m128i __A, __m128i __B)
@@ -1107,45 +1110,45 @@ _mm_mul_su32 (__m64 __A, __m64 __B)
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_epu32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hi)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4si)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+  return (__m128i) ((__v2di)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srai_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hi)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srai_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4si)__A >> __B);
 }
 
 #ifdef __OPTIMIZE__
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_si128 (__m128i __A, const int __N)
 {
   return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1156,33 +1159,33 @@ _mm_slli_si128 (__m128i __A, const int _
 #else
 #define _mm_srli_si128(A, N) \
   ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
 #define _mm_slli_si128(A, N) \
   ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
 #endif
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hu)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4su)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+  return (__m128i) ((__v2du)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sll_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sll_epi32 (__m128i __A, __m128i __B)


Thread overview: 35+ messages
2014-04-11 20:10 [i386] Replace builtins with vector extensions Marc Glisse
2014-04-28 11:39 ` Marc Glisse
2014-05-17 13:35   ` Marc Glisse
2014-06-28 10:42     ` Marc Glisse
2014-06-28 13:37       ` Ulrich Drepper
2014-06-28 22:53         ` Marc Glisse
2014-06-29  9:41           ` Ulrich Drepper
2014-06-29 11:06             ` Marc Glisse
2014-07-03 10:17       ` Kirill Yukhin
2014-07-04 19:12         ` Marc Glisse
2014-07-08 11:14           ` Kirill Yukhin
2014-07-08 11:17             ` Jakub Jelinek
2014-07-08 16:02               ` Mike Stump
2014-07-26 17:35             ` Marc Glisse
2014-07-29 11:07               ` Kirill Yukhin
2014-10-09 10:35               ` Marc Glisse
2014-10-09 11:40                 ` Uros Bizjak
2014-10-09 12:34                   ` Marc Glisse
2014-10-09 13:25                     ` Uros Bizjak
2014-10-09 15:14                       ` Kirill Yukhin
2014-10-09 15:35                       ` H.J. Lu
2014-10-09 17:02                 ` Olivier Hainque
2014-10-09 17:56                   ` Marc Glisse
2014-10-09 18:01                     ` Uros Bizjak
2014-10-09 18:05                       ` Marc Glisse
  -- strict thread matches above, loose matches on Subject: below --
2013-04-07 18:03 Marc Glisse
2013-04-07 19:38 ` Marc Glisse
2013-04-08 22:44 ` Marc Glisse
2013-04-09  9:39   ` Richard Biener
2013-04-09 11:10     ` Marc Glisse
2013-04-09 11:25       ` Jakub Jelinek
2013-04-09 12:33         ` Marc Glisse
2013-04-10  7:46       ` Marc Glisse
2013-04-10  9:46         ` Richard Biener
2014-02-05 21:51 ` Marc Glisse
