public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [i386] Replace builtins with vector extensions
@ 2014-04-11 20:10 Marc Glisse
  2014-04-28 11:39 ` Marc Glisse
  0 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2014-04-11 20:10 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: TEXT/PLAIN, Size: 890 bytes --]

Hello,

the previous discussion on the topic was before we added all those #pragma 
target in *mmintrin.h:

http://gcc.gnu.org/ml/gcc-patches/2013-04/msg00374.html

I believe that removes a large part of the arguments against it. Note that 
I only did a few of the more obvious intrinsics, I am waiting to see if 
this patch is accepted before doing more.

Bootstrap+testsuite on x86_64-linux-gnu.

2014-04-11  Marc Glisse  <marc.glisse@inria.fr>

 	* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
 	_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
 	instead of builtins.
 	* config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
 	_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
 	_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64,
 	_mm_loadh_pd, _mm_loadl_pd): Likewise.
 	(_mm_sqrt_sd): Fix comment.

-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 9773 bytes --]

Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h	(revision 209323)
+++ gcc/config/i386/emmintrin.h	(working copy)
@@ -161,40 +161,40 @@ _mm_store_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_pd (double *__P, __m128d __A)
 {
   __builtin_ia32_storeupd (__P, __A);
 }
 
 /* Stores the lower DPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
+  *__P = __A[0];
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsd_f64 (__m128d __A)
 {
-  return __builtin_ia32_vec_ext_v2df (__A, 0);
+  return __A[0];
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_pd (double *__P, __m128d __A)
 {
   _mm_store_sd (__P, __A);
 }
 
 /* Stores the upper DPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeh_pd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
+  *__P = __A[1];
 }
 
 /* Store the lower DPFP value across two words.
    The address must be 16-byte aligned.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store1_pd (double *__P, __m128d __A)
 {
   _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
 }
 
@@ -215,86 +215,86 @@ extern __inline int __attribute__((__gnu
 _mm_cvtsi128_si32 (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
 }
 
 #ifdef __x86_64__
 /* Intel intrinsic.  */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64 (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return __A[0];
 }
 
 /* Microsoft intrinsic.  */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return __A[0];
 }
 #endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+  return __A + __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+  return __A - __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+  return __A * __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+  return __A / __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
-/* Return pair {sqrt (A[0), B[1]}.  */
+/* Return pair {sqrt (B[0]), A[1]}.  */
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
@@ -708,27 +708,27 @@ _mm_store_si128 (__m128i *__P, __m128i _
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_si128 (__m128i *__P, __m128i __B)
 {
   __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_epi64 (__m128i *__P, __m128i __B)
 {
-  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  *(long long *)__P = __B[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movepi64_pi64 (__m128i __B)
 {
-  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  return (__m64) __B[0];
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movpi64_epi64 (__m64 __A)
 {
   return _mm_set_epi64 ((__m64)0LL, __A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_epi64 (__m128i __A)
@@ -915,27 +915,27 @@ _mm_unpackhi_pd (__m128d __A, __m128d __
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+  return __extension__ (__m128d){ __A[0], __B[0] };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+  return __extension__ (__m128d){ __B[0], __A[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movemask_pd (__m128d __A)
 {
   return __builtin_ia32_movmskpd ((__v2df)__A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_packs_epi16 (__m128i __A, __m128i __B)
Index: gcc/config/i386/xmmintrin.h
===================================================================
--- gcc/config/i386/xmmintrin.h	(revision 209323)
+++ gcc/config/i386/xmmintrin.h	(working copy)
@@ -173,39 +173,39 @@ extern __inline __m128 __attribute__((__
 _mm_max_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform the respective operation on the four SPFP values in A and B.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+  return __A / __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_rcp_ps (__m128 __A)
@@ -950,27 +950,27 @@ _mm_set_ps (const float __Z, const float
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 {
   return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
 }
 
 /* Stores the lower SPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  *__P = __A[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtss_f32 (__m128 __A)
 {
-  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  return __A[0];
 }
 
 /* Store four SPFP values.  The address must be 16-byte aligned.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ps (float *__P, __m128 __A)
 {
   *(__v4sf *)__P = (__v4sf)__A;
 }
 
 /* Store four SPFP values.  The address need not be 16-byte aligned.  */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-04-11 20:10 [i386] Replace builtins with vector extensions Marc Glisse
@ 2014-04-28 11:39 ` Marc Glisse
  2014-05-17 13:35   ` Marc Glisse
  0 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2014-04-28 11:39 UTC (permalink / raw)
  To: gcc-patches; +Cc: ubizjak

Ping
http://gcc.gnu.org/ml/gcc-patches/2014-04/msg00590.html

(note that ARM seems to be doing the same thing for their neon 
intrinsics, see Ramana's patch series posted today)

On Fri, 11 Apr 2014, Marc Glisse wrote:

> Hello,
>
> the previous discussion on the topic was before we added all those #pragma 
> target in *mmintrin.h:
>
> http://gcc.gnu.org/ml/gcc-patches/2013-04/msg00374.html
>
> I believe that removes a large part of the arguments against it. Note that I 
> only did a few of the more obvious intrinsics, I am waiting to see if this 
> patch is accepted before doing more.
>
> Bootstrap+testsuite on x86_64-linux-gnu.
>
> 2014-04-11  Marc Glisse  <marc.glisse@inria.fr>
>
> 	* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
> 	_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
> 	instead of builtins.
> 	* config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, 
> _mm_storeh_pd,
> 	_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
> 	_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64,
> 	_mm_loadh_pd, _mm_loadl_pd): Likewise.
> 	(_mm_sqrt_sd): Fix comment.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-04-28 11:39 ` Marc Glisse
@ 2014-05-17 13:35   ` Marc Glisse
  2014-06-28 10:42     ` Marc Glisse
  0 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2014-05-17 13:35 UTC (permalink / raw)
  To: gcc-patches; +Cc: rth

Ping

On Mon, 28 Apr 2014, Marc Glisse wrote:

> Ping
> http://gcc.gnu.org/ml/gcc-patches/2014-04/msg00590.html
>
> (note that ARM seems to be doing the same thing for their neon intrinsics, 
> see Ramana's patch series posted today)
>
> On Fri, 11 Apr 2014, Marc Glisse wrote:
>
>> Hello,
>> 
>> the previous discussion on the topic was before we added all those #pragma 
>> target in *mmintrin.h:
>> 
>> http://gcc.gnu.org/ml/gcc-patches/2013-04/msg00374.html
>> 
>> I believe that removes a large part of the arguments against it. Note that 
>> I only did a few of the more obvious intrinsics, I am waiting to see if 
>> this patch is accepted before doing more.
>> 
>> Bootstrap+testsuite on x86_64-linux-gnu.
>> 
>> 2014-04-11  Marc Glisse  <marc.glisse@inria.fr>
>>
>> 	* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
>> 	_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
>> 	instead of builtins.
>> 	* config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, 
>> _mm_storeh_pd,
>> 	_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
>> 	_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64,
>> 	_mm_loadh_pd, _mm_loadl_pd): Likewise.
>> 	(_mm_sqrt_sd): Fix comment.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-05-17 13:35   ` Marc Glisse
@ 2014-06-28 10:42     ` Marc Glisse
  2014-06-28 13:37       ` Ulrich Drepper
  2014-07-03 10:17       ` Kirill Yukhin
  0 siblings, 2 replies; 35+ messages in thread
From: Marc Glisse @ 2014-06-28 10:42 UTC (permalink / raw)
  To: gcc-patches

Ping,

nobody has an opinion on this? Or some explanation why I am mistaken to 
believe that #pragma target makes it safer now?

It would enable a number of optimizations, like constant propagation, FMA 
contraction, etc. It would also allow us to remove several builtins.

On Sat, 17 May 2014, Marc Glisse wrote:

> Ping
>
> On Mon, 28 Apr 2014, Marc Glisse wrote:
>
>> Ping
>> http://gcc.gnu.org/ml/gcc-patches/2014-04/msg00590.html
>> 
>> (note that ARM seems to be doing the same thing for their neon intrinsics, 
>> see Ramana's patch series posted today)
>> 
>> On Fri, 11 Apr 2014, Marc Glisse wrote:
>> 
>>> Hello,
>>> 
>>> the previous discussion on the topic was before we added all those #pragma 
>>> target in *mmintrin.h:
>>> 
>>> http://gcc.gnu.org/ml/gcc-patches/2013-04/msg00374.html
>>> 
>>> I believe that removes a large part of the arguments against it. Note that 
>>> I only did a few of the more obvious intrinsics, I am waiting to see if 
>>> this patch is accepted before doing more.
>>> 
>>> Bootstrap+testsuite on x86_64-linux-gnu.
>>> 
>>> 2014-04-11  Marc Glisse  <marc.glisse@inria.fr>
>>>
>>> 	* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
>>> 	_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
>>> 	instead of builtins.
>>> 	* config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
>>> 	_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
>>> 	_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64,
>>> 	_mm_loadh_pd, _mm_loadl_pd): Likewise.
>>> 	(_mm_sqrt_sd): Fix comment.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-06-28 10:42     ` Marc Glisse
@ 2014-06-28 13:37       ` Ulrich Drepper
  2014-06-28 22:53         ` Marc Glisse
  2014-07-03 10:17       ` Kirill Yukhin
  1 sibling, 1 reply; 35+ messages in thread
From: Ulrich Drepper @ 2014-06-28 13:37 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches

On Sat, Jun 28, 2014 at 6:42 AM, Marc Glisse <marc.glisse@inria.fr> wrote:
> Ping,
>
> nobody has an opinion on this? Or some explanation why I am mistaken to
> believe that #pragma target makes it safer now?
>
> It would enable a number of optimizations, like constant propagation, FMA
> contraction, etc. It would also allow us to remove several builtins.

I see no problem with using the array-type access to the registers.

As for replacing the builtins with arithmetic operators: I appreciate
the possibility for optimization.  But is there any chance the calls
could not end up being implemented with a vector instruction?  I think
that would be bad.  The intrinsics should be a way to guarantee that
the programmer can create vector instructions.  Otherwise we might
just not support them.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-06-28 13:37       ` Ulrich Drepper
@ 2014-06-28 22:53         ` Marc Glisse
  2014-06-29  9:41           ` Ulrich Drepper
  0 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2014-06-28 22:53 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: GCC Patches

On Sat, 28 Jun 2014, Ulrich Drepper wrote:

> On Sat, Jun 28, 2014 at 6:42 AM, Marc Glisse <marc.glisse@inria.fr> wrote:
>> Ping,
>>
>> nobody has an opinion on this? Or some explanation why I am mistaken to
>> believe that #pragma target makes it safer now?
>>
>> It would enable a number of optimizations, like constant propagation, FMA
>> contraction, etc. It would also allow us to remove several builtins.
>
> I see no problem with using the array-type access to the registers.
>
> As for replacing the builtins with arithmetic operators: I appreciate
> the possibility for optimization.  But is there any chance the calls
> could not end up being implemented with a vector instruction?  I think
> that would be bad.  The intrinsics should be a way to guarantee that
> the programmer can create vector instructions.  Otherwise we might
> just not support them.

There is always a risk, but then even with builtins I think there was a 
small risk that an RTL optimization would mess things up. It is indeed 
higher if we expose the operation to the optimizers earlier, but it would 
be a bug if an "optimization" replaced a vector operation by something 
worse. Also, I am only proposing to handle the most trivial operations 
this way, not more complicated ones (like v[0]+=s) where we would be 
likely to fail generating the right instruction. And the pragma should 
ensure that the function will always be compiled in a mode where the 
vector instruction is available.

ARM did the same and I don't think I have seen a bug reporting a 
regression about it (I haven't really looked though).

Thanks,

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-06-28 22:53         ` Marc Glisse
@ 2014-06-29  9:41           ` Ulrich Drepper
  2014-06-29 11:06             ` Marc Glisse
  0 siblings, 1 reply; 35+ messages in thread
From: Ulrich Drepper @ 2014-06-29  9:41 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches

On Sat, Jun 28, 2014 at 6:53 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> There is always a risk, but then even with builtins I think there was a
> small risk that an RTL optimization would mess things up. It is indeed
> higher if we expose the operation to the optimizers earlier, but it would be
> a bug if an "optimization" replaced a vector operation by something worse.
> Also, I am only proposing to handle the most trivial operations this way,
> not more complicated ones (like v[0]+=s) where we would be likely to fail
> generating the right instruction. And the pragma should ensure that the
> function will always be compiled in a mode where the vector instruction is
> available.
>
> ARM did the same and I don't think I have seen a bug reporting a regression
> about it (I haven't really looked though).

I think the Arm definitions come from a different angle.  It's new,
there is no assumed semantics.  For the x86 intrinsics Intel defines
that _mm_xxx() generates one of a given opcodes if there is a match.
If I want to generate a specific code sequence I use the intrinsics.
Otherwise I could already today use the vector type semantics myself.

Don't get me wrong, I like the idea to have the optimization of the
intrinsics happening.  But perhaps not unconditionally or at least not
without preventing them.

I know this will look ugly, but how about a macro
__GCC_X86_HONOR_INTRINSICS to enable the current code and have by
default your proposed use of the vector arithmetic in place?  This
wouldn't allow removing support for the built-ins but it would also
open the door to some more risky optimizations to be enabled by
default.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-06-29  9:41           ` Ulrich Drepper
@ 2014-06-29 11:06             ` Marc Glisse
  0 siblings, 0 replies; 35+ messages in thread
From: Marc Glisse @ 2014-06-29 11:06 UTC (permalink / raw)
  To: Ulrich Drepper; +Cc: GCC Patches

On Sun, 29 Jun 2014, Ulrich Drepper wrote:

> I think the Arm definitions come from a different angle.  It's new,
> there is no assumed semantics.

Is it that new? I thought it was implemented based on a rather precise 
specification by ARM. Again, I don't really know arm.

> For the x86 intrinsics Intel defines
> that _mm_xxx() generates one of a given opcodes if there is a match.
> If I want to generate a specific code sequence I use the intrinsics.

We already sometimes generate a different instruction than the name of the 
instrinsic suggests, or combine consecutive intrinsics into something 
else. I use inline asm when I want a specific code sequence.

> Otherwise I could already today use the vector type semantics myself.

Well, the main reasons I use the intrinsics are:
1) the code compiles with visual studio
2) use the esoteric instructions (anything without a trivial mapping in C)

> Don't get me wrong, I like the idea to have the optimization of the
> intrinsics happening.  But perhaps not unconditionally or at least not
> without preventing them.
>
> I know this will look ugly, but how about a macro
> __GCC_X86_HONOR_INTRINSICS to enable the current code and have by
> default your proposed use of the vector arithmetic in place?  This
> wouldn't allow removing support for the built-ins but it would also
> open the door to some more risky optimizations to be enabled by
> default.

That's a pretty big drawback. Instead of simplifying the implementation, 
it makes it more complicated. We also have to document the macro, update 
the testsuite so it tests the intrinsics in both modes, etc.

I understand the concern, and I would probably implement 
__GCC_X86_HONOR_INTRINSICS (though the testsuite part scares me as I have 
so little understanding of how it works, so I may need help), but I'd like 
to make sure first that the simpler approach is not acceptable, possibly 
with strong constraints on which operations are ok (_mm_load[hl]_pd could 
be removed from the patch for instance).

As another comparison, clang's version of *intrin.h uses the vector 
extensions much more than I am proposing.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-06-28 10:42     ` Marc Glisse
  2014-06-28 13:37       ` Ulrich Drepper
@ 2014-07-03 10:17       ` Kirill Yukhin
  2014-07-04 19:12         ` Marc Glisse
  1 sibling, 1 reply; 35+ messages in thread
From: Kirill Yukhin @ 2014-07-03 10:17 UTC (permalink / raw)
  To: Marc Glisse; +Cc: gcc-patches

Hello Marc,
On 28 Jun 12:42, Marc Glisse wrote:
> It would enable a number of optimizations, like constant
> propagation, FMA contraction, etc. It would also allow us to remove
> several builtins.
This should be main motivation for replacing built-ins.
But this approach IMHO should only be used for `obvious' cases only.
I mean: + - / * and friends.
Think that this shouldn't apply for shuffles, broadcasts.
But we have to define border between `obvious' and rest intrinsics.

On the over hand, updated in such a way intrinsic
may actually generate different instruction then intended (e.g. FMA case).

For ICC this is generally OK to generate different instructions, only
semantics should be obeyed.

--
Thanks, K

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-07-03 10:17       ` Kirill Yukhin
@ 2014-07-04 19:12         ` Marc Glisse
  2014-07-08 11:14           ` Kirill Yukhin
  0 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2014-07-04 19:12 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: gcc-patches

On Thu, 3 Jul 2014, Kirill Yukhin wrote:

> Hello Marc,
> On 28 Jun 12:42, Marc Glisse wrote:
>> It would enable a number of optimizations, like constant
>> propagation, FMA contraction, etc. It would also allow us to remove
>> several builtins.
> This should be main motivation for replacing built-ins.
> But this approach IMHO should only be used for `obvious' cases only.
> I mean: + - / * and friends.
> Think that this shouldn't apply for shuffles, broadcasts.
> But we have to define border between `obvious' and rest intrinsics.

We don't have a syntax in the front-end for broadcasts anyway, but are you 
sure about shuffles? __builtin_shuffle directly translates to 
VEC_PERM_EXPR, on which we are careful to avoid optimizations like 
combining 2 shuffles unless the result is the identity. And expanding 
shuffles that can be done in a single instruction works well.

But I am happy not doing them yet. To be very specific, could you list 
which intrinsics you would like to remove from the posted patch?

> On the over hand, updated in such a way intrinsic
> may actually generate different instruction then intended (e.g. FMA case).

It is the same with scalars, we have -ffp-contract for that.

> For ICC this is generally OK to generate different instructions, only
> semantics should be obeyed.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-07-04 19:12         ` Marc Glisse
@ 2014-07-08 11:14           ` Kirill Yukhin
  2014-07-08 11:17             ` Jakub Jelinek
  2014-07-26 17:35             ` Marc Glisse
  0 siblings, 2 replies; 35+ messages in thread
From: Kirill Yukhin @ 2014-07-08 11:14 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches, Uros Bizjak

Hello Marc.
On 04 Jul 21:11, Marc Glisse wrote:
> On Thu, 3 Jul 2014, Kirill Yukhin wrote:
> like combining 2 shuffles unless the result is the identity. And
> expanding shuffles that can be done in a single instruction works
> well.
> 
> But I am happy not doing them yet. To be very specific, could you
> list which intrinsics you would like to remove from the posted
> patch?
I am not a x86 maintainer, however while such a replacements produce
correct semantics and probably enable optimizations, I support your patch.

Probably you could try such your approach on AVX2, AVX-512 whose intrinsics
are well covered by tests?

> >On the over hand, updated in such a way intrinsic
> >may actually generate different instruction then intended (e.g. FMA case).
> 
> It is the same with scalars, we have -ffp-contract for that.
Agreed.

--
Thanks, K

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-07-08 11:14           ` Kirill Yukhin
@ 2014-07-08 11:17             ` Jakub Jelinek
  2014-07-08 16:02               ` Mike Stump
  2014-07-26 17:35             ` Marc Glisse
  1 sibling, 1 reply; 35+ messages in thread
From: Jakub Jelinek @ 2014-07-08 11:17 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: Marc Glisse, GCC Patches, Uros Bizjak

On Tue, Jul 08, 2014 at 03:14:04PM +0400, Kirill Yukhin wrote:
> > >On the over hand, updated in such a way intrinsic
> > >may actually generate different instruction then intended (e.g. FMA case).
> > 
> > It is the same with scalars, we have -ffp-contract for that.
> Agreed.

I don't think we actually always guarantee using the particular instructions
for the intrinsics even when they are implemented using builtins, at least
if they don't use UNSPECs, e.g. if combiner or peephole2 manage to combine
something into some other insn, we'll happily do that.

	Jakub

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-07-08 11:17             ` Jakub Jelinek
@ 2014-07-08 16:02               ` Mike Stump
  0 siblings, 0 replies; 35+ messages in thread
From: Mike Stump @ 2014-07-08 16:02 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Kirill Yukhin, Marc Glisse, GCC Patches, Uros Bizjak

On Jul 8, 2014, at 4:17 AM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Tue, Jul 08, 2014 at 03:14:04PM +0400, Kirill Yukhin wrote:
>>>> On the over hand, updated in such a way intrinsic
>>>> may actually generate different instruction then intended (e.g. FMA case).
>>> 
>>> It is the same with scalars, we have -ffp-contract for that.
>> Agreed.
> 
> I don't think we actually always guarantee using the particular instructions
> for the intrinsics even when they are implemented using builtins, at least
> if they don't use UNSPECs, e.g. if combiner or peephole2 manage to combine
> something into some other insn, we'll happily do that.

In a testcase, one is free to hide the inputs and the output from the optimizer using standard tricks and take one step closer to having a 1-1 mapping.  Of course, wether or not the port even offers a 1-1 mapping for any particular builtin is completely dependent upon the port.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-07-08 11:14           ` Kirill Yukhin
  2014-07-08 11:17             ` Jakub Jelinek
@ 2014-07-26 17:35             ` Marc Glisse
  2014-07-29 11:07               ` Kirill Yukhin
  2014-10-09 10:35               ` Marc Glisse
  1 sibling, 2 replies; 35+ messages in thread
From: Marc Glisse @ 2014-07-26 17:35 UTC (permalink / raw)
  To: Kirill Yukhin; +Cc: GCC Patches, Uros Bizjak

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1628 bytes --]

On Tue, 8 Jul 2014, Kirill Yukhin wrote:

> Hello Marc.
> On 04 Jul 21:11, Marc Glisse wrote:
>> On Thu, 3 Jul 2014, Kirill Yukhin wrote:
>> like combining 2 shuffles unless the result is the identity. And
>> expanding shuffles that can be done in a single instruction works
>> well.
>>
>> But I am happy not doing them yet. To be very specific, could you
>> list which intrinsics you would like to remove from the posted
>> patch?
> I am not a x86 maintainer, however while such a replacements produce
> correct semantics and probably enable optimizations, I support your patch.
>
> Probably you could try such your approach on AVX2, AVX-512 whose intrinsics
> are well covered by tests?

I did some AVX and AVX512F intrinsics, and it still passes the testsuite 
(on my old pre-AVX x86_64-linux-gnu).


2014-07-26  Marc Glisse  <marc.glisse@inria.fr>

 	* config/i386/xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps,
 	_mm_div_ps, _mm_store_ss, _mm_cvtss_f32): Use vector extensions
 	instead of builtins.
 	* config/i386/avxintrin.h (_mm256_add_pd, _mm256_add_ps,
 	_mm256_div_pd, _mm256_div_ps, _mm256_mul_pd, _mm256_mul_ps,
 	_mm256_sub_pd, _mm256_sub_ps): Likewise.
 	* config/i386/avx512fintrin.h (_mm512_add_pd, _mm512_add_ps,
 	_mm512_sub_pd, _mm512_sub_ps, _mm512_mul_pd, _mm512_mul_ps,
 	_mm512_div_pd, _mm512_div_ps): Likewise.
 	* config/i386/emmintrin.h (_mm_store_sd, _mm_cvtsd_f64, _mm_storeh_pd,
 	_mm_cvtsi128_si64, _mm_cvtsi128_si64x, _mm_add_pd, _mm_sub_pd,
 	_mm_mul_pd, _mm_div_pd, _mm_storel_epi64, _mm_movepi64_pi64,
 	_mm_loadh_pd, _mm_loadl_pd): Likewise.
 	(_mm_sqrt_sd): Fix comment.


-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 21495 bytes --]

Index: gcc/config/i386/avx512fintrin.h
===================================================================
--- gcc/config/i386/avx512fintrin.h	(revision 213083)
+++ gcc/config/i386/avx512fintrin.h	(working copy)
@@ -10598,26 +10598,21 @@ _mm512_maskz_sqrt_ps (__mmask16 __U, __m
 						 (__v16sf)
 						 _mm512_setzero_ps (),
 						 (__mmask16) __U,
 						 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_add_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
-						 (__v8df) __B,
-						 (__v8df)
-						 _mm512_undefined_pd (),
-						 (__mmask8) -1,
-						 _MM_FROUND_CUR_DIRECTION);
+  return __A + __B;
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
 						 (__v8df) __B,
 						 (__v8df) __W,
 						 (__mmask8) __U,
@@ -10633,26 +10628,21 @@ _mm512_maskz_add_pd (__mmask8 __U, __m51
 						 (__v8df)
 						 _mm512_setzero_pd (),
 						 (__mmask8) __U,
 						 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_add_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
-						(__v16sf) __B,
-						(__v16sf)
-						_mm512_undefined_ps (),
-						(__mmask16) -1,
-						_MM_FROUND_CUR_DIRECTION);
+  return __A + __B;
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
 						(__v16sf) __B,
 						(__v16sf) __W,
 						(__mmask16) __U,
@@ -10668,26 +10658,21 @@ _mm512_maskz_add_ps (__mmask16 __U, __m5
 						(__v16sf)
 						_mm512_setzero_ps (),
 						(__mmask16) __U,
 						_MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_sub_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
-						 (__v8df) __B,
-						 (__v8df)
-						 _mm512_undefined_pd (),
-						 (__mmask8) -1,
-						 _MM_FROUND_CUR_DIRECTION);
+  return __A - __B;
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
 						 (__v8df) __B,
 						 (__v8df) __W,
 						 (__mmask8) __U,
@@ -10703,26 +10688,21 @@ _mm512_maskz_sub_pd (__mmask8 __U, __m51
 						 (__v8df)
 						 _mm512_setzero_pd (),
 						 (__mmask8) __U,
 						 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_sub_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
-						(__v16sf) __B,
-						(__v16sf)
-						_mm512_undefined_ps (),
-						(__mmask16) -1,
-						_MM_FROUND_CUR_DIRECTION);
+  return __A - __B;
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
 						(__v16sf) __B,
 						(__v16sf) __W,
 						(__mmask16) __U,
@@ -10738,26 +10718,21 @@ _mm512_maskz_sub_ps (__mmask16 __U, __m5
 						(__v16sf)
 						_mm512_setzero_ps (),
 						(__mmask16) __U,
 						_MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mul_pd (__m512d __A, __m512d __B)
 {
-  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
-						 (__v8df) __B,
-						 (__v8df)
-						 _mm512_undefined_pd (),
-						 (__mmask8) -1,
-						 _MM_FROUND_CUR_DIRECTION);
+  return __A * __B;
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
 {
   return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
 						 (__v8df) __B,
 						 (__v8df) __W,
 						 (__mmask8) __U,
@@ -10773,26 +10748,21 @@ _mm512_maskz_mul_pd (__mmask8 __U, __m51
 						 (__v8df)
 						 _mm512_setzero_pd (),
 						 (__mmask8) __U,
 						 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mul_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
-						(__v16sf) __B,
-						(__v16sf)
-						_mm512_undefined_ps (),
-						(__mmask16) -1,
-						_MM_FROUND_CUR_DIRECTION);
+  return __A * __B;
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
 						(__v16sf) __B,
 						(__v16sf) __W,
 						(__mmask16) __U,
@@ -10808,26 +10778,21 @@ _mm512_maskz_mul_ps (__mmask16 __U, __m5
 						(__v16sf)
 						_mm512_setzero_ps (),
 						(__mmask16) __U,
 						_MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_div_pd (__m512d __M, __m512d __V)
 {
-  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
-						 (__v8df) __V,
-						 (__v8df)
-						 _mm512_undefined_pd (),
-						 (__mmask8) -1,
-						 _MM_FROUND_CUR_DIRECTION);
+  return __M / __V;
 }
 
 extern __inline __m512d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V)
 {
   return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
 						 (__v8df) __V,
 						 (__v8df) __W,
 						 (__mmask8) __U,
@@ -10843,26 +10808,21 @@ _mm512_maskz_div_pd (__mmask8 __U, __m51
 						 (__v8df)
 						 _mm512_setzero_pd (),
 						 (__mmask8) __U,
 						 _MM_FROUND_CUR_DIRECTION);
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_div_ps (__m512 __A, __m512 __B)
 {
-  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
-						(__v16sf) __B,
-						(__v16sf)
-						_mm512_undefined_ps (),
-						(__mmask16) -1,
-						_MM_FROUND_CUR_DIRECTION);
+  return __A / __B;
 }
 
 extern __inline __m512
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
 {
   return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
 						(__v16sf) __B,
 						(__v16sf) __W,
 						(__mmask16) __U,
Index: gcc/config/i386/avxintrin.h
===================================================================
--- gcc/config/i386/avxintrin.h	(revision 213083)
+++ gcc/config/i386/avxintrin.h	(working copy)
@@ -117,27 +117,27 @@ typedef double __m256d __attribute__ ((_
 /* Greater-than-or-equal (ordered, non-signaling)  */
 #define _CMP_GE_OQ	0x1d
 /* Greater-than (ordered, non-signaling)  */
 #define _CMP_GT_OQ	0x1e
 /* True (unordered, signaling)  */
 #define _CMP_TRUE_US	0x1f
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B);
+  return __A + __B;
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_add_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_addsub_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_addsub_ps (__m256 __A, __m256 __B)
@@ -211,27 +211,27 @@ extern __inline __m256 __attribute__((__
 _mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
 {
   return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
 					      (__v8sf)__Y,
 					      (__v8sf)__M);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_div_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B);
+  return __A / __B;
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_div_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B);
+  return __A / __B;
 }
 
 /* Dot product instructions with mask-defined summing and zeroing parts
    of result.  */
 
 #ifdef __OPTIMIZE__
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
 {
   return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
@@ -288,27 +288,27 @@ _mm256_min_pd (__m256d __A, __m256d __B)
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_min_ps (__m256 __A, __m256 __B)
 {
   return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mul_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B);
+  return __A * __B;
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_mul_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_or_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_or_ps (__m256 __A, __m256 __B)
@@ -336,27 +336,27 @@ _mm256_shuffle_ps (__m256 __A, __m256 __
 				      (__v4df)(__m256d)(B), (int)(N)))
 
 #define _mm256_shuffle_ps(A, B, N)					\
   ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),		\
 				      (__v8sf)(__m256)(B), (int)(N)))
 #endif
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_pd (__m256d __A, __m256d __B)
 {
-  return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B);
+  return __A - __B;
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_ps (__m256 __A, __m256 __B)
 {
-  return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_xor_pd (__m256d __A, __m256d __B)
 {
   return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
 }
 
 extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_xor_ps (__m256 __A, __m256 __B)
Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h	(revision 213083)
+++ gcc/config/i386/emmintrin.h	(working copy)
@@ -161,40 +161,40 @@ _mm_store_pd (double *__P, __m128d __A)
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_pd (double *__P, __m128d __A)
 {
   __builtin_ia32_storeupd (__P, __A);
 }
 
 /* Stores the lower DPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_sd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
+  *__P = __A[0];
 }
 
 extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsd_f64 (__m128d __A)
 {
-  return __builtin_ia32_vec_ext_v2df (__A, 0);
+  return __A[0];
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_pd (double *__P, __m128d __A)
 {
   _mm_store_sd (__P, __A);
 }
 
 /* Stores the upper DPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeh_pd (double *__P, __m128d __A)
 {
-  *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
+  *__P = __A[1];
 }
 
 /* Store the lower DPFP value across two words.
    The address must be 16-byte aligned.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store1_pd (double *__P, __m128d __A)
 {
   _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
 }
 
@@ -215,86 +215,86 @@ extern __inline int __attribute__((__gnu
 _mm_cvtsi128_si32 (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
 }
 
 #ifdef __x86_64__
 /* Intel intrinsic.  */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64 (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return __A[0];
 }
 
 /* Microsoft intrinsic.  */
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
-  return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
+  return __A[0];
 }
 #endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+  return __A + __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+  return __A - __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+  return __A * __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+  return __A / __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
-/* Return pair {sqrt (A[0), B[1]}.  */
+/* Return pair {sqrt (B[0]), A[1]}.  */
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
@@ -708,27 +708,27 @@ _mm_store_si128 (__m128i *__P, __m128i _
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storeu_si128 (__m128i *__P, __m128i __B)
 {
   __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
 }
 
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_storel_epi64 (__m128i *__P, __m128i __B)
 {
-  *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  *(long long *)__P = __B[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movepi64_pi64 (__m128i __B)
 {
-  return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
+  return (__m64) __B[0];
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movpi64_epi64 (__m64 __A)
 {
   return _mm_set_epi64 ((__m64)0LL, __A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_move_epi64 (__m128i __A)
@@ -915,27 +915,27 @@ _mm_unpackhi_pd (__m128d __A, __m128d __
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadh_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+  return __extension__ (__m128d){ __A[0], __B[0] };
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_loadl_pd (__m128d __A, double const *__B)
 {
-  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+  return __extension__ (__m128d){ __B[0], __A[1] };
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movemask_pd (__m128d __A)
 {
   return __builtin_ia32_movmskpd ((__v2df)__A);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_packs_epi16 (__m128i __A, __m128i __B)
Index: gcc/config/i386/xmmintrin.h
===================================================================
--- gcc/config/i386/xmmintrin.h	(revision 213083)
+++ gcc/config/i386/xmmintrin.h	(working copy)
@@ -173,39 +173,39 @@ extern __inline __m128 __attribute__((__
 _mm_max_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform the respective operation on the four SPFP values in A and B.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+  return __A / __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_rcp_ps (__m128 __A)
@@ -950,27 +950,27 @@ _mm_set_ps (const float __Z, const float
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_setr_ps (float __Z, float __Y, float __X, float __W)
 {
   return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
 }
 
 /* Stores the lower SPFP value.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ss (float *__P, __m128 __A)
 {
-  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  *__P = __A[0];
 }
 
 extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtss_f32 (__m128 __A)
 {
-  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
+  return __A[0];
 }
 
 /* Store four SPFP values.  The address must be 16-byte aligned.  */
 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_store_ps (float *__P, __m128 __A)
 {
   *(__v4sf *)__P = (__v4sf)__A;
 }
 
 /* Store four SPFP values.  The address need not be 16-byte aligned.  */

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-07-26 17:35             ` Marc Glisse
@ 2014-07-29 11:07               ` Kirill Yukhin
  2014-10-09 10:35               ` Marc Glisse
  1 sibling, 0 replies; 35+ messages in thread
From: Kirill Yukhin @ 2014-07-29 11:07 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches, Uros Bizjak

Hello Marc,
On 26 Jul 19:34, Marc Glisse wrote:
> I did some AVX and AVX512F intrinsics, and it still passes the
> testsuite (on my old pre-AVX x86_64-linux-gnu).

I've performed testing of your patch using functional simulator of
AVX*. And see no regressions as well.

--
Thanks, K

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-07-26 17:35             ` Marc Glisse
  2014-07-29 11:07               ` Kirill Yukhin
@ 2014-10-09 10:35               ` Marc Glisse
  2014-10-09 11:40                 ` Uros Bizjak
  2014-10-09 17:02                 ` Olivier Hainque
  1 sibling, 2 replies; 35+ messages in thread
From: Marc Glisse @ 2014-10-09 10:35 UTC (permalink / raw)
  To: GCC Patches; +Cc: Uros Bizjak

Ping https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01812.html

(another part of the discussion is around 
https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02288.html )

Most people who commented seem cautiously in favor. The least favorable 
was Ulrich who suggested to go with it but keep the old behavior 
accessible if the user defines some macro (which imho would lose a large 
part of the simplification benefits of the patch)
https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02328.html

If this is accepted, I will gladly prepare patches removing the unused 
builtins and extending this to a few more operations (integer vectors in 
particular). If this is not the direction we want to go, I'd like to hear 
it clearly so I can move on...

My main doubt with the current patch is whether it is better to write 
simply (both variables have type __m128d):
__A + __B
or, as we will have to do for integers:
(__m128d)((__v2df)__A + (__v2df)__B)

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 10:35               ` Marc Glisse
@ 2014-10-09 11:40                 ` Uros Bizjak
  2014-10-09 12:34                   ` Marc Glisse
  2014-10-09 17:02                 ` Olivier Hainque
  1 sibling, 1 reply; 35+ messages in thread
From: Uros Bizjak @ 2014-10-09 11:40 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches

On Thu, Oct 9, 2014 at 12:33 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> Ping https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01812.html
>
> (another part of the discussion is around
> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02288.html )
>
> Most people who commented seem cautiously in favor. The least favorable was
> Ulrich who suggested to go with it but keep the old behavior accessible if
> the user defines some macro (which imho would lose a large part of the
> simplification benefits of the patch)
> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02328.html
>
> If this is accepted, I will gladly prepare patches removing the unused
> builtins and extending this to a few more operations (integer vectors in
> particular). If this is not the direction we want to go, I'd like to hear it
> clearly so I can move on...

Well, I'm undecided.

The current approach is proven to work OK, there is no bugs reported
in this area and the performance is apparently OK. There should be
clear benefits in order to change something that "ain't broken", and
at least some proof that we won't regress in this area with the new
approach.

On the other hand, if the new approach opens new optimization
opportunities (without regression!), I'm in favor of it, including the
fact that new code won't produce equivalent assembly - as long as
functionality of the optimized asm stays the same (obviously, I'd
say).

Please also note that this is quite big project. There are plenty of
intrinsics and I for one don't want another partial transition ...

TL/DR: If there are benefits, no regressions and you think you'll
finish the transition, let's go for it.

Uros.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 11:40                 ` Uros Bizjak
@ 2014-10-09 12:34                   ` Marc Glisse
  2014-10-09 13:25                     ` Uros Bizjak
  0 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2014-10-09 12:34 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches

On Thu, 9 Oct 2014, Uros Bizjak wrote:

> On Thu, Oct 9, 2014 at 12:33 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>> Ping https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01812.html
>>
>> (another part of the discussion is around
>> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02288.html )
>>
>> Most people who commented seem cautiously in favor. The least favorable was
>> Ulrich who suggested to go with it but keep the old behavior accessible if
>> the user defines some macro (which imho would lose a large part of the
>> simplification benefits of the patch)
>> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02328.html
>>
>> If this is accepted, I will gladly prepare patches removing the unused
>> builtins and extending this to a few more operations (integer vectors in
>> particular). If this is not the direction we want to go, I'd like to hear it
>> clearly so I can move on...
>
> Well, I'm undecided.

First, thanks for answering, it helps me a lot to know what others think.

> The current approach is proven to work OK, there is no bugs reported
> in this area and the performance is apparently OK. There should be
> clear benefits in order to change something that "ain't broken", and
> at least some proof that we won't regress in this area with the new
> approach.

There are quite a few enhancement PRs asking for more performance, but 
indeed no (or very few) complaints about correctness or about gcc turning 
their code into something worse than what they wrote, which I completely 
agree weighs more.

> On the other hand, if the new approach opens new optimization
> opportunities (without regression!), I'm in favor of it, including the
> fact that new code won't produce equivalent assembly - as long as
> functionality of the optimized asm stays the same (obviously, I'd
> say).
>
> Please also note that this is quite big project. There are plenty of
> intrinsics and I for one don't want another partial transition ...

That might be an issue : this transition is partial by nature. Many 
intrinsics cannot (easily) be expressed in GIMPLE, and among those that 
can be represented, we only want to change those for which we are 
confident that we will not regress the quality of the code. From the 
reactions, I would assume that we want to be quite conservative at the 
beginning, and maybe we can reconsider some other intrinsics later.

The best I can offer is consistency: if addition of v2df is changed, 
addition of v4df is changed as well (and say any +-*/ of float/double 
vectors of any supported size). Another block would be +-*/% for integer 
vectors. And construction / access (most construction is already 
builtin-free). And remove the unused builtins in the same patch that makes 
them unused. If you don't like those blocks, I can write one mega-patch 
that does all these, if we roughly agree on the list beforehand, so it 
goes in all at once.

Would that be good enough?

> TL/DR: If there are benefits, no regressions and you think you'll
> finish the transition, let's go for it.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 12:34                   ` Marc Glisse
@ 2014-10-09 13:25                     ` Uros Bizjak
  2014-10-09 15:14                       ` Kirill Yukhin
  2014-10-09 15:35                       ` H.J. Lu
  0 siblings, 2 replies; 35+ messages in thread
From: Uros Bizjak @ 2014-10-09 13:25 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches

On Thu, Oct 9, 2014 at 2:28 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Thu, 9 Oct 2014, Uros Bizjak wrote:
>
>> On Thu, Oct 9, 2014 at 12:33 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>>>
>>> Ping https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01812.html
>>>
>>> (another part of the discussion is around
>>> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02288.html )
>>>
>>> Most people who commented seem cautiously in favor. The least favorable
>>> was
>>> Ulrich who suggested to go with it but keep the old behavior accessible
>>> if
>>> the user defines some macro (which imho would lose a large part of the
>>> simplification benefits of the patch)
>>> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02328.html
>>>
>>> If this is accepted, I will gladly prepare patches removing the unused
>>> builtins and extending this to a few more operations (integer vectors in
>>> particular). If this is not the direction we want to go, I'd like to hear
>>> it
>>> clearly so I can move on...
>>
>>
>> Well, I'm undecided.
>
>
> First, thanks for answering, it helps me a lot to know what others think.
>
>> The current approach is proven to work OK, there is no bugs reported
>> in this area and the performance is apparently OK. There should be
>> clear benefits in order to change something that "ain't broken", and
>> at least some proof that we won't regress in this area with the new
>> approach.
>
>
> There are quite a few enhancement PRs asking for more performance, but
> indeed no (or very few) complaints about correctness or about gcc turning
> their code into something worse than what they wrote, which I completely
> agree weighs more.
>
>> On the other hand, if the new approach opens new optimization
>> opportunities (without regression!), I'm in favor of it, including the
>> fact that new code won't produce equivalent assembly - as long as
>> functionality of the optimized asm stays the same (obviously, I'd
>> say).
>>
>> Please also note that this is quite big project. There are plenty of
>> intrinsics and I for one don't want another partial transition ...
>
>
> That might be an issue : this transition is partial by nature. Many
> intrinsics cannot (easily) be expressed in GIMPLE, and among those that can
> be represented, we only want to change those for which we are confident that
> we will not regress the quality of the code. From the reactions, I would
> assume that we want to be quite conservative at the beginning, and maybe we
> can reconsider some other intrinsics later.
>
> The best I can offer is consistency: if addition of v2df is changed,
> addition of v4df is changed as well (and say any +-*/ of float/double
> vectors of any supported size). Another block would be +-*/% for integer
> vectors. And construction / access (most construction is already
> builtin-free). And remove the unused builtins in the same patch that makes
> them unused. If you don't like those blocks, I can write one mega-patch that
> does all these, if we roughly agree on the list beforehand, so it goes in
> all at once.
>
> Would that be good enough?

OK, let's go in the proposed way, more detailed:

- we begin with +-*/ of float/double vectors. IMO, this would result
in a relatively small and easily reviewable patch to iron out the
details of the approach. Alternatively, we can begin with floats only.
- commit the patch and wait for the sky to fall down.
- we play a bit with the compiler to check generated code and corner
cases (some kind of Q/A) and wait if someone finds a problem (say, a
couple of weeks).
- if there are no problems, continue with integer builtins following
the established approach, otherwise we revert everything and go back
to the drawing board.
- repeat the procedure for other builtins.

I propose to wait a couple of days for possible comments before we get
the ball rolling.

Uros.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 13:25                     ` Uros Bizjak
@ 2014-10-09 15:14                       ` Kirill Yukhin
  2014-10-09 15:35                       ` H.J. Lu
  1 sibling, 0 replies; 35+ messages in thread
From: Kirill Yukhin @ 2014-10-09 15:14 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Marc Glisse, GCC Patches

Hello folks,
On 09 Oct 14:57, Uros Bizjak wrote:
> On Thu, Oct 9, 2014 at 2:28 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> > On Thu, 9 Oct 2014, Uros Bizjak wrote:
> OK, let's go in the proposed way, more detailed:
> 
> - we begin with +-*/ of float/double vectors. IMO, this would result
> in a relatively small and easily reviewable patch to iron out the
> details of the approach. Alternatively, we can begin with floats only.
> - commit the patch and wait for the sky to fall down.
> - we play a bit with the compiler to check generated code and corner
> cases (some kind of Q/A) and wait if someone finds a problem (say, a
> couple of weeks).
> - if there are no problems, continue with integer builtins following
> the established approach, otherwise we revert everything and go back
> to the drawing board.
> - repeat the procedure for other builtins.
> 
> I propose to wait a couple of days for possible comments before we get
> the ball rolling.
Let me repeat, I think this is good idea to do.
I just wanted to kindly ask you wait for about 1-2ww before checking-in
this things.
I hope in that time AVX-512VL,BW,DQ will hit trunk completely 
and *lots* more intrinsics will be added (I think intrinsics is
subject of ~[85/n] patch).

--
Thanks, K

> 
> Uros.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 13:25                     ` Uros Bizjak
  2014-10-09 15:14                       ` Kirill Yukhin
@ 2014-10-09 15:35                       ` H.J. Lu
  1 sibling, 0 replies; 35+ messages in thread
From: H.J. Lu @ 2014-10-09 15:35 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Marc Glisse, GCC Patches

On Thu, Oct 9, 2014 at 5:57 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Thu, Oct 9, 2014 at 2:28 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>> On Thu, 9 Oct 2014, Uros Bizjak wrote:
>>
>>> On Thu, Oct 9, 2014 at 12:33 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>>>>
>>>> Ping https://gcc.gnu.org/ml/gcc-patches/2014-07/msg01812.html
>>>>
>>>> (another part of the discussion is around
>>>> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02288.html )
>>>>
>>>> Most people who commented seem cautiously in favor. The least favorable
>>>> was
>>>> Ulrich who suggested to go with it but keep the old behavior accessible
>>>> if
>>>> the user defines some macro (which imho would lose a large part of the
>>>> simplification benefits of the patch)
>>>> https://gcc.gnu.org/ml/gcc-patches/2014-06/msg02328.html
>>>>
>>>> If this is accepted, I will gladly prepare patches removing the unused
>>>> builtins and extending this to a few more operations (integer vectors in
>>>> particular). If this is not the direction we want to go, I'd like to hear
>>>> it
>>>> clearly so I can move on...
>>>
>>>
>>> Well, I'm undecided.
>>
>>
>> First, thanks for answering, it helps me a lot to know what others think.
>>
>>> The current approach is proven to work OK, there is no bugs reported
>>> in this area and the performance is apparently OK. There should be
>>> clear benefits in order to change something that "ain't broken", and
>>> at least some proof that we won't regress in this area with the new
>>> approach.
>>
>>
>> There are quite a few enhancement PRs asking for more performance, but
>> indeed no (or very few) complaints about correctness or about gcc turning
>> their code into something worse than what they wrote, which I completely
>> agree weighs more.
>>
>>> On the other hand, if the new approach opens new optimization
>>> opportunities (without regression!), I'm in favor of it, including the
>>> fact that new code won't produce equivalent assembly - as long as
>>> functionality of the optimized asm stays the same (obviously, I'd
>>> say).
>>>
>>> Please also note that this is quite big project. There are plenty of
>>> intrinsics and I for one don't want another partial transition ...
>>
>>
>> That might be an issue : this transition is partial by nature. Many
>> intrinsics cannot (easily) be expressed in GIMPLE, and among those that can
>> be represented, we only want to change those for which we are confident that
>> we will not regress the quality of the code. From the reactions, I would
>> assume that we want to be quite conservative at the beginning, and maybe we
>> can reconsider some other intrinsics later.
>>
>> The best I can offer is consistency: if addition of v2df is changed,
>> addition of v4df is changed as well (and say any +-*/ of float/double
>> vectors of any supported size). Another block would be +-*/% for integer
>> vectors. And construction / access (most construction is already
>> builtin-free). And remove the unused builtins in the same patch that makes
>> them unused. If you don't like those blocks, I can write one mega-patch that
>> does all these, if we roughly agree on the list beforehand, so it goes in
>> all at once.
>>
>> Would that be good enough?
>
> OK, let's go in the proposed way, more detailed:
>
> - we begin with +-*/ of float/double vectors. IMO, this would result
> in a relatively small and easily reviewable patch to iron out the
> details of the approach. Alternatively, we can begin with floats only.
> - commit the patch and wait for the sky to fall down.
> - we play a bit with the compiler to check generated code and corner
> cases (some kind of Q/A) and wait if someone finds a problem (say, a
> couple of weeks).
> - if there are no problems, continue with integer builtins following
> the established approach, otherwise we revert everything and go back
> to the drawing board.
> - repeat the procedure for other builtins.
>
> I propose to wait a couple of days for possible comments before we get
> the ball rolling.
>

We should also include some testcases to show code improvement
for each change.

Thanks.


-- 
H.J.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 10:35               ` Marc Glisse
  2014-10-09 11:40                 ` Uros Bizjak
@ 2014-10-09 17:02                 ` Olivier Hainque
  2014-10-09 17:56                   ` Marc Glisse
  1 sibling, 1 reply; 35+ messages in thread
From: Olivier Hainque @ 2014-10-09 17:02 UTC (permalink / raw)
  To: Marc Glisse; +Cc: GCC Patches, Uros Bizjak

Hello Marc,

On Oct 9, 2014, at 12:33 PM, Marc Glisse wrote:
> If this is accepted, I will gladly prepare patches removing the unused builtins and extending this to a few more operations (integer vectors in particular). If this is not the direction we want to go, I'd like to hear it clearly so I can move on...

As we discussed offlist, removing all the builtins would be problematic for
Ada as they are the only medium allowing flexible access to vector instructions
(aside autovectorization) for users.

Today, the model is very simple: people who want to build on top of vector
operations just bind to the builtins they need and expose higher level
interfaces if they like, provided proper type definitions (see g-sse.ads for
example).

We could provide an Ada version of the standard APIs for example, as we do for
Altivec on powerpc, and we have offered this capability out of customer requests.

Without the builtins, we'd need to define syntax + semantics for vector
operations in the language. While this is an interesting perspective, we
don't have that today and this would be a fair amount of non-trivial work
I'm afraid, not something we can take on just like that.

Note that this doesn't mean that we need all the builtins to remain there.
Just at least one of those providing access to a given machine insn at some
point. We can implement various sorts of always_inline wrappers to perform
type conversions as needed, and builtins are understood as very low level
devices so changes in the interface aren't an issue. The real issue would be
if access to a given insn becomes impossible out of the removals.

Thanks!

With Kind Regards,

Olivier


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 17:02                 ` Olivier Hainque
@ 2014-10-09 17:56                   ` Marc Glisse
  2014-10-09 18:01                     ` Uros Bizjak
  0 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2014-10-09 17:56 UTC (permalink / raw)
  To: Olivier Hainque; +Cc: GCC Patches, Uros Bizjak

On Thu, 9 Oct 2014, Olivier Hainque wrote:

> On Oct 9, 2014, at 12:33 PM, Marc Glisse wrote:
>> If this is accepted, I will gladly prepare patches removing the unused builtins and extending this to a few more operations (integer vectors in particular). If this is not the direction we want to go, I'd like to hear it clearly so I can move on...
>
> As we discussed offlist, removing all the builtins would be problematic for
> Ada as they are the only medium allowing flexible access to vector instructions
> (aside autovectorization) for users.
>
> Today, the model is very simple: people who want to build on top of vector
> operations just bind to the builtins they need and expose higher level
> interfaces if they like, provided proper type definitions (see g-sse.ads for
> example).

It is sad that this prevents us from removing the builtins, but I agree 
that we can't just drop ada+sse users like that. Well, less work for me if 
I don't have to remove the builtins, and my main motivation is 
optimization, even if I tried to sell the clean up to convince people.

Uros, is it still ok if I change the intrinsics without removing the 
builtins? (with testcases for HJ and not before Kirill says it is ok)

> Without the builtins, we'd need to define syntax + semantics for vector
> operations in the language. While this is an interesting perspective, we
> don't have that today and this would be a fair amount of non-trivial work
> I'm afraid, not something we can take on just like that.

I think it is an interesting possibility to keep in mind (maybe in a few 
years?). Basic support in the C front-end is surprisingly simple (C++ 
templates are a different story), and doesn't need to be duplicated for 
sse/altivec/neon... only the "weird" operations really need builtins.

Thanks for posting this,

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 17:56                   ` Marc Glisse
@ 2014-10-09 18:01                     ` Uros Bizjak
  2014-10-09 18:05                       ` Marc Glisse
  0 siblings, 1 reply; 35+ messages in thread
From: Uros Bizjak @ 2014-10-09 18:01 UTC (permalink / raw)
  To: Marc Glisse; +Cc: Olivier Hainque, GCC Patches

On Thu, Oct 9, 2014 at 7:46 PM, Marc Glisse <marc.glisse@inria.fr> wrote:

>>> If this is accepted, I will gladly prepare patches removing the unused
>>> builtins and extending this to a few more operations (integer vectors in
>>> particular). If this is not the direction we want to go, I'd like to hear it
>>> clearly so I can move on...
>>
>>
>> As we discussed offlist, removing all the builtins would be problematic
>> for
>> Ada as they are the only medium allowing flexible access to vector
>> instructions
>> (aside autovectorization) for users.
>>
>> Today, the model is very simple: people who want to build on top of vector
>> operations just bind to the builtins they need and expose higher level
>> interfaces if they like, provided proper type definitions (see g-sse.ads
>> for
>> example).
>
>
> It is sad that this prevents us from removing the builtins, but I agree that
> we can't just drop ada+sse users like that. Well, less work for me if I
> don't have to remove the builtins, and my main motivation is optimization,
> even if I tried to sell the clean up to convince people.
>
> Uros, is it still ok if I change the intrinsics without removing the
> builtins? (with testcases for HJ and not before Kirill says it is ok)

Given that this will be a substantial work and considering the request
from Kirill, what do you think about separate development branch until
AVXn stuff is finished? This will give a couple of weeks and a
playground to finalize the approach for the conversion. Maybe even ada
can be tested there to not regress with the compatibility stuff.

Uros.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2014-10-09 18:01                     ` Uros Bizjak
@ 2014-10-09 18:05                       ` Marc Glisse
  0 siblings, 0 replies; 35+ messages in thread
From: Marc Glisse @ 2014-10-09 18:05 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: Olivier Hainque, GCC Patches

On Thu, 9 Oct 2014, Uros Bizjak wrote:

> Given that this will be a substantial work and considering the request
> from Kirill, what do you think about separate development branch until
> AVXn stuff is finished? This will give a couple of weeks and a
> playground to finalize the approach for the conversion. Maybe even ada
> can be tested there to not regress with the compatibility stuff.

No problem. We can also wait until next stage1 if you believe the release 
of gcc-5 is too close.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-07 18:03 Marc Glisse
  2013-04-07 19:38 ` Marc Glisse
  2013-04-08 22:44 ` Marc Glisse
@ 2014-02-05 21:51 ` Marc Glisse
  2 siblings, 0 replies; 35+ messages in thread
From: Marc Glisse @ 2014-02-05 21:51 UTC (permalink / raw)
  To: gcc-patches

Hello,

I was wondering if the new #pragma target in *mmintrin.h make this 
approach more acceptable for 4.10?

http://gcc.gnu.org/ml/gcc-patches/2013-04/msg00374.html

On Sun, 7 Apr 2013, Marc Glisse wrote:

> Hello,
>
> the attached patch is very incomplete (it passes bootstrap+testsuite on 
> x86_64-linux-gnu), but it raises a number of questions that I'd like to 
> settle before continuing.
>
> * Is there any chance of a patch in this direction being accepted?
>
> * May I remove the builtins (from i386.c and the doc) when they become 
> unused?
>
> * Do we want to keep the casts even when they don't seem strictly necessary? 
> For instance for _mm_add_ps, we can write:
> 	return __A + __B;
> or:
> 	return (__m128) ((__v4sf)__A + (__v4sf)__B);
> Note that for _mm_add_epi8 for instance we do need the casts.
>
> * For integer operations like _mm_add_epi16 I should probably use the 
> unsigned typedefs to make it clear overflow is well defined? (the patch still 
> has the signed version)
>
> * Any better name than __v4su for the unsigned version of __v4si?
>
> * Other comments?
>
>
> 2013-04-07  Marc Glisse  <marc.glisse@inria.fr>
>
> 	* emmintrin.h (__v2du, __v4su, __v8hu): New typedefs.
> 	(_mm_add_pd, _mm_sub_pd, _mm_mul_pd, _mm_div_pd,
> 	_mm_cmpeq_pd, _mm_cmplt_pd, _mm_cmple_pd, _mm_cmpgt_pd, _mm_cmpge_pd,
> 	_mm_cmpneq_pd, _mm_add_epi8, _mm_add_epi16, _mm_add_epi32,
> 	_mm_add_epi64, _mm_slli_epi16, _mm_slli_epi32, _mm_slli_epi64,
> 	_mm_srai_epi16, _mm_srai_epi32, _mm_srli_epi16, _mm_srli_epi32,
> 	_mm_srli_epi64): Replace builtins with vector extensions.
> 	* xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps, _mm_div_ps,
> 	_mm_cmpeq_ps, _mm_cmplt_ps, _mm_cmple_ps, _mm_cmpgt_ps, _mm_cmpge_ps,
> 	_mm_cmpneq_ps): Likewise.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-10  7:46       ` Marc Glisse
@ 2013-04-10  9:46         ` Richard Biener
  0 siblings, 0 replies; 35+ messages in thread
From: Richard Biener @ 2013-04-10  9:46 UTC (permalink / raw)
  To: Marc Glisse; +Cc: gcc-patches

On Tue, Apr 9, 2013 at 9:15 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Tue, 9 Apr 2013, Marc Glisse wrote:
>
>> On Tue, 9 Apr 2013, Richard Biener wrote:
>>
>>> I seem to remember discussion in the PR(s) that the intrinsics should
>>> (and do for other compilers) expand to the desired instructions even when
>>> the corresponding instruction set is disabled.
>>
>>
>> emmintrin.h starts with:
>> #ifndef __SSE2__
>> # error "SSE2 instruction set not enabled"
>
>
> Oh, re-reading your post, it looks like you mean we should change the
> current behavior, not just avoid regressions...
>
> My opinion on the intrinsics is that they are the portable way to use
> vectors on x86, but they are not equivalent to asm (which people should use
> if they don't want the compiler looking at their code). Knowingly generating
> SSE code with -mno-sse is not very appealing.
>
> However, the arguments in:
> http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56298
> make sense. I guess I'll forget about this patch.

Note that to fully support emitting intrinsics "correctly" even without -msse
x86 specific builtins need to be used and they need to conditionally expand
to either UNSPECs (if the required instriuction set / modes are not available)
or regular RTL (where they can be folded to generic GIMPLE earlier
then as well).
A complication is register allocation which would need to understand how to
allocate registers for the UNSPECs - even if some of the modes would not
be "available".  So it's indeed a mess ...

That said, folding of the x86 builtins to GIMPLE looks like a more
viable approach
that would not interfere too much with any possible route we would go here.
As suggested previously please add a new target hook with the same interface
as fold_stmt in case you want to work on this.

Thanks,
Richard.

> --
> Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-09 11:10     ` Marc Glisse
  2013-04-09 11:25       ` Jakub Jelinek
@ 2013-04-10  7:46       ` Marc Glisse
  2013-04-10  9:46         ` Richard Biener
  1 sibling, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2013-04-10  7:46 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches

On Tue, 9 Apr 2013, Marc Glisse wrote:

> On Tue, 9 Apr 2013, Richard Biener wrote:
>
>> I seem to remember discussion in the PR(s) that the intrinsics should
>> (and do for other compilers) expand to the desired instructions even when
>> the corresponding instruction set is disabled.
>
> emmintrin.h starts with:
> #ifndef __SSE2__
> # error "SSE2 instruction set not enabled"

Oh, re-reading your post, it looks like you mean we should change the 
current behavior, not just avoid regressions...

My opinion on the intrinsics is that they are the portable way to use 
vectors on x86, but they are not equivalent to asm (which people should 
use if they don't want the compiler looking at their code). Knowingly 
generating SSE code with -mno-sse is not very appealing.

However, the arguments in:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56298
make sense. I guess I'll forget about this patch.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-09 11:25       ` Jakub Jelinek
@ 2013-04-09 12:33         ` Marc Glisse
  0 siblings, 0 replies; 35+ messages in thread
From: Marc Glisse @ 2013-04-09 12:33 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Biener, gcc-patches

On Tue, 9 Apr 2013, Jakub Jelinek wrote:

> On Tue, Apr 09, 2013 at 11:08:38AM +0200, Marc Glisse wrote:
>> The *intrin.h files already use __extension__ to create vectors, like:
>>   return __extension__ (__m128d){ __F, 0.0 };
>> but even when I remove it it does not warn with -std=c89 -pedantic.
>
> Even with -Wsystem-headers ?

Oups ;-)

Ok, removing the existing __extension__ causes warnings (note that it can 
easily be worked around by initializing a variable instead of this 
compound literal, so it isn't vectors that pedantic complains about), but 
my changes do not warn.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-09 11:10     ` Marc Glisse
@ 2013-04-09 11:25       ` Jakub Jelinek
  2013-04-09 12:33         ` Marc Glisse
  2013-04-10  7:46       ` Marc Glisse
  1 sibling, 1 reply; 35+ messages in thread
From: Jakub Jelinek @ 2013-04-09 11:25 UTC (permalink / raw)
  To: Marc Glisse; +Cc: Richard Biener, gcc-patches

On Tue, Apr 09, 2013 at 11:08:38AM +0200, Marc Glisse wrote:
> The *intrin.h files already use __extension__ to create vectors, like:
>   return __extension__ (__m128d){ __F, 0.0 };
> but even when I remove it it does not warn with -std=c89 -pedantic.

Even with -Wsystem-headers ?

	Jakub

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-09  9:39   ` Richard Biener
@ 2013-04-09 11:10     ` Marc Glisse
  2013-04-09 11:25       ` Jakub Jelinek
  2013-04-10  7:46       ` Marc Glisse
  0 siblings, 2 replies; 35+ messages in thread
From: Marc Glisse @ 2013-04-09 11:10 UTC (permalink / raw)
  To: Richard Biener; +Cc: gcc-patches

On Tue, 9 Apr 2013, Richard Biener wrote:

> On Mon, Apr 8, 2013 at 10:47 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
>> On Sun, 7 Apr 2013, Marc Glisse wrote:
>>
>>>  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
>>> __artificial__))
>>>  _mm_slli_epi16 (__m128i __A, int __B)
>>>  {
>>> -  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
>>> +  return (__m128i) ((__v8hi)__A << __B);
>>>  }
>>
>>
>> Actually, I believe I have to keep using the builtins for shifts, because
>> the intrinsics have well defined behavior for large __B whereas << and >>
>> don't.
>
> I seem to remember discussion in the PR(s) that the intrinsics should
> (and do for other compilers) expand to the desired instructions even when
> the corresponding instruction set is disabled.

emmintrin.h starts with:
#ifndef __SSE2__
# error "SSE2 instruction set not enabled"

The closest thing I can think of is issues with -mfpmath=387, but that 
shouldn't matter for full vector ops.

> Using vector extension
> makes that harder to achieve.  Other than that I am all for using the
> vector extensions, but I think you need carefully wrapped __extension__
> markers so that with -std=c89 -pedantic you still can compile programs
> using the intrinsics?

The *intrin.h files already use __extension__ to create vectors, like:
   return __extension__ (__m128d){ __F, 0.0 };
but even when I remove it it does not warn with -std=c89 -pedantic.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-08 22:44 ` Marc Glisse
@ 2013-04-09  9:39   ` Richard Biener
  2013-04-09 11:10     ` Marc Glisse
  0 siblings, 1 reply; 35+ messages in thread
From: Richard Biener @ 2013-04-09  9:39 UTC (permalink / raw)
  To: Marc Glisse; +Cc: gcc-patches

On Mon, Apr 8, 2013 at 10:47 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> On Sun, 7 Apr 2013, Marc Glisse wrote:
>
>>  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
>> __artificial__))
>>  _mm_slli_epi16 (__m128i __A, int __B)
>>  {
>> -  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
>> +  return (__m128i) ((__v8hi)__A << __B);
>>  }
>
>
> Actually, I believe I have to keep using the builtins for shifts, because
> the intrinsics have well defined behavior for large __B whereas << and >>
> don't.

I seem to remember discussion in the PR(s) that the intrinsics should
(and do for other compilers) expand to the desired instructions even when
the corresponding instruction set is disabled.  Using vector extension
makes that harder to achieve.  Other than that I am all for using the
vector extensions, but I think you need carefully wrapped __extension__
markers so that with -std=c89 -pedantic you still can compile programs
using the intrinsics?

Richard.

> --
> Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-07 18:03 Marc Glisse
  2013-04-07 19:38 ` Marc Glisse
@ 2013-04-08 22:44 ` Marc Glisse
  2013-04-09  9:39   ` Richard Biener
  2014-02-05 21:51 ` Marc Glisse
  2 siblings, 1 reply; 35+ messages in thread
From: Marc Glisse @ 2013-04-08 22:44 UTC (permalink / raw)
  To: gcc-patches

On Sun, 7 Apr 2013, Marc Glisse wrote:

>  extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__,
> __artificial__))
>  _mm_slli_epi16 (__m128i __A, int __B)
>  {
> -  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
> +  return (__m128i) ((__v8hi)__A << __B);
>  }

Actually, I believe I have to keep using the builtins for shifts, because 
the intrinsics have well defined behavior for large __B whereas << and >> 
don't.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [i386] Replace builtins with vector extensions
  2013-04-07 18:03 Marc Glisse
@ 2013-04-07 19:38 ` Marc Glisse
  2013-04-08 22:44 ` Marc Glisse
  2014-02-05 21:51 ` Marc Glisse
  2 siblings, 0 replies; 35+ messages in thread
From: Marc Glisse @ 2013-04-07 19:38 UTC (permalink / raw)
  To: gcc-patches

By the way, the comment in emmintrin.h in front of _mm_sqrt_sd seems 
wrong:

/* Return pair {sqrt (A[0), B[1]}.  */

It should be instead:

/* Return pair {sqrt (B[0]), A[1]}.  */

If you agree I'll fix that independently.

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [i386] Replace builtins with vector extensions
@ 2013-04-07 18:03 Marc Glisse
  2013-04-07 19:38 ` Marc Glisse
                   ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Marc Glisse @ 2013-04-07 18:03 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1508 bytes --]

Hello,

the attached patch is very incomplete (it passes bootstrap+testsuite on 
x86_64-linux-gnu), but it raises a number of questions that I'd like to 
settle before continuing.

* Is there any chance of a patch in this direction being accepted?

* May I remove the builtins (from i386.c and the doc) when they become unused?

* Do we want to keep the casts even when they don't seem strictly 
necessary? For instance for _mm_add_ps, we can write:
 	return __A + __B;
or:
 	return (__m128) ((__v4sf)__A + (__v4sf)__B);
Note that for _mm_add_epi8 for instance we do need the casts.

* For integer operations like _mm_add_epi16 I should probably use the 
unsigned typedefs to make it clear overflow is well defined? (the patch 
still has the signed version)

* Any better name than __v4su for the unsigned version of __v4si?

* Other comments?


2013-04-07  Marc Glisse  <marc.glisse@inria.fr>

 	* emmintrin.h (__v2du, __v4su, __v8hu): New typedefs.
 	(_mm_add_pd, _mm_sub_pd, _mm_mul_pd, _mm_div_pd,
 	_mm_cmpeq_pd, _mm_cmplt_pd, _mm_cmple_pd, _mm_cmpgt_pd, _mm_cmpge_pd,
 	_mm_cmpneq_pd, _mm_add_epi8, _mm_add_epi16, _mm_add_epi32,
 	_mm_add_epi64, _mm_slli_epi16, _mm_slli_epi32, _mm_slli_epi64,
 	_mm_srai_epi16, _mm_srai_epi32, _mm_srli_epi16, _mm_srli_epi32,
 	_mm_srli_epi64): Replace builtins with vector extensions.
 	* xmmintrin.h (_mm_add_ps, _mm_sub_ps, _mm_mul_ps, _mm_div_ps,
 	_mm_cmpeq_ps, _mm_cmplt_ps, _mm_cmple_ps, _mm_cmpgt_ps, _mm_cmpge_ps,
 	_mm_cmpneq_ps): Likewise.


-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 15088 bytes --]

Index: config/i386/xmmintrin.h
===================================================================
--- config/i386/xmmintrin.h	(revision 197549)
+++ config/i386/xmmintrin.h	(working copy)
@@ -147,39 +147,39 @@ extern __inline __m128 __attribute__((__
 _mm_max_ss (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform the respective operation on the four SPFP values in A and B.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
+  return __A + __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
+  return __A - __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
+  return __A * __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
+  return __A / __B;
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_ps (__m128 __A)
 {
   return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_rcp_ps (__m128 __A)
@@ -323,51 +323,51 @@ _mm_cmpunord_ss (__m128 __A, __m128 __B)
   return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
 }
 
 /* Perform a comparison on the four SPFP values of A and B.  For each
    element, if the comparison is true, place a mask of all ones in the
    result, otherwise a mask of zeros.  */
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A == __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmplt_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A < __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmple_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A <= __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpgt_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A > __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpge_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A >= __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpneq_ps (__m128 __A, __m128 __B)
 {
-  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
+  return (__m128) (__A != __B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnlt_ps (__m128 __A, __m128 __B)
 {
   return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
 }
 
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnle_ps (__m128 __A, __m128 __B)
Index: config/i386/emmintrin.h
===================================================================
--- config/i386/emmintrin.h	(revision 197549)
+++ config/i386/emmintrin.h	(working copy)
@@ -30,22 +30,25 @@
 #ifndef __SSE2__
 # error "SSE2 instruction set not enabled"
 #else
 
 /* We need definitions from the SSE header files*/
 #include <xmmintrin.h>
 
 /* SSE2 */
 typedef double __v2df __attribute__ ((__vector_size__ (16)));
 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
 typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
 typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
 typedef char __v16qi __attribute__ ((__vector_size__ (16)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
  (((fp1) << 1) | (fp0))
@@ -219,72 +222,72 @@ _mm_cvtsi128_si64 (__m128i __A)
 extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtsi128_si64x (__m128i __A)
 {
   return __builtin_ia32_vec_ext_v2di ((__v2di)__A, 0);
 }
 #endif
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_addpd ((__v2df)__A, (__v2df)__B);
+  return __A + __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_subpd ((__v2df)__A, (__v2df)__B);
+  return __A - __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_mulpd ((__v2df)__A, (__v2df)__B);
+  return __A * __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_divpd ((__v2df)__A, (__v2df)__B);
+  return __A / __B;
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_div_sd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_pd (__m128d __A)
 {
   return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
 }
 
-/* Return pair {sqrt (A[0), B[1]}.  */
+/* Return pair {sqrt (A[0]), B[1]}.  */
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sqrt_sd (__m128d __A, __m128d __B)
 {
   __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
   return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_pd (__m128d __A, __m128d __B)
 {
@@ -329,51 +332,51 @@ _mm_or_pd (__m128d __A, __m128d __B)
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_xor_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpeq_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A == __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmplt_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A < __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmple_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A <= __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpgt_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A > __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpge_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A >= __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpneq_pd (__m128d __A, __m128d __B)
 {
-  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
+  return (__m128d) (__A != __B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnlt_pd (__m128d __A, __m128d __B)
 {
   return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
 }
 
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cmpnle_pd (__m128d __A, __m128d __B)
@@ -981,39 +984,39 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi8 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+  return (__m128i) ((__v16qi)__A + (__v16qi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi16 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+  return (__m128i) ((__v8hi)__A + (__v8hi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi32 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+  return (__m128i) ((__v4si)__A + (__v4si)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+  return (__m128i) ((__v2di)__A + (__v2di)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epi16 (__m128i __A, __m128i __B)
@@ -1107,45 +1110,45 @@ _mm_mul_su32 (__m64 __A, __m64 __B)
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_epu32 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hi)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4si)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+  return (__m128i) ((__v2di)__A << __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srai_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hi)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srai_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4si)__A >> __B);
 }
 
 #ifdef __OPTIMIZE__
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_si128 (__m128i __A, const int __N)
 {
   return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1156,33 +1159,33 @@ _mm_slli_si128 (__m128i __A, const int _
 #else
 #define _mm_srli_si128(A, N) \
   ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
 #define _mm_slli_si128(A, N) \
   ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
 #endif
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+  return (__m128i) ((__v8hu)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+  return (__m128i) ((__v4su)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+  return (__m128i) ((__v2du)__A >> __B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sll_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sll_epi32 (__m128i __A, __m128i __B)

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2014-10-09 18:04 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-04-11 20:10 [i386] Replace builtins with vector extensions Marc Glisse
2014-04-28 11:39 ` Marc Glisse
2014-05-17 13:35   ` Marc Glisse
2014-06-28 10:42     ` Marc Glisse
2014-06-28 13:37       ` Ulrich Drepper
2014-06-28 22:53         ` Marc Glisse
2014-06-29  9:41           ` Ulrich Drepper
2014-06-29 11:06             ` Marc Glisse
2014-07-03 10:17       ` Kirill Yukhin
2014-07-04 19:12         ` Marc Glisse
2014-07-08 11:14           ` Kirill Yukhin
2014-07-08 11:17             ` Jakub Jelinek
2014-07-08 16:02               ` Mike Stump
2014-07-26 17:35             ` Marc Glisse
2014-07-29 11:07               ` Kirill Yukhin
2014-10-09 10:35               ` Marc Glisse
2014-10-09 11:40                 ` Uros Bizjak
2014-10-09 12:34                   ` Marc Glisse
2014-10-09 13:25                     ` Uros Bizjak
2014-10-09 15:14                       ` Kirill Yukhin
2014-10-09 15:35                       ` H.J. Lu
2014-10-09 17:02                 ` Olivier Hainque
2014-10-09 17:56                   ` Marc Glisse
2014-10-09 18:01                     ` Uros Bizjak
2014-10-09 18:05                       ` Marc Glisse
  -- strict thread matches above, loose matches on Subject: below --
2013-04-07 18:03 Marc Glisse
2013-04-07 19:38 ` Marc Glisse
2013-04-08 22:44 ` Marc Glisse
2013-04-09  9:39   ` Richard Biener
2013-04-09 11:10     ` Marc Glisse
2013-04-09 11:25       ` Jakub Jelinek
2013-04-09 12:33         ` Marc Glisse
2013-04-10  7:46       ` Marc Glisse
2013-04-10  9:46         ` Richard Biener
2014-02-05 21:51 ` Marc Glisse

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).