[x86, 2/n] Replace builtins with vector extensions

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [x86, 2/n] Replace builtins with vector extensions
@ 2014-10-18 12:09 Marc Glisse
  2014-11-04 20:32 ` Marc Glisse
  0 siblings, 1 reply; 3+ messages in thread
From: Marc Glisse @ 2014-10-18 12:09 UTC (permalink / raw)
  To: gcc-patches; +Cc: ubizjak

[-- Attachment #1: Type: TEXT/PLAIN, Size: 863 bytes --]

Hello,

this time, +, - and * for 128-bit integer vectors. I am using an unsigned type so
the compiler knows that we expect wrapping. I don't know why Intel's
description of mullo insists that the multiplication is signed; that only
matters for the high part...

Next parts (waiting for approval for this one) should be:
- same thing with 256 and 512 bit integer vectors
- & | ^ (integer only)

Maybe (or it can wait until the next release):
- < > == abs min max (integer only)


2014-10-20  Marc Glisse  <marc.glisse@inria.fr>

 	* config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
 	typedefs.
 	(_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
 	_mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
 	_mm_mullo_epi16): Use vector extensions instead of builtins.
 	* config/i386/smmintrin.h (_mm_mullo_epi32): Likewise.

-- 
Marc Glisse

[-- Attachment #2: Type: TEXT/PLAIN, Size: 6859 bytes --]

Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h	(revision 216422)
+++ gcc/config/i386/emmintrin.h	(working copy)
@@ -32,23 +32,27 @@
 
 #ifndef __SSE2__
 #pragma GCC push_options
 #pragma GCC target("sse2")
 #define __DISABLE_SSE2__
 #endif /* __SSE2__ */
 
 /* SSE2 */
 typedef double __v2df __attribute__ ((__vector_size__ (16)));
 typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
 typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
 typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
 typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));
 
 /* The Intel API is flexible enough that we must allow aliasing with other
    vector types, and their scalar components.  */
 typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
 typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
 
 /* Create a selector for use with the SHUFPD instruction.  */
 #define _MM_SHUFFLE2(fp1,fp0) \
  (((fp1) << 1) | (fp0))
 
@@ -999,39 +1003,39 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_unpacklo_epi64 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi8 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi16 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi32 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+  return (__m128i) ((__v4su)__A + (__v4su)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_add_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+  return (__m128i) ((__v2du)__A + (__v2du)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epi16 (__m128i __A, __m128i __B)
@@ -1047,39 +1051,39 @@ _mm_adds_epu8 (__m128i __A, __m128i __B)
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_adds_epu16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_epi8 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
+  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_epi16 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
+  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_epi32 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
+  return (__m128i) ((__v4su)__A - (__v4su)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_sub_epi64 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
+  return (__m128i) ((__v2du)__A - (__v2du)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_subs_epi8 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_subs_epi16 (__m128i __A, __m128i __B)
@@ -1107,21 +1111,21 @@ _mm_madd_epi16 (__m128i __A, __m128i __B
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mulhi_epi16 (__m128i __A, __m128i __B)
 {
   return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mullo_epi16 (__m128i __A, __m128i __B)
 {
-  return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
+  return (__m128i) ((__v8hu)__A * (__v8hu)__B);
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_su32 (__m64 __A, __m64 __B)
 {
   return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_epu32 (__m128i __A, __m128i __B)
Index: gcc/config/i386/smmintrin.h
===================================================================
--- gcc/config/i386/smmintrin.h	(revision 216422)
+++ gcc/config/i386/smmintrin.h	(working copy)
@@ -318,21 +318,21 @@ extern __inline __m128i __attribute__((_
 _mm_max_epu32 (__m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
 }
 
 /* Packed integer 32-bit multiplication with truncation of upper
    halves of results.  */
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mullo_epi32 (__m128i __X, __m128i __Y)
 {
-  return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y);
+  return (__m128i) ((__v4su)__X * (__v4su)__Y);
 }
 
 /* Packed integer 32-bit multiplication of 2 pairs of operands
    with two 64-bit results.  */
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_mul_epi32 (__m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
 }
 

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [x86, 2/n] Replace builtins with vector extensions
  2014-10-18 12:09 [x86, 2/n] Replace builtins with vector extensions Marc Glisse
@ 2014-11-04 20:32 ` Marc Glisse
  2014-11-05  8:02   ` Uros Bizjak
  0 siblings, 1 reply; 3+ messages in thread
From: Marc Glisse @ 2014-11-04 20:32 UTC (permalink / raw)
  To: gcc-patches; +Cc: ubizjak

Ping?
https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01808.html

On Sat, 18 Oct 2014, Marc Glisse wrote:

> Hello,
>
> this time, +-* for 128 bit integer vectors. I am using an unsigned type so 
> the compiler knows that we expect wrapping. I don't know why Intel's 
> description of mullo insists that the multiplication is signed, that only 
> matters for the high part...
>
> Next parts (waiting for approval for this one) should be:
> - same thing with 256 and 512 bit integer vectors
> - & | ^ (integer only)
>
> Maybe (or it can wait until the next release):
> - < > == abs min max (integer only)
>
>
> 2014-10-20  Marc Glisse  <marc.glisse@inria.fr>
>
> 	* config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
> 	typedefs.
> 	(_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
> 	_mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
> 	_mm_mullo_epi16): Use vector extensions instead of builtins.
> 	* config/i386/smmintrin.h (_mm_mullo_epi32): Likewise.
>
>

-- 
Marc Glisse

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [x86, 2/n] Replace builtins with vector extensions
  2014-11-04 20:32 ` Marc Glisse
@ 2014-11-05  8:02   ` Uros Bizjak
  0 siblings, 0 replies; 3+ messages in thread
From: Uros Bizjak @ 2014-11-05  8:02 UTC (permalink / raw)
  To: Marc Glisse; +Cc: gcc-patches

On Tue, Nov 4, 2014 at 9:31 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> Ping?

Uh, yes, LGTM.

(I was under the impression that I had already OK'd this relatively
non-controversial patch. The effect of having too many open tasks in
parallel, I guess.)

Thanks,
Uros.

> https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01808.html
>
>
> On Sat, 18 Oct 2014, Marc Glisse wrote:
>
>> Hello,
>>
>> this time, +-* for 128 bit integer vectors. I am using an unsigned type so
>> the compiler knows that we expect wrapping. I don't know why Intel's
>> description of mullo insists that the multiplication is signed, that only
>> matters for the high part...
>>
>> Next parts (waiting for approval for this one) should be:
>> - same thing with 256 and 512 bit integer vectors
>> - & | ^ (integer only)
>>
>> Maybe (or it can wait until the next release):
>> - < > == abs min max (integer only)
>>
>>
>> 2014-10-20  Marc Glisse  <marc.glisse@inria.fr>
>>
>>         * config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
>>         typedefs.
>>         (_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
>>         _mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
>>         _mm_mullo_epi16): Use vector extensions instead of builtins.
>>         * config/i386/smmintrin.h (_mm_mullo_epi32): Likewise.
>>
>>
>
> --
> Marc Glisse

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2014-11-05  8:02 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-18 12:09 [x86, 2/n] Replace builtins with vector extensions Marc Glisse
2014-11-04 20:32 ` Marc Glisse
2014-11-05  8:02   ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).