* [x86, 2/n] Replace builtins with vector extensions
@ 2014-10-18 12:09 Marc Glisse
2014-11-04 20:32 ` Marc Glisse
0 siblings, 1 reply; 3+ messages in thread
From: Marc Glisse @ 2014-10-18 12:09 UTC (permalink / raw)
To: gcc-patches; +Cc: ubizjak
[-- Attachment #1: Type: TEXT/PLAIN, Size: 863 bytes --]
Hello,
this time, +-* for 128 bit integer vectors. I am using an unsigned type so
the compiler knows that we expect wrapping. I don't know why Intel's
description of mullo insists that the multiplication is signed, that only
matters for the high part...
Next parts (waiting for approval for this one) should be:
- same thing with 256 and 512 bit integer vectors
- & | ^ (integer only)
Maybe (or it can wait until the next release):
- < > == abs min max (integer only)
2014-10-20 Marc Glisse <marc.glisse@inria.fr>
* config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
typedefs.
(_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
_mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
_mm_mullo_epi16): Use vector extensions instead of builtins.
* config/i386/smmintrin.h (_mm_mullo_epi32): Likewise.
--
Marc Glisse
[-- Attachment #2: Type: TEXT/PLAIN, Size: 6859 bytes --]
Index: gcc/config/i386/emmintrin.h
===================================================================
--- gcc/config/i386/emmintrin.h (revision 216422)
+++ gcc/config/i386/emmintrin.h (working copy)
@@ -32,23 +32,27 @@
#ifndef __SSE2__
#pragma GCC push_options
#pragma GCC target("sse2")
#define __DISABLE_SSE2__
#endif /* __SSE2__ */
/* SSE2 */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));
/* The Intel API is flexible enough that we must allow aliasing with other
vector types, and their scalar components. */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1,fp0) \
(((fp1) << 1) | (fp0))
@@ -999,39 +1003,39 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qu)__A + (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A + (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4su)__A + (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_paddq128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A + (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi8 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epi16 (__m128i __A, __m128i __B)
@@ -1047,39 +1051,39 @@ _mm_adds_epu8 (__m128i __A, __m128i __B)
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_epu16 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi8 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubb128 ((__v16qi)__A, (__v16qi)__B);
+ return (__m128i) ((__v16qu)__A - (__v16qu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A - (__v8hu)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi32 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubd128 ((__v4si)__A, (__v4si)__B);
+ return (__m128i) ((__v4su)__A - (__v4su)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_epi64 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_psubq128 ((__v2di)__A, (__v2di)__B);
+ return (__m128i) ((__v2du)__A - (__v2du)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi8 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_epi16 (__m128i __A, __m128i __B)
@@ -1107,21 +1111,21 @@ _mm_madd_epi16 (__m128i __A, __m128i __B
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_epi16 (__m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi16 (__m128i __A, __m128i __B)
{
- return (__m128i)__builtin_ia32_pmullw128 ((__v8hi)__A, (__v8hi)__B);
+ return (__m128i) ((__v8hu)__A * (__v8hu)__B);
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_su32 (__m64 __A, __m64 __B)
{
return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
}
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
Index: gcc/config/i386/smmintrin.h
===================================================================
--- gcc/config/i386/smmintrin.h (revision 216422)
+++ gcc/config/i386/smmintrin.h (working copy)
@@ -318,21 +318,21 @@ extern __inline __m128i __attribute__((_
_mm_max_epu32 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
}
/* Packed integer 32-bit multiplication with truncation of upper
halves of results. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_epi32 (__m128i __X, __m128i __Y)
{
- return (__m128i) __builtin_ia32_pmulld128 ((__v4si)__X, (__v4si)__Y);
+ return (__m128i) ((__v4su)__X * (__v4su)__Y);
}
/* Packed integer 32-bit multiplication of 2 pairs of operands
with two 64-bit results. */
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epi32 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
}
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [x86, 2/n] Replace builtins with vector extensions
2014-10-18 12:09 [x86, 2/n] Replace builtins with vector extensions Marc Glisse
@ 2014-11-04 20:32 ` Marc Glisse
2014-11-05 8:02 ` Uros Bizjak
0 siblings, 1 reply; 3+ messages in thread
From: Marc Glisse @ 2014-11-04 20:32 UTC (permalink / raw)
To: gcc-patches; +Cc: ubizjak
Ping?
https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01808.html
On Sat, 18 Oct 2014, Marc Glisse wrote:
> Hello,
>
> this time, +-* for 128 bit integer vectors. I am using an unsigned type so
> the compiler knows that we expect wrapping. I don't know why Intel's
> description of mullo insists that the multiplication is signed, that only
> matters for the high part...
>
> Next parts (waiting for approval for this one) should be:
> - same thing with 256 and 512 bit integer vectors
> - & | ^ (integer only)
>
> Maybe (or it can wait until the next release):
> - < > == abs min max (integer only)
>
>
> 2014-10-20 Marc Glisse <marc.glisse@inria.fr>
>
> * config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
> typedefs.
> (_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
> _mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
> _mm_mullo_epi16): Use vector extensions instead of builtins.
> * config/i386/smmintrin.h (_mm_mullo_epi32): Likewise.
>
>
--
Marc Glisse
^ permalink raw reply [flat|nested] 3+ messages in thread
* Re: [x86, 2/n] Replace builtins with vector extensions
2014-11-04 20:32 ` Marc Glisse
@ 2014-11-05 8:02 ` Uros Bizjak
0 siblings, 0 replies; 3+ messages in thread
From: Uros Bizjak @ 2014-11-05 8:02 UTC (permalink / raw)
To: Marc Glisse; +Cc: gcc-patches
On Tue, Nov 4, 2014 at 9:31 PM, Marc Glisse <marc.glisse@inria.fr> wrote:
> Ping?
Uh, yes, LGTM.
(I was under the impression that I had already OK'd this relatively
non-controversial patch. The effect of having too many open tasks in
parallel, I guess.)
Thanks,
Uros.
> https://gcc.gnu.org/ml/gcc-patches/2014-10/msg01808.html
>
>
> On Sat, 18 Oct 2014, Marc Glisse wrote:
>
>> Hello,
>>
>> this time, +-* for 128 bit integer vectors. I am using an unsigned type so
>> the compiler knows that we expect wrapping. I don't know why Intel's
>> description of mullo insists that the multiplication is signed, that only
>> matters for the high part...
>>
>> Next parts (waiting for approval for this one) should be:
>> - same thing with 256 and 512 bit integer vectors
>> - & | ^ (integer only)
>>
>> Maybe (or it can wait until the next release):
>> - < > == abs min max (integer only)
>>
>>
>> 2014-10-20 Marc Glisse <marc.glisse@inria.fr>
>>
>> * config/i386/emmintrin.h (__v2du, __v4su, __v8hu, __v16qu): New
>> typedefs.
>> (_mm_add_epi8, _mm_add_epi16, _mm_add_epi32, _mm_add_epi64,
>> _mm_sub_epi8, _mm_sub_epi16, _mm_sub_epi32, _mm_sub_epi64,
>> _mm_mullo_epi16): Use vector extensions instead of builtins.
>> * config/i386/smmintrin.h (_mm_mullo_epi32): Likewise.
>>
>>
>
> --
> Marc Glisse
^ permalink raw reply [flat|nested] 3+ messages in thread
end of thread, other threads:[~2014-11-05 8:02 UTC | newest]
Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-18 12:09 [x86, 2/n] Replace builtins with vector extensions Marc Glisse
2014-11-04 20:32 ` Marc Glisse
2014-11-05 8:02 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).