* [i386] Mask generation in avx2intrin.h
From: Marc Glisse @ 2017-11-25 10:21 UTC
To: gcc-patches
Hello,
The way full masks are currently generated in avx2intrin.h is
questionable: opaque for the inline functions (the mask comes from a
comparison rather than a visible constant), and weird or outright wrong
for the macros (a floating-point -1.0 rather than an all-ones pattern).
We may eventually want to add code so that the constant all-ones mask
is generated with vxorpd+vcmpeqpd instead of being loaded from memory,
but that looks like something that should be decided globally, not in
each instruction that uses it.
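As a quick illustration of the difference between the three idioms (a
standalone check of my own, not part of the patch), the following
prints the bit pattern each one produces for a single 64-bit lane:
#include <immintrin.h>
#include <stdio.h>
#include <string.h>
int
main (void)
{
  __m128d zero = _mm_setzero_pd ();
  /* Old inline-function idiom: a comparison that is always true.  */
  __m128d m_cmp = _mm_cmpeq_pd (zero, zero);
  /* New idiom: an integer all-ones constant, visible to the compiler.  */
  __m128d m_set1 = _mm_castsi128_pd (_mm_set1_epi64x (-1));
  /* Old macro idiom: this is the double -1.0, not an all-ones mask.  */
  __m128d m_setpd = _mm_set1_pd ((double) (long long) -1);
  unsigned long long a, b, c;
  memcpy (&a, &m_cmp, 8);
  memcpy (&b, &m_set1, 8);
  memcpy (&c, &m_setpd, 8);
  /* Prints: ffffffffffffffff ffffffffffffffff bff0000000000000.
     The last one only happens to work with the gathers because they
     test the sign bit of each mask element.  */
  printf ("%016llx %016llx %016llx\n", a, b, c);
  return 0;
}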
Bootstrapped and regtested on x86_64-pc-linux-gnu (Skylake).
2017-11-27 Marc Glisse <marc.glisse@inria.fr>
PR target/80885
* config/i386/avx2intrin.h (_mm_i32gather_pd): Rewrite mask generation.
(_mm256_i32gather_pd): Likewise.
(_mm_i64gather_pd): Likewise.
(_mm256_i64gather_pd): Likewise.
(_mm_i32gather_ps): Likewise.
(_mm256_i32gather_ps): Likewise.
(_mm_i64gather_ps): Likewise.
(_mm256_i64gather_ps): Likewise.
--
Marc Glisse
[-- Attachment #2: Type: TEXT/x-diff; name=mask.patch, Size: 10984 bytes --]
Index: gcc/config/i386/avx2intrin.h
===================================================================
--- gcc/config/i386/avx2intrin.h (revision 255140)
+++ gcc/config/i386/avx2intrin.h (working copy)
@@ -1241,22 +1241,21 @@ __attribute__ ((__gnu_inline__, __always
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
{
- __v2df __zero = _mm_setzero_pd ();
- __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
+ __v2df __mask = (__v2df)_mm_set1_epi64x (-1);
return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
__base,
(__v4si)__index,
__mask,
__scale);
}
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1267,22 +1266,21 @@ _mm_mask_i32gather_pd (__m128d __src, do
__base,
(__v4si)__index,
(__v2df)__mask,
__scale);
}
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
{
- __v4df __zero = _mm256_setzero_pd ();
- __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
+ __v4df __mask = (__v4df)_mm256_set1_epi64x (-1);
return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
__base,
(__v4si)__index,
__mask,
__scale);
}
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1294,21 +1292,21 @@ _mm256_mask_i32gather_pd (__m256d __src,
(__v4si)__index,
(__v4df)__mask,
__scale);
}
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
{
__v2df __src = _mm_setzero_pd ();
- __v2df __mask = _mm_cmpeq_pd (__src, __src);
+ __v2df __mask = (__v2df)_mm_set1_epi64x (-1);
return (__m128d) __builtin_ia32_gatherdiv2df (__src,
__base,
(__v2di)__index,
__mask,
__scale);
}
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1320,21 +1318,21 @@ _mm_mask_i64gather_pd (__m128d __src, do
(__v2di)__index,
(__v2df)__mask,
__scale);
}
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
{
__v4df __src = _mm256_setzero_pd ();
- __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
+ __v4df __mask = (__v4df)_mm256_set1_epi64x (-1);
return (__m256d) __builtin_ia32_gatherdiv4df (__src,
__base,
(__v4di)__index,
__mask,
__scale);
}
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1346,21 +1344,21 @@ _mm256_mask_i64gather_pd (__m256d __src,
(__v4di)__index,
(__v4df)__mask,
__scale);
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
{
__v4sf __src = _mm_setzero_ps ();
- __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+ __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
return (__m128) __builtin_ia32_gathersiv4sf (__src,
__base,
(__v4si)__index,
__mask,
__scale);
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1372,21 +1370,21 @@ _mm_mask_i32gather_ps (__m128 __src, flo
(__v4si)__index,
(__v4sf)__mask,
__scale);
}
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
{
__v8sf __src = _mm256_setzero_ps ();
- __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
+ __v8sf __mask = (__v8sf)_mm256_set1_epi64x (-1);
return (__m256) __builtin_ia32_gathersiv8sf (__src,
__base,
(__v8si)__index,
__mask,
__scale);
}
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1398,21 +1396,21 @@ _mm256_mask_i32gather_ps (__m256 __src,
(__v8si)__index,
(__v8sf)__mask,
__scale);
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
{
__v4sf __src = _mm_setzero_ps ();
- __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+ __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
return (__m128) __builtin_ia32_gatherdiv4sf (__src,
__base,
(__v2di)__index,
__mask,
__scale);
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1424,21 +1422,21 @@ _mm_mask_i64gather_ps (__m128 __src, flo
(__v2di)__index,
(__v4sf)__mask,
__scale);
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
{
__v4sf __src = _mm_setzero_ps ();
- __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+ __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
__base,
(__v4di)__index,
__mask,
__scale);
}
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1665,126 +1663,119 @@ _mm256_mask_i64gather_epi32 (__m128i __s
__base,
(__v4di)__index,
(__v4si)__mask,
__scale);
}
#else /* __OPTIMIZE__ */
#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
(__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
(double const *)BASE, \
(__v4si)(__m128i)INDEX, \
- (__v2df)_mm_set1_pd( \
- (double)(long long int) -1), \
+ (__v2df)_mm_set1_epi64x (-1), \
(int)SCALE)
#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
(__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \
(double const *)BASE, \
(__v4si)(__m128i)INDEX, \
(__v2df)(__m128d)MASK, \
(int)SCALE)
#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
(__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
(double const *)BASE, \
(__v4si)(__m128i)INDEX, \
- (__v4df)_mm256_set1_pd( \
- (double)(long long int) -1), \
+ (__v4df)_mm256_set1_epi64x(-1),\
(int)SCALE)
#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
(__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \
(double const *)BASE, \
(__v4si)(__m128i)INDEX, \
(__v4df)(__m256d)MASK, \
(int)SCALE)
#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
(__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
(double const *)BASE, \
(__v2di)(__m128i)INDEX, \
- (__v2df)_mm_set1_pd( \
- (double)(long long int) -1), \
+ (__v2df)_mm_set1_epi64x (-1), \
(int)SCALE)
#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
(__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \
(double const *)BASE, \
(__v2di)(__m128i)INDEX, \
(__v2df)(__m128d)MASK, \
(int)SCALE)
#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
(__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
(double const *)BASE, \
(__v4di)(__m256i)INDEX, \
- (__v4df)_mm256_set1_pd( \
- (double)(long long int) -1), \
+ (__v4df)_mm256_set1_epi64x(-1),\
(int)SCALE)
#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
(__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \
(double const *)BASE, \
(__v4di)(__m256i)INDEX, \
(__v4df)(__m256d)MASK, \
(int)SCALE)
#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
(__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
(float const *)BASE, \
(__v4si)(__m128i)INDEX, \
- _mm_set1_ps ((float)(int) -1), \
+ (__v4sf)_mm_set1_epi64x (-1), \
(int)SCALE)
#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
(__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \
(float const *)BASE, \
(__v4si)(__m128i)INDEX, \
(__v4sf)(__m128d)MASK, \
(int)SCALE)
#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
(__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
(float const *)BASE, \
(__v8si)(__m256i)INDEX, \
- (__v8sf)_mm256_set1_ps ( \
- (float)(int) -1), \
+ (__v8sf)_mm256_set1_epi64x(-1),\
(int)SCALE)
#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
(__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \
(float const *)BASE, \
(__v8si)(__m256i)INDEX, \
(__v8sf)(__m256d)MASK, \
(int)SCALE)
#define _mm_i64gather_ps(BASE, INDEX, SCALE) \
(__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \
(float const *)BASE, \
(__v2di)(__m128i)INDEX, \
- (__v4sf)_mm_set1_ps ( \
- (float)(int) -1), \
+ (__v4sf)_mm_set1_epi64x (-1), \
(int)SCALE)
#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
(__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \
(float const *)BASE, \
(__v2di)(__m128i)INDEX, \
(__v4sf)(__m128d)MASK, \
(int)SCALE)
#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
(__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
(float const *)BASE, \
(__v4di)(__m256i)INDEX, \
- (__v4sf)_mm_set1_ps( \
- (float)(int) -1), \
+ (__v4sf)_mm_set1_epi64x (-1),\
(int)SCALE)
#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
(__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \
(float const *)BASE, \
(__v4di)(__m256i)INDEX, \
(__v4sf)(__m128)MASK, \
(int)SCALE)
#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
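As a usage sketch of one of the rewritten intrinsics (my toy example,
not from the patch; compile with -mavx2):
#include <immintrin.h>
#include <stdio.h>
int
main (void)
{
  double base[8] = { 0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5 };
  /* Only the low two 32-bit indices are used for a 2-lane gather.  */
  __m128i idx = _mm_set_epi32 (0, 0, 6, 2);
  /* Scale 8 = sizeof (double); the implicit mask is all ones, so
     both lanes are loaded.  */
  __m128d v = _mm_i32gather_pd (base, idx, 8);
  double out[2];
  _mm_storeu_pd (out, v);
  printf ("%g %g\n", out[0], out[1]);  /* prints: 2.5 6.5 */
  return 0;
}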
* Re: [i386] Mask generation in avx2intrin.h
From: Marc Glisse @ 2017-12-05 12:19 UTC
To: gcc-patches; +Cc: ubizjak, kirill.yukhin
Adding Cc: Uros and Kirill
https://gcc.gnu.org/ml/gcc-patches/2017-11/msg02233.html
On Sat, 25 Nov 2017, Marc Glisse wrote:
> Hello,
>
> The way full masks are currently generated in avx2intrin.h is
> questionable: opaque for the inline functions (the mask comes from a
> comparison rather than a visible constant), and weird or outright wrong
> for the macros (a floating-point -1.0 rather than an all-ones pattern).
>
> We may eventually want to add code so that the constant all-ones mask
> may be generated with vxorpd+vcmpeqpd instead of being loaded from
> memory, but that looks like something that should be decided globally,
> not in each instruction that uses it.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu (Skylake).
>
> 2017-11-27 Marc Glisse <marc.glisse@inria.fr>
>
> PR target/80885
> * config/i386/avx2intrin.h (_mm_i32gather_pd): Rewrite mask generation.
> (_mm256_i32gather_pd): Likewise.
> (_mm_i64gather_pd): Likewise.
> (_mm256_i64gather_pd): Likewise.
> (_mm_i32gather_ps): Likewise.
> (_mm256_i32gather_ps): Likewise.
> (_mm_i64gather_ps): Likewise.
> (_mm256_i64gather_ps): Likewise.
--
Marc Glisse
* Re: [i386] Mask generation in avx2intrin.h
From: Marc Glisse @ 2018-04-30 21:06 UTC
To: gcc-patches
Ping https://gcc.gnu.org/ml/gcc-patches/2017-11/msg02233.html
On Sat, 25 Nov 2017, Marc Glisse wrote:
> Hello,
>
> The way full masks are currently generated in avx2intrin.h is
> questionable: opaque for the inline functions (the mask comes from a
> comparison rather than a visible constant), and weird or outright wrong
> for the macros (a floating-point -1.0 rather than an all-ones pattern).
>
> We may eventually want to add code so that the constant all-ones mask
> may be generated with vxorpd+vcmpeqpd instead of being loaded from
> memory, but that looks like something that should be decided globally,
> not in each instruction that uses it.
>
> Bootstrapped and regtested on x86_64-pc-linux-gnu (Skylake).
>
> 2017-11-27 Marc Glisse <marc.glisse@inria.fr>
>
> PR target/80885
> * config/i386/avx2intrin.h (_mm_i32gather_pd): Rewrite mask generation.
> (_mm256_i32gather_pd): Likewise.
> (_mm_i64gather_pd): Likewise.
> (_mm256_i64gather_pd): Likewise.
> (_mm_i32gather_ps): Likewise.
> (_mm256_i32gather_ps): Likewise.
> (_mm_i64gather_ps): Likewise.
> (_mm256_i64gather_ps): Likewise.
--
Marc Glisse