public inbox for gcc-patches@gcc.gnu.org
* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
       [not found] <201704221338.46300.linux@carewolf.com>
@ 2017-04-24  7:43 ` Allan Sandfeld Jensen
  2017-04-24  7:47   ` Jakub Jelinek
       [not found] ` <201704241101.29634.linux@carewolf.com>
  1 sibling, 1 reply; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-04-24  7:43 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: Text/Plain, Size: 470 bytes --]

On Saturday 22 April 2017, Allan Sandfeld Jensen wrote:
> Replaces definitions of immediate logical shift intrinsics with GCC
> extension syntax. Tests are added to ensure the intrinsics still produce
> the right instructions and that a few basic optimizations now work.
> 
> Compared to the earlier version of the patch, all potentially undefined
> shifts are now avoided, which also means no variable shifts or arithmetic
> right shifts.

Fixed 2 errors in the tests.
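
For reference, the GCC extension syntax in question shifts every element of a
vector by a scalar count; a minimal stand-alone example (not part of the
patch, using a local typedef mirroring __v8hi from emmintrin.h):

typedef short v8hi __attribute__ ((__vector_size__ (16)));

v8hi
shl3 (v8hi a)
{
  /* Element-wise shift: each of the eight shorts is shifted left by 3.  */
  return a << 3;
}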

[-- Attachment #2: SIMD-shifts-4.diff --]
[-- Type: text/x-patch, Size: 11565 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b58f5050db0..b9406550fc5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
+
+	* config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
+	Use vector intrinsics instead of builtins.
+	* config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
+	Use vector intrinsics instead of builtins.
+
 2017-04-21  Uros Bizjak  <ubizjak@gmail.com>
 
 	* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
index 82f170a3d61..acb49734131 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -667,7 +667,7 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_slli_epi16 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
+  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -681,7 +681,7 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_slli_epi32 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+  return ((__B & 0xff) < 32) ? (__m256i)((__v8si)__A << (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -695,7 +695,7 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_slli_epi64 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+  return ((__B & 0xff) < 64) ? (__m256i)((__v4di)__A << (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -758,7 +758,7 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srli_epi16 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
+  return ((__B & 0xff) < 16) ? (__m256i) ((__v16hu)__A >> (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -772,7 +772,7 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srli_epi32 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+  return ((__B & 0xff) < 32) ? (__m256i) ((__v8su)__A >> (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -786,7 +786,7 @@ extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srli_epi64 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+  return ((__B & 0xff) < 64) ? (__m256i) ((__v4du)__A >> (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index 828f4a07a9b..5c048d9fd0d 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -1140,19 +1140,19 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+  return ((__B & 0xff) < 16) ? (__m128i)((__v8hi)__A << (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+  return ((__B & 0xff) < 32) ? (__m128i)((__v4si)__A << (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+  return ((__B & 0xff) < 64) ? (__m128i)((__v2di)__A << (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1205,19 +1205,19 @@ _mm_slli_si128 (__m128i __A, const int __N)
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+  return ((__B & 0xff) < 16) ? (__m128i)((__v8hu)__A >> (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+  return ((__B & 0xff) < 32) ? (__m128i)((__v4su)__A >> (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+  return ((__B & 0xff) < 64) ? (__m128i)((__v2du)__A >> (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 6f4dc8d5095..ffface0f6b3 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,13 @@
+2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
+
+	* gcc.target/i386/sse2-pslld-1.c: Expand test with more corner cases.
+	* gcc.target/i386/sse2-psllw-1.c: Expand test with more corner cases.
+	* gcc.target/i386/sse2-psrld-1.c: Expand test with more corner cases.
+	* gcc.target/i386/sse2-shifts-1.c: New testcases of shift intrinsics
+	producing intended instructions.
+	* gcc.target/i386/sse2-shifts-2.c: New testcase of shift intrinsics
+	being folded.
+
 2017-04-21  Janus Weil  <janus@gcc.gnu.org>
 
 	PR fortran/80392
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c b/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c
index 31474e3234f..b2eb938fc06 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c
@@ -10,17 +10,15 @@
 #define TEST sse2_test
 #endif
 
-#define N 0xf
-
 #include CHECK_H
 
 #include <emmintrin.h>
 
 static __m128i
 __attribute__((noinline, unused))
-test (__m128i s1)
+test (__m128i s1, int n)
 {
-  return _mm_slli_epi32 (s1, N); 
+  return _mm_slli_epi32 (s1, n); 
 }
 
 static void
@@ -28,16 +26,25 @@ TEST (void)
 {
   union128i_d u, s;
   int e[4] = {0};
-  int i;
+  int ns[4] = {15, 65, 260, -250};
  
   s.x = _mm_set_epi32 (1, -2, 3, 4);
 
-  u.x = test (s.x);
-
-  if (N < 32)
-    for (i = 0; i < 4; i++)
-      e[i] = s.a[i] << N; 
-
-  if (check_union128i_d (u, e))
-    abort (); 
+  for (int j = 0; j < 4; j++) {
+    int n = ns[j];
+    u.x = test (s.x, n);
+    
+    n = n & 0xff;
+    if (n < 32) {
+      for (int i = 0; i < 4; i++)
+        e[i] = s.a[i] << n;
+    } else {
+      for (int i = 0; i < 4; i++)
+        e[i] = 0;
+    }
+
+
+    if (check_union128i_d (u, e))
+      abort ();
+  }
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c b/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c
index 3153ec45529..6a740fce050 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c
@@ -10,17 +10,15 @@
 #define TEST sse2_test
 #endif
 
-#define N 0xb
-
 #include CHECK_H
 
 #include <emmintrin.h>
 
 static __m128i
 __attribute__((noinline, unused))
-test (__m128i s1)
+test (__m128i s1, int n)
 {
-  return _mm_slli_epi16 (s1, N); 
+  return _mm_slli_epi16 (s1, n); 
 }
 
 static void
@@ -28,16 +26,25 @@ TEST (void)
 {
   union128i_w u, s;
   short e[8] = {0};
+  int ns[4] = {11, 16, 63, -250};
   int i;
  
   s.x = _mm_set_epi16 (1, 2, 3, 4, 5, 6, 0x7000, 0x9000);
 
-  u.x = test (s.x);
-
-  if (N < 16)
-    for (i = 0; i < 8; i++)
-      e[i] = s.a[i] << N; 
-
-  if (check_union128i_w (u, e))
-    abort (); 
+  for (int j = 0; j < 4; j++) {
+    int n = ns[j];
+    u.x = test (s.x, n);
+
+    n = n & 0xff;
+    if (n < 16) {
+      for (i = 0; i < 8; i++)
+        e[i] = s.a[i] << n;
+    } else {
+      for (int i = 0; i < 8; i++)
+        e[i] = 0;
+    }
+
+    if (check_union128i_w (u, e))
+      abort (); 
+  }
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c b/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c
index d310fc45204..ec5f1c2d391 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c
@@ -10,17 +10,15 @@
 #define TEST sse2_test
 #endif
 
-#define N 0xf
-
 #include CHECK_H
 
 #include <emmintrin.h>
 
 static __m128i
 __attribute__((noinline, unused))
-test (__m128i s1)
+test (__m128i s1, int n)
 {
-  return _mm_srli_epi32 (s1, N); 
+  return _mm_srli_epi32 (s1, n);
 }
 
 static void
@@ -28,19 +26,28 @@ TEST (void)
 {
   union128i_d u, s;
   int e[4] = {0};
+  int ns[4] = {15, 65, 260, -250};
   unsigned int tmp;
   int i;
  
   s.x = _mm_set_epi32 (1, -2, 3, 4);
 
-  u.x = test (s.x);
-
-  if (N < 32)
-    for (i = 0; i < 4; i++) {
-      tmp  = s.a[i];
-      e[i] = tmp >> N; 
+  for (int j = 0; j < 4; j++) {
+    int n = ns[j];
+    u.x = test (s.x, n);
+
+    n = n & 0xff;
+    if (n < 32) {
+      for (i = 0; i < 4; i++) {
+        tmp  = s.a[i];
+        e[i] = tmp >> n; 
+      }
+    } else {
+      for (int i = 0; i < 4; i++)
+        e[i] = 0;
     }
 
-  if (check_union128i_d (u, e))
-    abort (); 
+    if (check_union128i_d (u, e))
+      abort ();
+  }
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
new file mode 100644
index 00000000000..a2305cf042a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+    return _mm_slli_epi16(a, 9);
+}
+
+__m128i test2(__m128i a)
+{
+    return _mm_slli_epi32(a, 13);
+}
+
+__m128i test3(__m128i a)
+{
+    return _mm_slli_epi64(a, 17);
+}
+
+__m128i test4(__m128i a)
+{
+    return _mm_srli_epi16(a, 9);
+}
+
+__m128i test5(__m128i a)
+{
+    return _mm_srli_epi32(a, 13);
+}
+
+__m128i test6(__m128i a)
+{
+    return _mm_srli_epi64(a, 7);
+}
+
+__m128i test7(__m128i a)
+{
+    return _mm_srai_epi16(a, 3);
+}
+
+__m128i test8(__m128i a)
+{
+    return _mm_srai_epi32(a, 6);
+}
+
+/* { dg-final { scan-assembler "psllw" } } */
+/* { dg-final { scan-assembler "pslld" } } */
+/* { dg-final { scan-assembler "psllq" } } */
+/* { dg-final { scan-assembler "psrlw" } } */
+/* { dg-final { scan-assembler "psrld" } } */
+/* { dg-final { scan-assembler "psrlq" } } */
+/* { dg-final { scan-assembler "psraw" } } */
+/* { dg-final { scan-assembler "psrad" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
new file mode 100644
index 00000000000..ce05a7dc44e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+    a = _mm_slli_epi16(a, 2);
+    return _mm_slli_epi16(a, 3);
+}
+/* { dg-final { scan-assembler "psllw.*5"} } */
+
+__m128i test3(__m128i a)
+{
+    a = _mm_srli_epi16(a, 4);
+    return _mm_srli_epi16(a, 9);
+}
+/* { dg-final { scan-assembler-times "psrlw" 1} } */
+
+__m128i test4(__m128i a)
+{
+    a = _mm_setr_epi32(128, 255, 86, 23);
+    return _mm_srli_epi32(a, 8);
+}
+/* { dg-final { scan-assembler-not "psrld"} } */
+


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  7:43 ` [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts Allan Sandfeld Jensen
@ 2017-04-24  7:47   ` Jakub Jelinek
  2017-04-24  8:02     ` Allan Sandfeld Jensen
  2017-04-24 14:43     ` Allan Sandfeld Jensen
  0 siblings, 2 replies; 16+ messages in thread
From: Jakub Jelinek @ 2017-04-24  7:47 UTC (permalink / raw)
  To: Allan Sandfeld Jensen, Uros Bizjak; +Cc: gcc-patches

On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> --- a/gcc/config/i386/avx2intrin.h
> +++ b/gcc/config/i386/avx2intrin.h
> @@ -667,7 +667,7 @@ extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_slli_epi16 (__m256i __A, int __B)
>  {
> -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : _mm256_setzero_si256();
>  }

What is the advantage of doing that when you replace one operation with
several (&, <, ?:, <<)?
I'd say instead we should fold the builtins in the gimple fold target hook
if we see the shift count is constant, and decide based on that.
Or we could use __builtin_constant_p (__B) to decide whether to use
the generic vector shifts or the builtin, but that means larger IL.

	Jakub


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  7:47   ` Jakub Jelinek
@ 2017-04-24  8:02     ` Allan Sandfeld Jensen
  2017-04-24  8:25       ` Jakub Jelinek
  2017-04-24 14:43     ` Allan Sandfeld Jensen
  1 sibling, 1 reply; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-04-24  8:02 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches

On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > --- a/gcc/config/i386/avx2intrin.h
> > +++ b/gcc/config/i386/avx2intrin.h
> > @@ -667,7 +667,7 @@ extern __inline __m256i
> > 
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> >  _mm256_slli_epi16 (__m256i __A, int __B)
> >  {
> > 
> > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) :
> > _mm256_setzero_si256();
> > 
> >  }
> 
> What is the advantage of doing that when you replace one operation with
> several (&, <, ?:, <<)?
> I'd say instead we should fold the builtins if in the gimple fold target
> hook we see the shift count constant and can decide based on that.
> Or we could use __builtin_constant_p (__B) to decide whether to use
> the generic vector shifts or builtin, but that means larger IL.

The advantage is that in this builtin, the __B is always a literal (or 
constexpr), so the if statement is resolved at compile time.
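
For example (a hypothetical caller, not part of the patch), a literal count
makes the condition a compile-time constant, so the branch disappears and,
with -O2 -mavx2, only the immediate shift should remain:

#include <immintrin.h>

__m256i
shift3 (__m256i x)
{
  /* (3 & 0xff) < 16 is known at compile time, so this should fold to the
     plain vector shift and a single vpsllw with an immediate count.  */
  return _mm256_slli_epi16 (x, 3);
}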

`Allan


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  8:02     ` Allan Sandfeld Jensen
@ 2017-04-24  8:25       ` Jakub Jelinek
  2017-04-24  8:25         ` Allan Sandfeld Jensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Jelinek @ 2017-04-24  8:25 UTC (permalink / raw)
  To: Allan Sandfeld Jensen; +Cc: Uros Bizjak, gcc-patches

On Mon, Apr 24, 2017 at 09:51:29AM +0200, Allan Sandfeld Jensen wrote:
> On Monday 24 April 2017, Jakub Jelinek wrote:
> > On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > > --- a/gcc/config/i386/avx2intrin.h
> > > +++ b/gcc/config/i386/avx2intrin.h
> > > @@ -667,7 +667,7 @@ extern __inline __m256i
> > > 
> > >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > >  _mm256_slli_epi16 (__m256i __A, int __B)
> > >  {
> > > 
> > > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > > +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) :
> > > _mm256_setzero_si256();
> > > 
> > >  }
> > 
> > What is the advantage of doing that when you replace one operation with
> > several (&, <, ?:, <<)?
> > I'd say instead we should fold the builtins if in the gimple fold target
> > hook we see the shift count constant and can decide based on that.
> > Or we could use __builtin_constant_p (__B) to decide whether to use
> > the generic vector shifts or builtin, but that means larger IL.
> 
> The advantage is that in this builtin, the __B is always a literal (or 
> constexpr), so the if statement is resolved at compile time.

Do we really want to support all the thousands of _mm* intrinsics in constexpr
contexts?  People can just use generic vectors instead.

That said, both the options I've mentioned above provide the same advantages
and don't have the disadvantages of pessimizing normal code.

	Jakub


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  8:25       ` Jakub Jelinek
@ 2017-04-24  8:25         ` Allan Sandfeld Jensen
  2017-04-24  8:38           ` Jakub Jelinek
  0 siblings, 1 reply; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-04-24  8:25 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek; +Cc: Uros Bizjak

On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 09:51:29AM +0200, Allan Sandfeld Jensen wrote:
> > On Monday 24 April 2017, Jakub Jelinek wrote:
> > > On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > > > --- a/gcc/config/i386/avx2intrin.h
> > > > +++ b/gcc/config/i386/avx2intrin.h
> > > > @@ -667,7 +667,7 @@ extern __inline __m256i
> > > > 
> > > >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > > >  _mm256_slli_epi16 (__m256i __A, int __B)
> > > >  {
> > > > 
> > > > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > > > +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B &
> > > > 0xff)) : _mm256_setzero_si256();
> > > > 
> > > >  }
> > > 
> > > What is the advantage of doing that when you replace one operation with
> > > several (&, <, ?:, <<)?
> > > I'd say instead we should fold the builtins if in the gimple fold
> > > target hook we see the shift count constant and can decide based on
> > > that. Or we could use __builtin_constant_p (__B) to decide whether to
> > > use the generic vector shifts or builtin, but that means larger IL.
> > 
> > The advantage is that in this builtin, the __B is always a literal (or
> > constexpr), so the if statement is resolved at compile time.
> 
> Do we really want to support all the thousands _mm* intrinsics in constexpr
> contexts?  People can just use generic vectors instead.
> 
I would love to support it, but first we need a C extension attribute matching 
constexpr, and I consider it a separate issue.

> That said, both the options I've mentioned above provide the same
> advantages and don't have the disadvantages of pessimizing normal code.
> 
What pessimizing? This produces the same or better code for all legal
arguments. The only difference, besides better generated code, is that it allows
the intrinsics to be used incorrectly with non-literal arguments, because we
lack a C extension for constexpr to prevent that.

`Allan


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  8:25         ` Allan Sandfeld Jensen
@ 2017-04-24  8:38           ` Jakub Jelinek
  2017-04-24  8:40             ` Allan Sandfeld Jensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Jelinek @ 2017-04-24  8:38 UTC (permalink / raw)
  To: Allan Sandfeld Jensen; +Cc: gcc-patches, Uros Bizjak

On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote:
> > That said, both the options I've mentioned above provide the same
> > advantages and don't have the disadvantages of pessimizing normal code.
> > 
> What pessimizing? This produce the same or better code for all legal 
> arguments. The only difference besides better generated code is that it allows 

No.  Have you really tried that?

> the intrinsics to be used incorrectly with non-literal arguments because we 
> lack the C-extension for constexp to prevent that.

Consider e.g. -O2 -mavx2 -mtune=intel:
#include <x86intrin.h>

__m256i
foo (__m256i x, int s)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s);
}

__m256i
bar (__m256i x, int s)
{
  return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) : _mm256_setzero_si256 ();
}

The first one generates
        movl    %edi, %edi
        vmovq   %rdi, %xmm1
        vpsllw  %xmm1, %ymm0, %ymm0
        ret
(because that is actually what the instruction does), the second one
        movzbl  %dil, %edi
        cmpl    $15, %edi
        jg      .L5
        vmovq   %rdi, %xmm1
        vpsllw  %xmm1, %ymm0, %ymm0
        ret
        .p2align 4,,7
        .p2align 3
.L5:
        vpxor   %xmm0, %xmm0, %xmm0
        ret

	Jakub


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  8:38           ` Jakub Jelinek
@ 2017-04-24  8:40             ` Allan Sandfeld Jensen
  2017-04-24  8:54               ` Allan Sandfeld Jensen
  2017-04-24  8:57               ` Jakub Jelinek
  0 siblings, 2 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-04-24  8:40 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek; +Cc: Uros Bizjak

On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote:
> > > That said, both the options I've mentioned above provide the same
> > > advantages and don't have the disadvantages of pessimizing normal code.
> > 
> > What pessimizing? This produce the same or better code for all legal
> > arguments. The only difference besides better generated code is that it
> > allows
> 
> No.  Have you really tried that?
> 
> > the intrinsics to be used incorrectly with non-literal arguments because
> > we lack the C-extension for constexp to prevent that.
> 
> Consider e.g. -O2 -mavx2 -mtune=intel:
> #include <x86intrin.h>
> 
> __m256i
> foo (__m256i x, int s)
> {
>   return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s);
> }
> 
> __m256i
> bar (__m256i x, int s)
> {
>   return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) :
> _mm256_setzero_si256 (); }
> 
> The first one generates
>         movl    %edi, %edi
>         vmovq   %rdi, %xmm1
>         vpsllw  %xmm1, %ymm0, %ymm0
>         ret
> (because that is actually what the instruction does), the second one
That is a different instruction. That is vpsllw, not vpsllwi.

The intrinsics I changed are the immediate versions; I didn't change the
non-immediate versions. It is probably a bug if you can give non-immediate
values to the immediate-only intrinsics. At least both versions handle it, if
in different ways, but those are illegal arguments.

`Allan


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  8:40             ` Allan Sandfeld Jensen
@ 2017-04-24  8:54               ` Allan Sandfeld Jensen
  2017-04-24  8:57               ` Jakub Jelinek
  1 sibling, 0 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-04-24  8:54 UTC (permalink / raw)
  To: gcc-patches; +Cc: Jakub Jelinek

On Monday 24 April 2017, Allan Sandfeld Jensen wrote:
> On Monday 24 April 2017, Jakub Jelinek wrote:
> > On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote:
> > > > That said, both the options I've mentioned above provide the same
> > > > advantages and don't have the disadvantages of pessimizing normal
> > > > code.
> > > 
> > > What pessimizing? This produce the same or better code for all legal
> > > arguments. The only difference besides better generated code is that it
> > > allows
> > 
> > No.  Have you really tried that?
> > 
> > > the intrinsics to be used incorrectly with non-literal arguments
> > > because we lack the C-extension for constexp to prevent that.
> > 
> > Consider e.g. -O2 -mavx2 -mtune=intel:
> > #include <x86intrin.h>
> > 
> > __m256i
> > foo (__m256i x, int s)
> > {
> > 
> >   return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s);
> > 
> > }
> > 
> > __m256i
> > bar (__m256i x, int s)
> > {
> > 
> >   return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) :
> > _mm256_setzero_si256 (); }
> > 
> > The first one generates
> > 
> >         movl    %edi, %edi
> >         vmovq   %rdi, %xmm1
> >         vpsllw  %xmm1, %ymm0, %ymm0
> >         ret
> > 
> > (because that is actually what the instruction does), the second one
> 
> That is a different instruction. That is the vpsllw not vpsllwi
> 
> The intrinsics I changed is the immediate version, I didn't change the non-
> immediate version. It is probably a bug if you can give non-immediate
> values to the immediate only intrinsic. At least both versions handles it,
> if in different ways, but is is illegal arguments.
> 
Though, now that I think about it, this means my change to the existing
sse2-psllw-1.c test and friends is wrong, because it uses variable input.

`Allan


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  8:40             ` Allan Sandfeld Jensen
  2017-04-24  8:54               ` Allan Sandfeld Jensen
@ 2017-04-24  8:57               ` Jakub Jelinek
  1 sibling, 0 replies; 16+ messages in thread
From: Jakub Jelinek @ 2017-04-24  8:57 UTC (permalink / raw)
  To: Allan Sandfeld Jensen; +Cc: gcc-patches, Uros Bizjak

On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote:
> That is a different instruction. That is the vpsllw not vpsllwi
> 
> The intrinsics I changed is the immediate version, I didn't change the non-
> immediate version. It is probably a bug if you can give non-immediate values 
> to the immediate only intrinsic. At least both versions handles it, if in 
> different ways, but is is illegal arguments.

The documentation is unclear on that, and I've only recently fixed up some
cases where these intrinsics weren't able to handle non-constant arguments,
while both ICC and clang coped with that fine.
So it is clearly allowed and handled by all the compilers and needs to be
supported; people use that in real-world code.

	Jakub


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  9:38   ` Jakub Jelinek
@ 2017-04-24  9:38     ` Allan Sandfeld Jensen
  0 siblings, 0 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-04-24  9:38 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek; +Cc: Kirill Yukhin, Uros Bizjak

On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 11:01:29AM +0200, Allan Sandfeld Jensen wrote:
> > On Monday 24 April 2017, Jakub Jelinek wrote:
> > > On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote:
> > > > That is a different instruction. That is the vpsllw not vpsllwi
> > > > 
> > > > The intrinsics I changed is the immediate version, I didn't change
> > > > the non- immediate version. It is probably a bug if you can give
> > > > non-immediate values to the immediate only intrinsic. At least both
> > > > versions handles it, if in different ways, but is is illegal
> > > > arguments.
> > > 
> > > The documentation is unclear on that and I've only recently fixed up
> > > some cases where these intrinsics weren't able to handle non-constant
> > > arguments in some cases, while both ICC and clang coped with that
> > > fine.
> > > So it is clearly allowed and handled by all the compilers and needs to
> > > be supported, people use that in real-world code.
> > 
> > Undoubtedly it happens. I just make a mistake myself that created that
> > case. But it is rather unfortunate, and means we make wrong code
> > currently for corner case values.
> 
> The intrinsic documentation is poor, usually you have a good documentation
> on what the instructions do, and then you just have to guess what the
> intrinsics do.  You can of course ask Intel for clarification.
> 
> If you try:
> #include <x86intrin.h>
> 
> __m128i
> foo (__m128i a, int b)
> {
>   return _mm_slli_epi16 (a, b);
> }
> and call it with 257 from somewhere else, you can see that all the
> compilers will give you zero vector.  And similarly if you use 257
> literally instead of b.  So what the intrinsic (unlike the instruction)
> actually does is that it compares all bits of the imm8 argument (supposedly
> using unsigned comparison) and if it is bigger than 15 (or 7 or 31 or 63
> depending on the bitsize of element) it yields 0 vector.
> 
Good point. I was using Intel's documentation at
https://software.intel.com/sites/landingpage/IntrinsicsGuide/, but if all
compilers, including us, do something else, practicality wins.

It did make me curious to test what _mm_slli_epi16(v, -250) compiles to.
For some reason that becomes an undefined shift using the non-immediate sll in
gcc, but it returns the zero vector in clang. With my patch it was a 6-bit
shift, but that is apparently not the de-facto standard.
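
A minimal probe of that corner case (illustrative only, not from the patch):

#include <emmintrin.h>

__m128i
probe (__m128i v)
{
  /* -250 & 0xff is 6, hence the 6-bit shift seen with the patch;
     clang instead gives the zero vector here.  */
  return _mm_slli_epi16 (v, -250);
}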


`Allan


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
       [not found] ` <201704241101.29634.linux@carewolf.com>
@ 2017-04-24  9:38   ` Jakub Jelinek
  2017-04-24  9:38     ` Allan Sandfeld Jensen
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Jelinek @ 2017-04-24  9:38 UTC (permalink / raw)
  To: Allan Sandfeld Jensen, Kirill Yukhin; +Cc: gcc-patches, Uros Bizjak

On Mon, Apr 24, 2017 at 11:01:29AM +0200, Allan Sandfeld Jensen wrote:
> On Monday 24 April 2017, Jakub Jelinek wrote:
> > On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote:
> > > That is a different instruction. That is the vpsllw not vpsllwi
> > > 
> > > The intrinsics I changed is the immediate version, I didn't change the
> > > non- immediate version. It is probably a bug if you can give
> > > non-immediate values to the immediate only intrinsic. At least both
> > > versions handles it, if in different ways, but is is illegal arguments.
> > 
> > The documentation is unclear on that and I've only recently fixed up some
> > cases where these intrinsics weren't able to handle non-constant arguments
> > in some cases, while both ICC and clang coped with that fine.
> > So it is clearly allowed and handled by all the compilers and needs to be
> > supported, people use that in real-world code.
> > 
> Undoubtedly it happens. I just make a mistake myself that created that case. 
> But it is rather unfortunate, and means we make wrong code currently for 
> corner case values.

The intrinsic documentation is poor, usually you have a good documentation
on what the instructions do, and then you just have to guess what the
intrinsics do.  You can of course ask Intel for clarification.

If you try:
#include <x86intrin.h>

__m128i
foo (__m128i a, int b)
{
  return _mm_slli_epi16 (a, b);
}
and call it with 257 from somewhere else, you can see that all the compilers
will give you a zero vector.  And similarly if you use 257 literally instead
of b.  So what the intrinsic (unlike the instruction) actually does is compare
all bits of the imm8 argument (supposedly using an unsigned comparison) and,
if it is bigger than 15 (or 7, 31 or 63, depending on the bit size of the
element), yield a zero vector.
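
Per element that behaviour amounts to something like this (a scalar sketch of
the description above, not actual compiler or header code):

static unsigned short
slli_epi16_lane (unsigned short a, int count)
{
  /* All bits of the count take part in the (unsigned) comparison, so
     anything outside 0..15, including negative counts, gives 0.  */
  return (unsigned int) count > 15 ? 0 : (unsigned short) (a << count);
}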

	Jakub


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24  7:47   ` Jakub Jelinek
  2017-04-24  8:02     ` Allan Sandfeld Jensen
@ 2017-04-24 14:43     ` Allan Sandfeld Jensen
  2017-05-02 10:17       ` Jakub Jelinek
  1 sibling, 1 reply; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-04-24 14:43 UTC (permalink / raw)
  To: gcc-patches, Jakub Jelinek

[-- Attachment #1: Type: Text/Plain, Size: 1064 bytes --]

On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > --- a/gcc/config/i386/avx2intrin.h
> > +++ b/gcc/config/i386/avx2intrin.h
> > @@ -667,7 +667,7 @@ extern __inline __m256i
> > 
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> >  _mm256_slli_epi16 (__m256i __A, int __B)
> >  {
> > 
> > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) :
> > _mm256_setzero_si256();
> > 
> >  }
> 
> What is the advantage of doing that when you replace one operation with
> several (&, <, ?:, <<)?
> I'd say instead we should fold the builtins if in the gimple fold target
> hook we see the shift count constant and can decide based on that.
> Or we could use __builtin_constant_p (__B) to decide whether to use
> the generic vector shifts or builtin, but that means larger IL.
> 
Okay, I have tried that, and I have also made it more obvious how the
intrinsics can become non-immediate shifts.


[-- Attachment #2: SIMD-shifts-5.diff --]
[-- Type: text/x-patch, Size: 15619 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b58f5050db0..b9406550fc5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
+
+	* config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
+	Use vector intrinsics instead of builtins.
+	* config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
+	Use vector intrinsics instead of builtins.
+
 2017-04-21  Uros Bizjak  <ubizjak@gmail.com>
 
 	* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
index 82f170a3d61..64ba52b244e 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -665,13 +665,6 @@ _mm256_slli_si256 (__m256i __A, const int __N)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi16 (__m256i __A, int __B)
-{
-  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sll_epi16 (__m256i __A, __m128i __B)
 {
   return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
@@ -679,9 +672,11 @@ _mm256_sll_epi16 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi32 (__m256i __A, int __B)
+_mm256_slli_epi16 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 16) ? (__m256i)((__v16hi)__A << __B) : _mm256_setzero_si256();
+  return _mm256_sll_epi16(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m256i
@@ -693,9 +688,11 @@ _mm256_sll_epi32 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi64 (__m256i __A, int __B)
+_mm256_slli_epi32 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 32) ? (__m256i)((__v8si)__A << __B) : _mm256_setzero_si256();
+  return _mm256_sll_epi32(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m256i
@@ -707,6 +704,15 @@ _mm256_sll_epi64 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi64 (__m256i __A, int __B)
+{
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 64) ? (__m256i)((__v4di)__A << __B) : _mm256_setzero_si256();
+  return _mm256_sll_epi64(__A, _mm_cvtsi32_si128(__B));
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srai_epi16 (__m256i __A, int __B)
 {
   return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
@@ -756,13 +762,6 @@ _mm256_srli_si256 (__m256i __A, const int __N)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi16 (__m256i __A, int __B)
-{
-  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srl_epi16 (__m256i __A, __m128i __B)
 {
   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
@@ -770,9 +769,11 @@ _mm256_srl_epi16 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi32 (__m256i __A, int __B)
+_mm256_srli_epi16 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 16) ? (__m256i)((__v16hu)__A >> __B) : _mm256_setzero_si256();
+  return _mm256_srl_epi16(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m256i
@@ -784,9 +785,11 @@ _mm256_srl_epi32 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi64 (__m256i __A, int __B)
+_mm256_srli_epi32 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 32) ? (__m256i)((__v8su)__A >> __B) : _mm256_setzero_si256();
+  return _mm256_srl_epi32(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m256i
@@ -798,6 +801,15 @@ _mm256_srl_epi64 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 64) ? (__m256i)((__v4du)__A >> __B) : _mm256_setzero_si256();
+  return _mm256_srl_epi64(__A, _mm_cvtsi32_si128(__B));
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_epi8 (__m256i __A, __m256i __B)
 {
   return (__m256i) ((__v32qu)__A - (__v32qu)__B);
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index 828f4a07a9b..419041e2acb 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -903,6 +903,28 @@ _mm_cvtss_sd (__m128d __A, __m128 __B)
   return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
 }
 
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si128 (int __A)
+{
+  return _mm_set_epi32 (0, 0, 0, __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si128 (long long __A)
+{
+  return _mm_set_epi64x (0, __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si128 (long long __A)
+{
+  return _mm_set_epi64x (0, __A);
+}
+#endif
+
 #ifdef __OPTIMIZE__
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
@@ -1138,21 +1160,75 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 16) ? (__m128i)((__v8hi)__A << __B) : _mm_setzero_si128();
+  return _mm_sll_epi16(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 32) ? (__m128i)((__v4si)__A << __B) : _mm_setzero_si128();
+  return _mm_sll_epi32(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 64) ? (__m128i)((__v2di)__A << __B) : _mm_setzero_si128();
+  return _mm_sll_epi64(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1205,67 +1281,25 @@ _mm_slli_si128 (__m128i __A, const int __N)
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 16) ? (__m128i)((__v8hu)__A >> __B) : _mm_setzero_si128();
+  return _mm_srl_epi16(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 32) ? (__m128i)((__v4su)__A >> __B) : _mm_setzero_si128();
+  return _mm_srl_epi32(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi16 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi32 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi64 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi16 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi32 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi16 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi32 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi64 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 64) ? (__m128i)((__v2du)__A >> __B) : _mm_setzero_si128();
+  return _mm_srl_epi64(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1497,28 +1531,6 @@ _mm_mfence (void)
   __builtin_ia32_mfence ();
 }
 
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi32_si128 (int __A)
-{
-  return _mm_set_epi32 (0, 0, 0, __A);
-}
-
-#ifdef __x86_64__
-/* Intel intrinsic.  */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64_si128 (long long __A)
-{
-  return _mm_set_epi64x (0, __A);
-}
-
-/* Microsoft intrinsic.  */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64x_si128 (long long __A)
-{
-  return _mm_set_epi64x (0, __A);
-}
-#endif
-
 /* Casts between various SP, DP, INT vector types.  Note that these do no
    conversion of values, they just change the type.  */
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 6f4dc8d5095..a4470730ac6 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
+
+	* gcc.target/i386/sse2-shifts-1.c: New testcases of shift intrinsics
+	producing intended instructions.
+	* gcc.target/i386/sse2-shifts-2.c: New testcase of shift intrinsics
+	being folded.
+
 2017-04-21  Janus Weil  <janus@gcc.gnu.org>
 
 	PR fortran/80392
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
new file mode 100644
index 00000000000..a2305cf042a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
@@ -0,0 +1,54 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+    return _mm_slli_epi16(a, 9);
+}
+
+__m128i test2(__m128i a)
+{
+    return _mm_slli_epi32(a, 13);
+}
+
+__m128i test3(__m128i a)
+{
+    return _mm_slli_epi64(a, 17);
+}
+
+__m128i test4(__m128i a)
+{
+    return _mm_srli_epi16(a, 9);
+}
+
+__m128i test5(__m128i a)
+{
+    return _mm_srli_epi32(a, 13);
+}
+
+__m128i test6(__m128i a)
+{
+    return _mm_srli_epi64(a, 7);
+}
+
+__m128i test7(__m128i a)
+{
+    return _mm_srai_epi16(a, 3);
+}
+
+__m128i test8(__m128i a)
+{
+    return _mm_srai_epi32(a, 6);
+}
+
+/* { dg-final { scan-assembler "psllw" } } */
+/* { dg-final { scan-assembler "pslld" } } */
+/* { dg-final { scan-assembler "psllq" } } */
+/* { dg-final { scan-assembler "psrlw" } } */
+/* { dg-final { scan-assembler "psrld" } } */
+/* { dg-final { scan-assembler "psrlq" } } */
+/* { dg-final { scan-assembler "psraw" } } */
+/* { dg-final { scan-assembler "psrad" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
new file mode 100644
index 00000000000..ce05a7dc44e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+    a = _mm_slli_epi16(a, 2);
+    return _mm_slli_epi16(a, 3);
+}
+/* { dg-final { scan-assembler "psllw.*5"} } */
+
+__m128i test3(__m128i a)
+{
+    a = _mm_srli_epi16(a, 4);
+    return _mm_srli_epi16(a, 9);
+}
+/* { dg-final { scan-assembler-times "psrlw" 1} } */
+
+__m128i test4(__m128i a)
+{
+    a = _mm_setr_epi32(128, 255, 86, 23);
+    return _mm_srli_epi32(a, 8);
+}
+/* { dg-final { scan-assembler-not "psrld"} } */
+


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-04-24 14:43     ` Allan Sandfeld Jensen
@ 2017-05-02 10:17       ` Jakub Jelinek
  2017-05-02 11:22         ` Allan Sandfeld Jensen
  2017-05-02 15:58         ` Marc Glisse
  0 siblings, 2 replies; 16+ messages in thread
From: Jakub Jelinek @ 2017-05-02 10:17 UTC (permalink / raw)
  To: Allan Sandfeld Jensen, Uros Bizjak; +Cc: gcc-patches

On Mon, Apr 24, 2017 at 03:15:11PM +0200, Allan Sandfeld Jensen wrote:
> Okay, I have tried that, and I also made it more obvious how the intrinsics 
> can become non-immediate shift.
> 

> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index b58f5050db0..b9406550fc5 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,10 @@
> +2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
> +
> +	* config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
> +	Use vector intrinstics instead of builtins.
> +	* config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
> +	Use vector intrinstics instead of builtins.
> +
>  2017-04-21  Uros Bizjak  <ubizjak@gmail.com>
>  
>  	* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
> diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
> index 82f170a3d61..64ba52b244e 100644
> --- a/gcc/config/i386/avx2intrin.h
> +++ b/gcc/config/i386/avx2intrin.h
> @@ -665,13 +665,6 @@ _mm256_slli_si256 (__m256i __A, const int __N)
>  
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_slli_epi16 (__m256i __A, int __B)
> -{
> -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> -}
> -
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_sll_epi16 (__m256i __A, __m128i __B)
>  {
>    return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
> @@ -679,9 +672,11 @@ _mm256_sll_epi16 (__m256i __A, __m128i __B)
>  
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_slli_epi32 (__m256i __A, int __B)
> +_mm256_slli_epi16 (__m256i __A, int __B)
>  {
> -  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
> +  if (__builtin_constant_p(__B))
> +    return ((unsigned int)__B < 16) ? (__m256i)((__v16hi)__A << __B) : _mm256_setzero_si256();
> +  return _mm256_sll_epi16(__A, _mm_cvtsi32_si128(__B));
>  }

The formatting is wrong: missing spaces before function names and the opening
(, and too long lines.  Also, you've removed some builtin uses like
__builtin_ia32_psllwi256 above, but haven't removed those builtins from the
compiler (unlike the intrinsics, the builtins are not supported and can be
removed).  But I guess the primary question is for Uros: do we
want to handle this in the *intrin.h headers and thus increase the size
of those (and their parsing time etc.), or do we want to handle this
in the target folders (the tree one as well as the gimple one), where we'd
convert e.g. __builtin_ia32_psllwi256 to the shift if the shift count is
constant.

	Jakub


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-05-02 10:17       ` Jakub Jelinek
@ 2017-05-02 11:22         ` Allan Sandfeld Jensen
  2017-05-02 15:58         ` Marc Glisse
  1 sibling, 0 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-05-02 11:22 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Uros Bizjak, gcc-patches

On Tuesday 02 May 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 03:15:11PM +0200, Allan Sandfeld Jensen wrote:
> > Okay, I have tried that, and I also made it more obvious how the
> > intrinsics can become non-immediate shift.
> > 
> > 
> > diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> > index b58f5050db0..b9406550fc5 100644
> > --- a/gcc/ChangeLog
> > +++ b/gcc/ChangeLog
> > @@ -1,3 +1,10 @@
> > +2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
> > +
> > +	* config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
> > +	Use vector intrinstics instead of builtins.
> > +	* config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
> > +	Use vector intrinstics instead of builtins.
> > +
> > 
> >  2017-04-21  Uros Bizjak  <ubizjak@gmail.com>
> >  
> >  	* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
> > 
> > diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
> > index 82f170a3d61..64ba52b244e 100644
> > --- a/gcc/config/i386/avx2intrin.h
> > +++ b/gcc/config/i386/avx2intrin.h
> > @@ -665,13 +665,6 @@ _mm256_slli_si256 (__m256i __A, const int __N)
> > 
> >  extern __inline __m256i
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > 
> > -_mm256_slli_epi16 (__m256i __A, int __B)
> > -{
> > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > -}
> > -
> > -extern __inline __m256i
> > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > 
> >  _mm256_sll_epi16 (__m256i __A, __m128i __B)
> >  {
> >  
> >    return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
> > 
> > @@ -679,9 +672,11 @@ _mm256_sll_epi16 (__m256i __A, __m128i __B)
> > 
> >  extern __inline __m256i
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > 
> > -_mm256_slli_epi32 (__m256i __A, int __B)
> > +_mm256_slli_epi16 (__m256i __A, int __B)
> > 
> >  {
> > 
> > -  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
> > +  if (__builtin_constant_p(__B))
> > +    return ((unsigned int)__B < 16) ? (__m256i)((__v16hi)__A << __B) :
> > _mm256_setzero_si256(); +  return _mm256_sll_epi16(__A,
> > _mm_cvtsi32_si128(__B));
> > 
> >  }
> 
> The formatting is wrong, missing spaces before function names and opening
> (, too long lines.  Also, you've removed some builtin uses like
> __builtin_ia32_psllwi256 above, but haven't removed those builtins from the
> compiler (unlike the intrinsics, the builtins are not supported and can be
> removed).  But I guess the primary question is on Uros, do we
> want to handle this in the *intrin.h headers and thus increase the size
> of those (and their parsing time etc.), or do we want to handle this
> in the target folders (tree as well as gimple one), where we'd convert
> e.g. __builtin_ia32_psllwi256 to the shift if the shift count is constant.
> 
Ok. I will await what you decide.

Btw, I thought of an alternative idea: make a new set of built-ins, called for
instance __builtin_lshift and __builtin_rshift, that translate simply to
GIMPLE shifts, just like cpp_shifts currently does, the only difference being
that the new shifts (unlike C/C++ shifts) are defined for all shift sizes and
for negative values.  With this, variable shift intrinsics could also be
written without builtins.  Though, to do this without making a whole set of
them for all integer types, it would need to be implemented in the C parser
like __builtin_shuffle, and not with the other generic builtins.

Would that make sense?

Best regards
`Allan


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
  2017-05-02 10:17       ` Jakub Jelinek
  2017-05-02 11:22         ` Allan Sandfeld Jensen
@ 2017-05-02 15:58         ` Marc Glisse
  1 sibling, 0 replies; 16+ messages in thread
From: Marc Glisse @ 2017-05-02 15:58 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Allan Sandfeld Jensen, Uros Bizjak, gcc-patches

On Tue, 2 May 2017, Jakub Jelinek wrote:

> Also, you've removed some builtin uses like __builtin_ia32_psllwi256 
> above, but haven't removed those builtins from the compiler (unlike the 
> intrinsics, the builtins are not supported and can be removed).

When we changed previous intrinsics, the same issue came up, and Ada folks 
asked us to keep the builtins...

-- 
Marc Glisse


* Re: [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts
@ 2017-04-24  9:17 Allan Sandfeld Jensen
  0 siblings, 0 replies; 16+ messages in thread
From: Allan Sandfeld Jensen @ 2017-04-24  9:17 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: gcc-patches

On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote:
> > That is a different instruction. That is the vpsllw not vpsllwi
> > 
> > The intrinsics I changed is the immediate version, I didn't change the
> > non- immediate version. It is probably a bug if you can give
> > non-immediate values to the immediate only intrinsic. At least both
> > versions handles it, if in different ways, but is is illegal arguments.
> 
> The documentation is unclear on that and I've only recently fixed up some
> cases where these intrinsics weren't able to handle non-constant arguments
> in some cases, while both ICC and clang coped with that fine.
> So it is clearly allowed and handled by all the compilers and needs to be
> supported, people use that in real-world code.
> 
Undoubtedly it happens. I just made a mistake myself that created that case.
But it is rather unfortunate, and it means we currently generate wrong code for
corner-case values.

Note the difference in definition between the two intrinsics: 
_mm_sll_epi16:
FOR j := 0 to 7
	i := j*16
	IF count[63:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] << count[63:0])
	FI
ENDFOR

_mm_slli_epi16:
FOR j := 0 to 7
	i := j*16
	IF imm8[7:0] > 15
		dst[i+15:i] := 0
	ELSE
		dst[i+15:i] := ZeroExtend(a[i+15:i] << imm8[7:0])
	FI
ENDFOR

For a value such as 257, the immediate version does a 1-bit shift, while the
non-immediate version returns a zero vector. A simple function using the
immediate intrinsic therefore has to have an if statement if it is transformed
to use the non-immediate instruction.
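
Concretely (hypothetical callers, following the two pseudo-code definitions
above):

#include <emmintrin.h>

__m128i
imm_form (__m128i a)
{
  /* imm8[7:0] of 257 is 1, so the immediate form shifts by one bit.  */
  return _mm_slli_epi16 (a, 257);
}

__m128i
reg_form (__m128i a)
{
  /* count[63:0] is 257, which is > 15, so the register form yields 0.  */
  return _mm_sll_epi16 (a, _mm_cvtsi32_si128 (257));
}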

`Allan


end of thread, other threads:[~2017-05-02 15:48 UTC | newest]

Thread overview: 16+ messages
     [not found] <201704221338.46300.linux@carewolf.com>
2017-04-24  7:43 ` [PATCH] [x86] Avoid builtins for SSE/AVX2 immediate logical shifts Allan Sandfeld Jensen
2017-04-24  7:47   ` Jakub Jelinek
2017-04-24  8:02     ` Allan Sandfeld Jensen
2017-04-24  8:25       ` Jakub Jelinek
2017-04-24  8:25         ` Allan Sandfeld Jensen
2017-04-24  8:38           ` Jakub Jelinek
2017-04-24  8:40             ` Allan Sandfeld Jensen
2017-04-24  8:54               ` Allan Sandfeld Jensen
2017-04-24  8:57               ` Jakub Jelinek
2017-04-24 14:43     ` Allan Sandfeld Jensen
2017-05-02 10:17       ` Jakub Jelinek
2017-05-02 11:22         ` Allan Sandfeld Jensen
2017-05-02 15:58         ` Marc Glisse
     [not found] ` <201704241101.29634.linux@carewolf.com>
2017-04-24  9:38   ` Jakub Jelinek
2017-04-24  9:38     ` Allan Sandfeld Jensen
2017-04-24  9:17 Allan Sandfeld Jensen
