[PATCH] i386: Optimize pmovmskb on inverted vector to inversion of pmovmskb result [PR98461]

public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed

* [PATCH] i386: Optimize pmovmskb on inverted vector to inversion of pmovmskb result [PR98461]
@ 2020-12-30  9:23 Jakub Jelinek
  2020-12-30 10:08 ` Uros Bizjak
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2020-12-30  9:23 UTC (permalink / raw)
  To: Uros Bizjak, Kirill Yukhin; +Cc: gcc-patches

Hi!

The following patch adds combine splitters to optimize:
-	vpcmpeqd	%ymm1, %ymm1, %ymm1
-	vpandn	%ymm1, %ymm0, %ymm0
 	vpmovmskb	%ymm0, %eax
+	notl	%eax
etc. (for vectors with fewer than 32 elements, xorl is used instead of notl).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2020-12-30  Jakub Jelinek  <jakub@redhat.com>

	PR target/98461
	* config/i386/sse.md (<sse2_avx2>_pmovmskb): Add splitters
	for pmovmskb of NOT vector.

	* gcc.target/i386/sse2-pr98461.c: New test.
	* gcc.target/i386/avx2-pr98461.c: New test.

--- gcc/config/i386/sse.md.jj	2020-12-28 12:27:32.318754687 +0100
+++ gcc/config/i386/sse.md	2020-12-29 14:15:45.898508216 +0100
@@ -16099,6 +16099,53 @@ (define_insn "*sse2_pmovmskb_ext"
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "SI")])
 
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(unspec:SI
+	  [(not:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand"))]
+          UNSPEC_MOVMSK))]
+  "TARGET_SSE2"
+  [(set (match_dup 2)
+	(unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))
+   (set (match_dup 0) (match_dup 3))]
+{
+  operands[2] = gen_reg_rtx (SImode);
+  if (GET_MODE_NUNITS (<MODE>mode) == 32)
+    operands[3] = gen_rtx_NOT (SImode, operands[2]);
+  else
+    {
+      operands[3]
+	= gen_int_mode ((HOST_WIDE_INT_1 << GET_MODE_NUNITS (<MODE>mode)) - 1,
+			SImode);
+      operands[3] = gen_rtx_XOR (SImode, operands[2], operands[3]);
+    }
+})
+
+(define_split
+  [(set (match_operand:SI 0 "register_operand")
+	(unspec:SI
+	  [(subreg:VI1_AVX2 (not (match_operand 1 "register_operand")) 0)]
+          UNSPEC_MOVMSK))]
+  "TARGET_SSE2
+   && GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_INT
+   && GET_MODE_SIZE (GET_MODE (operands[1])) == <MODE_SIZE>"
+  [(set (match_dup 2)
+	(unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))
+   (set (match_dup 0) (match_dup 3))]
+{
+  operands[2] = gen_reg_rtx (SImode);
+  operands[1] = gen_lowpart (<MODE>mode, operands[1]);
+  if (GET_MODE_NUNITS (<MODE>mode) == 32)
+    operands[3] = gen_rtx_NOT (SImode, operands[2]);
+  else
+    {
+      operands[3]
+	= gen_int_mode ((HOST_WIDE_INT_1 << GET_MODE_NUNITS (<MODE>mode)) - 1,
+			SImode);
+      operands[3] = gen_rtx_XOR (SImode, operands[2], operands[3]);
+    }
+})
+
 (define_insn_and_split "*<sse2_avx2>_pmovmskb_lt"
   [(set (match_operand:SI 0 "register_operand" "=r")
 	(unspec:SI
--- gcc/testsuite/gcc.target/i386/sse2-pr98461.c.jj	2020-12-29 14:20:44.258146127 +0100
+++ gcc/testsuite/gcc.target/i386/sse2-pr98461.c	2020-12-29 14:23:11.462490600 +0100
@@ -0,0 +1,50 @@
+/* PR target/98461 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-sse3 -masm=att" } */
+/* { dg-final { scan-assembler-times "\tpmovmskb\t" 6 } } */
+/* { dg-final { scan-assembler-times "\txorl\t" 6 } } */
+/* { dg-final { scan-assembler-not "\tpcmpeq" } } */
+/* { dg-final { scan-assembler-not "\tpxor" } } */
+/* { dg-final { scan-assembler-not "\tpandn" } } */
+
+#include <x86intrin.h>
+
+int
+f1 (__m128i x)
+{
+  return _mm_movemask_epi8 (x) ^ 65535;
+}
+
+int
+f2 (__m128i x)
+{
+  return _mm_movemask_epi8 (_mm_andnot_si128 (x, _mm_set1_epi8 (255)));
+}
+
+int
+f3 (__v16qi x)
+{
+  x ^= (__v16qi) { -1, -1, -1, -1, -1, -1, -1, -1,
+		   -1, -1, -1, -1, -1, -1, -1, -1 };
+  return _mm_movemask_epi8 ((__m128i) x);
+}
+
+long
+f4 (__m128i x)
+{
+  return (unsigned) (_mm_movemask_epi8 (x) ^ 65535);
+}
+
+long
+f5 (__m128i x)
+{
+  return (unsigned) _mm_movemask_epi8 (_mm_andnot_si128 (x, _mm_set1_epi8 (255)));
+}
+
+long
+f6 (__v16qi x)
+{
+  x ^= (__v16qi) { -1, -1, -1, -1, -1, -1, -1, -1,
+		   -1, -1, -1, -1, -1, -1, -1, -1 };
+  return (unsigned) _mm_movemask_epi8 ((__m128i) x);
+}
--- gcc/testsuite/gcc.target/i386/avx2-pr98461.c.jj	2020-12-29 14:20:27.429335767 +0100
+++ gcc/testsuite/gcc.target/i386/avx2-pr98461.c	2020-12-29 14:19:50.944746895 +0100
@@ -0,0 +1,54 @@
+/* PR target/98461 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2 -masm=att" } */
+/* { dg-final { scan-assembler-times "\tvpmovmskb\t" 6 } } */
+/* { dg-final { scan-assembler-times "\tnotl\t" 6 } } */
+/* { dg-final { scan-assembler-not "\tvpcmpeq" } } */
+/* { dg-final { scan-assembler-not "\tvpxor" } } */
+/* { dg-final { scan-assembler-not "\tvpandn" } } */
+
+#include <x86intrin.h>
+
+int
+f1 (__m256i x)
+{
+  return ~_mm256_movemask_epi8 (x);
+}
+
+int
+f2 (__m256i x)
+{
+  return _mm256_movemask_epi8 (_mm256_andnot_si256 (x, _mm256_set1_epi8 (255)));
+}
+
+int
+f3 (__v32qi x)
+{
+  x ^= (__v32qi) { -1, -1, -1, -1, -1, -1, -1, -1,
+		   -1, -1, -1, -1, -1, -1, -1, -1,
+		   -1, -1, -1, -1, -1, -1, -1, -1,
+		   -1, -1, -1, -1, -1, -1, -1, -1 };
+  return _mm256_movemask_epi8 ((__m256i) x);
+}
+
+long
+f4 (__m256i x)
+{
+  return (unsigned) ~_mm256_movemask_epi8 (x);
+}
+
+long
+f5 (__m256i x)
+{
+  return (unsigned) _mm256_movemask_epi8 (_mm256_andnot_si256 (x, _mm256_set1_epi8 (255)));
+}
+
+long
+f6 (__v32qi x)
+{
+  x ^= (__v32qi) { -1, -1, -1, -1, -1, -1, -1, -1,
+		   -1, -1, -1, -1, -1, -1, -1, -1,
+		   -1, -1, -1, -1, -1, -1, -1, -1,
+		   -1, -1, -1, -1, -1, -1, -1, -1 };
+  return (unsigned) _mm256_movemask_epi8 ((__m256i) x);
+}

	Jakub


^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [PATCH] i386: Optimize pmovmskb on inverted vector to inversion of pmovmskb result [PR98461]
  2020-12-30  9:23 [PATCH] i386: Optimize pmovmskb on inverted vector to inversion of pmovmskb result [PR98461] Jakub Jelinek
@ 2020-12-30 10:08 ` Uros Bizjak
  0 siblings, 0 replies; 2+ messages in thread
From: Uros Bizjak @ 2020-12-30 10:08 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Kirill Yukhin, gcc-patches

On Wed, Dec 30, 2020 at 10:23 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following patch adds combine splitters to optimize:
> -       vpcmpeqd        %ymm1, %ymm1, %ymm1
> -       vpandn  %ymm1, %ymm0, %ymm0
>         vpmovmskb       %ymm0, %eax
> +       notl    %eax
> etc. (for vectors with fewer than 32 elements, xorl is used instead of notl).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2020-12-30  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/98461
>         * config/i386/sse.md (<sse2_avx2>_pmovmskb): Add splitters
>         for pmovmskb of NOT vector.
>
>         * gcc.target/i386/sse2-pr98461.c: New test.
>         * gcc.target/i386/avx2-pr98461.c: New test.

OK.

Thanks,
Uros.

>
> --- gcc/config/i386/sse.md.jj   2020-12-28 12:27:32.318754687 +0100
> +++ gcc/config/i386/sse.md      2020-12-29 14:15:45.898508216 +0100
> @@ -16099,6 +16099,53 @@ (define_insn "*sse2_pmovmskb_ext"
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "SI")])
>
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (unspec:SI
> +         [(not:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand"))]
> +          UNSPEC_MOVMSK))]
> +  "TARGET_SSE2"
> +  [(set (match_dup 2)
> +       (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))
> +   (set (match_dup 0) (match_dup 3))]
> +{
> +  operands[2] = gen_reg_rtx (SImode);
> +  if (GET_MODE_NUNITS (<MODE>mode) == 32)
> +    operands[3] = gen_rtx_NOT (SImode, operands[2]);
> +  else
> +    {
> +      operands[3]
> +       = gen_int_mode ((HOST_WIDE_INT_1 << GET_MODE_NUNITS (<MODE>mode)) - 1,
> +                       SImode);
> +      operands[3] = gen_rtx_XOR (SImode, operands[2], operands[3]);
> +    }
> +})
> +
> +(define_split
> +  [(set (match_operand:SI 0 "register_operand")
> +       (unspec:SI
> +         [(subreg:VI1_AVX2 (not (match_operand 1 "register_operand")) 0)]
> +          UNSPEC_MOVMSK))]
> +  "TARGET_SSE2
> +   && GET_MODE_CLASS (GET_MODE (operands[1])) == MODE_VECTOR_INT
> +   && GET_MODE_SIZE (GET_MODE (operands[1])) == <MODE_SIZE>"
> +  [(set (match_dup 2)
> +       (unspec:SI [(match_dup 1)] UNSPEC_MOVMSK))
> +   (set (match_dup 0) (match_dup 3))]
> +{
> +  operands[2] = gen_reg_rtx (SImode);
> +  operands[1] = gen_lowpart (<MODE>mode, operands[1]);
> +  if (GET_MODE_NUNITS (<MODE>mode) == 32)
> +    operands[3] = gen_rtx_NOT (SImode, operands[2]);
> +  else
> +    {
> +      operands[3]
> +       = gen_int_mode ((HOST_WIDE_INT_1 << GET_MODE_NUNITS (<MODE>mode)) - 1,
> +                       SImode);
> +      operands[3] = gen_rtx_XOR (SImode, operands[2], operands[3]);
> +    }
> +})
> +
>  (define_insn_and_split "*<sse2_avx2>_pmovmskb_lt"
>    [(set (match_operand:SI 0 "register_operand" "=r")
>         (unspec:SI
> --- gcc/testsuite/gcc.target/i386/sse2-pr98461.c.jj     2020-12-29 14:20:44.258146127 +0100
> +++ gcc/testsuite/gcc.target/i386/sse2-pr98461.c        2020-12-29 14:23:11.462490600 +0100
> @@ -0,0 +1,50 @@
> +/* PR target/98461 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -msse2 -mno-sse3 -masm=att" } */
> +/* { dg-final { scan-assembler-times "\tpmovmskb\t" 6 } } */
> +/* { dg-final { scan-assembler-times "\txorl\t" 6 } } */
> +/* { dg-final { scan-assembler-not "\tpcmpeq" } } */
> +/* { dg-final { scan-assembler-not "\tpxor" } } */
> +/* { dg-final { scan-assembler-not "\tpandn" } } */
> +
> +#include <x86intrin.h>
> +
> +int
> +f1 (__m128i x)
> +{
> +  return _mm_movemask_epi8 (x) ^ 65535;
> +}
> +
> +int
> +f2 (__m128i x)
> +{
> +  return _mm_movemask_epi8 (_mm_andnot_si128 (x, _mm_set1_epi8 (255)));
> +}
> +
> +int
> +f3 (__v16qi x)
> +{
> +  x ^= (__v16qi) { -1, -1, -1, -1, -1, -1, -1, -1,
> +                  -1, -1, -1, -1, -1, -1, -1, -1 };
> +  return _mm_movemask_epi8 ((__m128i) x);
> +}
> +
> +long
> +f4 (__m128i x)
> +{
> +  return (unsigned) (_mm_movemask_epi8 (x) ^ 65535);
> +}
> +
> +long
> +f5 (__m128i x)
> +{
> +  return (unsigned) _mm_movemask_epi8 (_mm_andnot_si128 (x, _mm_set1_epi8 (255)));
> +}
> +
> +long
> +f6 (__v16qi x)
> +{
> +  x ^= (__v16qi) { -1, -1, -1, -1, -1, -1, -1, -1,
> +                  -1, -1, -1, -1, -1, -1, -1, -1 };
> +  return (unsigned) _mm_movemask_epi8 ((__m128i) x);
> +}
> --- gcc/testsuite/gcc.target/i386/avx2-pr98461.c.jj     2020-12-29 14:20:27.429335767 +0100
> +++ gcc/testsuite/gcc.target/i386/avx2-pr98461.c        2020-12-29 14:19:50.944746895 +0100
> @@ -0,0 +1,54 @@
> +/* PR target/98461 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx2 -masm=att" } */
> +/* { dg-final { scan-assembler-times "\tvpmovmskb\t" 6 } } */
> +/* { dg-final { scan-assembler-times "\tnotl\t" 6 } } */
> +/* { dg-final { scan-assembler-not "\tvpcmpeq" } } */
> +/* { dg-final { scan-assembler-not "\tvpxor" } } */
> +/* { dg-final { scan-assembler-not "\tvpandn" } } */
> +
> +#include <x86intrin.h>
> +
> +int
> +f1 (__m256i x)
> +{
> +  return ~_mm256_movemask_epi8 (x);
> +}
> +
> +int
> +f2 (__m256i x)
> +{
> +  return _mm256_movemask_epi8 (_mm256_andnot_si256 (x, _mm256_set1_epi8 (255)));
> +}
> +
> +int
> +f3 (__v32qi x)
> +{
> +  x ^= (__v32qi) { -1, -1, -1, -1, -1, -1, -1, -1,
> +                  -1, -1, -1, -1, -1, -1, -1, -1,
> +                  -1, -1, -1, -1, -1, -1, -1, -1,
> +                  -1, -1, -1, -1, -1, -1, -1, -1 };
> +  return _mm256_movemask_epi8 ((__m256i) x);
> +}
> +
> +long
> +f4 (__m256i x)
> +{
> +  return (unsigned) ~_mm256_movemask_epi8 (x);
> +}
> +
> +long
> +f5 (__m256i x)
> +{
> +  return (unsigned) _mm256_movemask_epi8 (_mm256_andnot_si256 (x, _mm256_set1_epi8 (255)));
> +}
> +
> +long
> +f6 (__v32qi x)
> +{
> +  x ^= (__v32qi) { -1, -1, -1, -1, -1, -1, -1, -1,
> +                  -1, -1, -1, -1, -1, -1, -1, -1,
> +                  -1, -1, -1, -1, -1, -1, -1, -1,
> +                  -1, -1, -1, -1, -1, -1, -1, -1 };
> +  return (unsigned) _mm256_movemask_epi8 ((__m256i) x);
> +}
>
>         Jakub
>

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2020-12-30 10:08 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-30  9:23 [PATCH] i386: Optimize pmovmskb on inverted vector to inversion of pmovmskb result [PR98461] Jakub Jelinek
2020-12-30 10:08 ` Uros Bizjak

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).