public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
@ 2021-05-25  5:49 Hongtao Liu
  2021-05-25  6:11 ` Andrew Pinski
  0 siblings, 1 reply; 21+ messages in thread
From: Hongtao Liu @ 2021-05-25  5:49 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 615 bytes --]

Hi:
  This patch performs the transformation shown below.
  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

from
        notl    %edi
        vpbroadcastd    %edi, %xmm0
        vpand   %xmm1, %xmm0, %xmm0
to
        vpbroadcastd    %edi, %xmm0
        vpandn   %xmm1, %xmm0, %xmm0

gcc/ChangeLog:

        PR target/100711
        * config/i386/sse.md (*andnot<mode>3): New combine splitter
        after it.

gcc/testsuite/ChangeLog:

        PR target/100711
        * gcc.target/i386/avx2-pr100711.c: New test.
        * gcc.target/i386/avx512bw-pr100711.c: New test.


-- 
BR,
Hongtao

[-- Attachment #2: 0001-i386-Split-not-broadcast-pand-to-broadcast-pandn.patch --]
[-- Type: text/x-patch, Size: 5337 bytes --]

From 2a70b50fe3ebe129a66d8e4d5c8c025cb6df6e4c Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Fri, 21 May 2021 11:12:49 +0800
Subject: [PATCH] [i386] Split not+broadcast+pand to broadcast+pandn.

Split
	notl    %edi
      	vpbroadcastd    %edi, %xmm0
      	vpand   %xmm1, %xmm0, %xmm0
to
      	vpbroadcastd    %edi, %xmm0
      	vpandn   %xmm1, %xmm0, %xmm0

gcc/ChangeLog:

	PR target/100711
	* config/i386/sse.md (*andnot<mode>3): New combine splitter
	after it.

gcc/testsuite/ChangeLog:

	PR target/100711
	* gcc.target/i386/avx2-pr100711.c: New test.
	* gcc.target/i386/avx512bw-pr100711.c: New test.
---
 gcc/config/i386/sse.md                        | 20 +++++
 gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++
 .../gcc.target/i386/avx512bw-pr100711.c       | 48 ++++++++++++
 3 files changed, 141 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a4503ddcb73..999c7322aac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -3990,6 +3990,26 @@ (define_insn "*andnot<mode>3"
 	      ]
 	      (const_string "<ssevecmode>")))])
 
+;; Split
+;;	notl    %edi
+;;      vpbroadcastd    %edi, %xmm0
+;;      vpand   %xmm1, %xmm0, %xmm0
+;;to
+;;      vpbroadcastd    %edi, %xmm0
+;;      vpandn   %xmm1, %xmm0, %xmm0
+
+(define_split
+  [(set (match_operand:VI 0 "register_operand")
+	(and:VI
+	  (vec_duplicate:VI
+	    (not:<ssescalarmode>
+	      (match_operand:<ssescalarmode> 1 "register_operand")))
+	  (match_operand:VI 2 "bcst_vector_operand")))]
+  "TARGET_AVX2"
+  [(set (match_dup 3) (vec_duplicate:VI (match_dup 1)))
+   (set (match_dup 0) (and:VI (not:VI (match_dup 3)) (match_dup 2)))]
+  "operands[3] = gen_reg_rtx (<MODE>mode);")
+
 (define_insn "*andnottf3"
   [(set (match_operand:TF 0 "register_operand" "=x,x,v,v")
 	(and:TF
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
new file mode 100644
index 00000000000..5b144623873
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 8 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v16qi
+f1 (char a, v16qi c)
+{
+  char b = ~a;
+  return (__extension__(v16qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32qi
+f2 (char a, v32qi c)
+{
+  char b = ~a;
+  return (__extension__(v32qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8hi
+f3 (short a, v8hi c)
+{
+  short b = ~a;
+  return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v16hi
+f4 (short a, v16hi c)
+{
+  short b = ~a;
+  return (__extension__(v16hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v4si
+f5 (int a, v4si c)
+{
+  int b = ~a;
+  return (__extension__(v4si) {b, b, b, b}) & c;
+}
+
+v8si
+f6 (int a, v8si c)
+{
+  int b = ~a;
+  return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v2di
+f7 (long long a, v2di c)
+{
+  long long b = ~a;
+  return (__extension__(v2di) {b, b}) & c;
+}
+
+v4di
+f8 (long long a, v4di c)
+{
+  long long b = ~a;
+  return (__extension__(v4di) {b, b, b, b}) & c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
new file mode 100644
index 00000000000..f0a103d0bc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef long long v8di __attribute__((vector_size(64)));
+
+v64qi
+f1 (char a, v64qi c)
+{
+  char b = ~a;
+  return (__extension__(v64qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32hi
+f2 (short a, v32hi c)
+{
+  short b = ~a;
+  return (__extension__(v32hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v16si
+f3 (int a, v16si c)
+{
+  int b = ~a;
+  return (__extension__(v16si) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8di
+f4 (long long a, v8di c)
+{
+  long long b = ~a;
+  return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-05-25  5:49 [PATCH][i386] Split not+broadcast+pand to broadcast+pandn Hongtao Liu
@ 2021-05-25  6:11 ` Andrew Pinski
  2021-05-25  6:23   ` Hongtao Liu
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Pinski @ 2021-05-25  6:11 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: GCC Patches

On Mon, May 24, 2021 at 11:03 PM Hongtao Liu via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> Hi:
>   This patch is about to do transformation like below.
>   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>   Ok for trunk?
>
> from
>         notl    %edi
>         vpbroadcastd    %edi, %xmm0
>         vpand   %xmm1, %xmm0, %xmm0
> to
>         vpbroadcastd    %edi, %xmm0
>         vpandn   %xmm1, %xmm0, %xmm0
>
> gcc/ChangeLog:
>
>         PR target/100711
>         * config/i386/sse.md (*andnot<mode>3): New combine splitter
>         after it.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/100711
>         * gcc.target/i386/avx2-pr100711.c: New test.
>         * gcc.target/i386/avx512bw-pr100711.c: New test.
>


Does it make sense to make this more generic and have combine/simplify
rtx instead try:
(vec_dup (not)) to (not (vec_dup))

Thanks,
Andrew Pinski

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-05-25  6:11 ` Andrew Pinski
@ 2021-05-25  6:23   ` Hongtao Liu
  2021-05-25  6:29     ` Andrew Pinski
  0 siblings, 1 reply; 21+ messages in thread
From: Hongtao Liu @ 2021-05-25  6:23 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: GCC Patches

On Tue, May 25, 2021 at 2:11 PM Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Mon, May 24, 2021 at 11:03 PM Hongtao Liu via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> >
> > Hi:
> >   This patch is about to do transformation like below.
> >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> >   Ok for trunk?
> >
> > from
> >         notl    %edi
> >         vpbroadcastd    %edi, %xmm0
> >         vpand   %xmm1, %xmm0, %xmm0
> > to
> >         vpbroadcastd    %edi, %xmm0
> >         vpandn   %xmm1, %xmm0, %xmm0
> >
> > gcc/ChangeLog:
> >
> >         PR target/100711
> >         * config/i386/sse.md (*andnot<mode>3): New combine splitter
> >         after it.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/100711
> >         * gcc.target/i386/avx2-pr100711.c: New test.
> >         * gcc.target/i386/avx512bw-pr100711.c: New test.
> >
>
>
> Does it make sense to make this more generic and have combine/simplify
> rtx instead try:
> (vec_dup (not)) to (not (vec_dup))

Even with that, a combine splitter is still needed, since we don't have
any pandn pattern that takes a vec_duplicate as op1, nor a "not" pattern
for vector modes.  Generic simplification only helps combine/forprop
match more possibilities; it does not split the pattern by itself.

>
> Thanks,
> Andrew Pinski



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-05-25  6:23   ` Hongtao Liu
@ 2021-05-25  6:29     ` Andrew Pinski
  2021-05-25  6:34       ` Hongtao Liu
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Pinski @ 2021-05-25  6:29 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: GCC Patches

On Mon, May 24, 2021 at 11:23 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Tue, May 25, 2021 at 2:11 PM Andrew Pinski <pinskia@gmail.com> wrote:
> >
> > On Mon, May 24, 2021 at 11:03 PM Hongtao Liu via Gcc-patches
> > <gcc-patches@gcc.gnu.org> wrote:
> > >
> > > Hi:
> > >   This patch is about to do transformation like below.
> > >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > >   Ok for trunk?
> > >
> > > from
> > >         notl    %edi
> > >         vpbroadcastd    %edi, %xmm0
> > >         vpand   %xmm1, %xmm0, %xmm0
> > > to
> > >         vpbroadcastd    %edi, %xmm0
> > >         vpandn   %xmm1, %xmm0, %xmm0
> > >
> > > gcc/ChangeLog:
> > >
> > >         PR target/100711
> > >         * config/i386/sse.md (*andnot<mode>3): New combine splitter
> > >         after it.
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR target/100711
> > >         * gcc.target/i386/avx2-pr100711.c: New test.
> > >         * gcc.target/i386/avx512bw-pr100711.c: New test.
> > >
> >
> >
> > Does it make sense to make this more generic and have combine/simplify
> > rtx instead try:
> > (vec_dup (not)) to (not (vec_dup))
>
> Even w/ that, a combine splitter is still needed since we don't have
> any pandn patterns which contain op1 as vec_duplicate or "not" pattern
> for vector mode,  generic simplification only helps combine/forprop to
> match more possibilities, but not split pattern by itself.

Huh?  This is a 3->2 combination, which definitely can happen.
You don't need a "not" pattern for the vector mode for this to happen
since the combining happens without an insn defined.

Thanks,
Andrew Pinski

>
> >
> > Thanks,
> > Andrew Pinski
>
>
>
> --
> BR,
> Hongtao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-05-25  6:29     ` Andrew Pinski
@ 2021-05-25  6:34       ` Hongtao Liu
  2021-05-26  1:21         ` Hongtao Liu
  0 siblings, 1 reply; 21+ messages in thread
From: Hongtao Liu @ 2021-05-25  6:34 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: GCC Patches

On Tue, May 25, 2021 at 2:29 PM Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Mon, May 24, 2021 at 11:23 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > On Tue, May 25, 2021 at 2:11 PM Andrew Pinski <pinskia@gmail.com> wrote:
> > >
> > > On Mon, May 24, 2021 at 11:03 PM Hongtao Liu via Gcc-patches
> > > <gcc-patches@gcc.gnu.org> wrote:
> > > >
> > > > Hi:
> > > >   This patch is about to do transformation like below.
> > > >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > > >   Ok for trunk?
> > > >
> > > > from
> > > >         notl    %edi
> > > >         vpbroadcastd    %edi, %xmm0
> > > >         vpand   %xmm1, %xmm0, %xmm0
> > > > to
> > > >         vpbroadcastd    %edi, %xmm0
> > > >         vpandn   %xmm1, %xmm0, %xmm0
> > > >
> > > > gcc/ChangeLog:
> > > >
> > > >         PR target/100711
> > > >         * config/i386/sse.md (*andnot<mode>3): New combine splitter
> > > >         after it.
> > > >
> > > > gcc/testsuite/ChangeLog:
> > > >
> > > >         PR target/100711
> > > >         * gcc.target/i386/avx2-pr100711.c: New test.
> > > >         * gcc.target/i386/avx512bw-pr100711.c: New test.
> > > >
> > >
> > >
> > > Does it make sense to make this more generic and have combine/simplify
> > > rtx instead try:
> > > (vec_dup (not)) to (not (vec_dup))
> >
> > Even w/ that, a combine splitter is still needed since we don't have
> > any pandn patterns which contain op1 as vec_duplicate or "not" pattern
> > for vector mode,  generic simplification only helps combine/forprop to
> > match more possibilities, but not split pattern by itself.
>
> Huh?  This is a 3->2 combining which definitely just happen.
Oh, I didn't know that, thanks for the clarification; I thought
combine only does n->1.
> You don't need a "not" pattern for the vector mode for this to happen
> since the combining happens without an insn defined.
>
> Thanks,
> Andrew Pinski
>
> >
> > >
> > > Thanks,
> > > Andrew Pinski
> >
> >
> >
> > --
> > BR,
> > Hongtao



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-05-25  6:34       ` Hongtao Liu
@ 2021-05-26  1:21         ` Hongtao Liu
  2021-05-26  4:12           ` Andrew Pinski
  0 siblings, 1 reply; 21+ messages in thread
From: Hongtao Liu @ 2021-05-26  1:21 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: GCC Patches, Andrew Pinski

[-- Attachment #1: Type: text/plain, Size: 903 bytes --]

Update patch:
  The new patch simplifies (vec_duplicate (not (nonimmediate_operand)))
to (not (vec_duplicate (nonimmediate_operand))). This is not a
straightforward simplification; it just adds a tendency to pull the not
out of vec_duplicate.

  For i386, it will enable below opt

from
        notl    %edi
        vpbroadcastd    %edi, %xmm0
        vpand   %xmm1, %xmm0, %xmm0
to
        vpbroadcastd    %edi, %xmm0
        vpandn   %xmm1, %xmm0, %xmm0

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?
gcc/ChangeLog:

        PR target/100711
        * simplify-rtx.c (simplify_unary_operation_1):
        Simplify (vec_duplicate (not (nonimmedaite_operand)))
        to (not (vec_duplicate (nonimmedaite_operand))).

gcc/testsuite/ChangeLog:

        PR target/100711
        * gcc.target/i386/avx2-pr100711.c: New test.
        * gcc.target/i386/avx512bw-pr100711.c: New test.

[-- Attachment #2: 0001-Simplify-vec_duplicate-not-nonimmedaite_operand-to-n.patch --]
[-- Type: text/x-patch, Size: 5492 bytes --]

From aa36def1266538fdda02177be8dbf9433d7e959c Mon Sep 17 00:00:00 2001
From: liuhongt <hongtao.liu@intel.com>
Date: Tue, 25 May 2021 17:17:32 +0800
Subject: [PATCH] Simplify (vec_duplicate (not (nonimmedaite_operand))) to (not
 (vec_duplicate (nonimmedaite_operand))).

This is not a straightforward simplification, just adding some
tendency to pull not out of vec_duplicate.

For i386, it will enable below opt

from
	notl    %edi
      	vpbroadcastd    %edi, %xmm0
      	vpand   %xmm1, %xmm0, %xmm0
to
      	vpbroadcastd    %edi, %xmm0
      	vpandn   %xmm1, %xmm0, %xmm0

gcc/ChangeLog:

	PR target/100711
	* simplify-rtx.c (simplify_unary_operation_1):
	Simplify (vec_duplicate (not (nonimmedaite_operand)))
	to (not (vec_duplicate (nonimmedaite_operand))).

gcc/testsuite/ChangeLog:

	PR target/100711
	* gcc.target/i386/avx2-pr100711.c: New test.
	* gcc.target/i386/avx512bw-pr100711.c: New test.
---
 gcc/simplify-rtx.c                            |  9 +++
 gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++
 .../gcc.target/i386/avx512bw-pr100711.c       | 48 ++++++++++++
 3 files changed, 130 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c

diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 04423bbd195..bb23183a8e0 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -36,6 +36,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "selftest.h"
 #include "selftest-rtl.h"
 #include "rtx-vector-builder.h"
+#include "tm_p.h"
 
 /* Simplification and canonicalization of RTL.  */
 
@@ -1708,6 +1709,14 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
 #endif
       break;
 
+    /* Prefer (not (vec_duplicate (nonimmedaite_operand)))
+       to (vec_duplicate (not (nonimmedaite_operand))).  */
+    case VEC_DUPLICATE:
+      if (GET_CODE (op) == NOT
+	  && nonimmediate_operand (XEXP (op, 0), GET_MODE (op)))
+	return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0)));
+      break;
+
     default:
       break;
     }
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
new file mode 100644
index 00000000000..5b144623873
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 8 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v16qi
+f1 (char a, v16qi c)
+{
+  char b = ~a;
+  return (__extension__(v16qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32qi
+f2 (char a, v32qi c)
+{
+  char b = ~a;
+  return (__extension__(v32qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8hi
+f3 (short a, v8hi c)
+{
+  short b = ~a;
+  return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v16hi
+f4 (short a, v16hi c)
+{
+  short b = ~a;
+  return (__extension__(v16hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v4si
+f5 (int a, v4si c)
+{
+  int b = ~a;
+  return (__extension__(v4si) {b, b, b, b}) & c;
+}
+
+v8si
+f6 (int a, v8si c)
+{
+  int b = ~a;
+  return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v2di
+f7 (long long a, v2di c)
+{
+  long long b = ~a;
+  return (__extension__(v2di) {b, b}) & c;
+}
+
+v4di
+f8 (long long a, v4di c)
+{
+  long long b = ~a;
+  return (__extension__(v4di) {b, b, b, b}) & c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
new file mode 100644
index 00000000000..f0a103d0bc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef long long v8di __attribute__((vector_size(64)));
+
+v64qi
+f1 (char a, v64qi c)
+{
+  char b = ~a;
+  return (__extension__(v64qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32hi
+f2 (short a, v32hi c)
+{
+  short b = ~a;
+  return (__extension__(v32hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v16si
+f3 (int a, v16si c)
+{
+  int b = ~a;
+  return (__extension__(v16si) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8di
+f4 (long long a, v8di c)
+{
+  long long b = ~a;
+  return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-05-26  1:21         ` Hongtao Liu
@ 2021-05-26  4:12           ` Andrew Pinski
  2021-05-26  5:17             ` Hongtao Liu
  0 siblings, 1 reply; 21+ messages in thread
From: Andrew Pinski @ 2021-05-26  4:12 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Segher Boessenkool, GCC Patches

On Tue, May 25, 2021 at 6:17 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> Update patch:
>   The new patch simplify (vec_duplicate (not (nonimmedaite_operand)))
> to (not (vec_duplicate (nonimmedaite_operand))). This is not a
> straightforward simplification, just adding some tendency to pull not
> out of vec_duplicate.
>
>   For i386, it will enable below opt
>
> from
>         notl    %edi
>         vpbroadcastd    %edi, %xmm0
>         vpand   %xmm1, %xmm0, %xmm0
> to
>         vpbroadcastd    %edi, %xmm0
>         vpandn   %xmm1, %xmm0, %xmm0
>
>   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
>   Ok for trunk?
> gcc/ChangeLog:
>
>         PR target/100711
>         * simplify-rtx.c (simplify_unary_operation_1):
>         Simplify (vec_duplicate (not (nonimmedaite_operand)))
>         to (not (vec_duplicate (nonimmedaite_operand))).
>
> gcc/testsuite/ChangeLog:
>
>         PR target/100711
>         * gcc.target/i386/avx2-pr100711.c: New test.
>         * gcc.target/i386/avx512bw-pr100711.c: New test.

This patch should not use nonimmedaite_operand at all in
simplify-rtx.c.  Rather use !CONSTANT_P (XEXP (op, 0)) instead.
And even then (not CONST_INT) will never be there anyways as it will
always be simplified to a constant in the first place.  So removing
that check is fine.

Thanks,
Andrew

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-05-26  4:12           ` Andrew Pinski
@ 2021-05-26  5:17             ` Hongtao Liu
  2021-06-01  8:32               ` Hongtao Liu
  0 siblings, 1 reply; 21+ messages in thread
From: Hongtao Liu @ 2021-05-26  5:17 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Segher Boessenkool, GCC Patches

On Wed, May 26, 2021 at 12:12 PM Andrew Pinski <pinskia@gmail.com> wrote:
>
> On Tue, May 25, 2021 at 6:17 PM Hongtao Liu <crazylht@gmail.com> wrote:
> >
> > Update patch:
> >   The new patch simplify (vec_duplicate (not (nonimmedaite_operand)))
> > to (not (vec_duplicate (nonimmedaite_operand))). This is not a
> > straightforward simplification, just adding some tendency to pull not
> > out of vec_duplicate.
> >
> >   For i386, it will enable below opt
> >
> > from
> >         notl    %edi
> >         vpbroadcastd    %edi, %xmm0
> >         vpand   %xmm1, %xmm0, %xmm0
> > to
> >         vpbroadcastd    %edi, %xmm0
> >         vpandn   %xmm1, %xmm0, %xmm0
> >
> >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> >   Ok for trunk?
> > gcc/ChangeLog:
> >
> >         PR target/100711
> >         * simplify-rtx.c (simplify_unary_operation_1):
> >         Simplify (vec_duplicate (not (nonimmedaite_operand)))
> >         to (not (vec_duplicate (nonimmedaite_operand))).
> >
> > gcc/testsuite/ChangeLog:
> >
> >         PR target/100711
> >         * gcc.target/i386/avx2-pr100711.c: New test.
> >         * gcc.target/i386/avx512bw-pr100711.c: New test.
>
> This patch should not use nonimmedaite_operand at all in
There's no simplification opportunity for nonimmediate_operand, but
I'm not sure about other (non-constant) cases.
Reading the code for case NOT in simplify_unary_operation_1, there
may be something like (vec_duplicate (not (plus X -1)))?

> simplify-rtx.c.  Rather use !CONSTANT_P (XEXP (op, 0)) instead.
> And even then (not CONST_INT) will never be there anyways as it will
> always be simplified to a constant in the first place.  So removing
> that check is fine.
>
> Thanks,
> Andrew



-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-05-26  5:17             ` Hongtao Liu
@ 2021-06-01  8:32               ` Hongtao Liu
  2021-06-01 13:54                 ` Segher Boessenkool
  2021-06-01 14:02                 ` Segher Boessenkool
  0 siblings, 2 replies; 21+ messages in thread
From: Hongtao Liu @ 2021-06-01  8:32 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Segher Boessenkool, GCC Patches

[-- Attachment #1: Type: text/plain, Size: 1835 bytes --]

On Wed, May 26, 2021 at 1:17 PM Hongtao Liu <crazylht@gmail.com> wrote:
>
> On Wed, May 26, 2021 at 12:12 PM Andrew Pinski <pinskia@gmail.com> wrote:
> >
> > On Tue, May 25, 2021 at 6:17 PM Hongtao Liu <crazylht@gmail.com> wrote:
> > >
> > > Update patch:
> > >   The new patch simplify (vec_duplicate (not (nonimmedaite_operand)))
> > > to (not (vec_duplicate (nonimmedaite_operand))). This is not a
> > > straightforward simplification, just adding some tendency to pull not
> > > out of vec_duplicate.
> > >
> > >   For i386, it will enable below opt
> > >
> > > from
> > >         notl    %edi
> > >         vpbroadcastd    %edi, %xmm0
> > >         vpand   %xmm1, %xmm0, %xmm0
> > > to
> > >         vpbroadcastd    %edi, %xmm0
> > >         vpandn   %xmm1, %xmm0, %xmm0
> > >
> > >   Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
> > >   Ok for trunk?
> > > gcc/ChangeLog:
> > >
> > >         PR target/100711
> > >         * simplify-rtx.c (simplify_unary_operation_1):
> > >         Simplify (vec_duplicate (not (nonimmedaite_operand)))
> > >         to (not (vec_duplicate (nonimmedaite_operand))).
> > >
> > > gcc/testsuite/ChangeLog:
> > >
> > >         PR target/100711
> > >         * gcc.target/i386/avx2-pr100711.c: New test.
> > >         * gcc.target/i386/avx512bw-pr100711.c: New test.
> >
> > This patch should not use nonimmedaite_operand at all in
> There's no simplification opportunity for nonimmediate_operand, but
> I'm not sure for other cases(not constants).
> Reading from codes in case NOT of simplify_unary_operation_1, there
> may be (vec_duplicate (not (plus X - 1))???

After reconsidering, I think you're right, (not  op) will be
simplified in the first place, so the updated patch just pulls not out
of vec_duplicate.

> >
> > Thanks,
> > Andrew
>
>
>
> --
> BR,
> Hongtao



--
BR,
Hongtao

[-- Attachment #2: 0001-Simplify-vec_duplicate-not-op-to-not-vec_duplicate-o_v3.patch --]
[-- Type: application/x-patch, Size: 5125 bytes --]

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-06-01  8:32               ` Hongtao Liu
@ 2021-06-01 13:54                 ` Segher Boessenkool
  2021-06-01 14:02                 ` Segher Boessenkool
  1 sibling, 0 replies; 21+ messages in thread
From: Segher Boessenkool @ 2021-06-01 13:54 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Andrew Pinski, GCC Patches

On Tue, Jun 01, 2021 at 04:32:42PM +0800, Hongtao Liu wrote:

[ no attachment to reply to ]

Please send this with either the patch actually inline, or as
attachment with content-disposition inline, no encoding, and a valid
text mimetype.  So that people can see it, also on the archives, and
actually reply to it!

I'll see if I can fix it up this time.


Segher

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH][i386] Split not+broadcast+pand to broadcast+pandn.
  2021-06-01  8:32               ` Hongtao Liu
  2021-06-01 13:54                 ` Segher Boessenkool
@ 2021-06-01 14:02                 ` Segher Boessenkool
  2021-06-02  5:39                   ` liuhongt
  2021-06-02  5:41                   ` [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)) liuhongt
  1 sibling, 2 replies; 21+ messages in thread
From: Segher Boessenkool @ 2021-06-01 14:02 UTC (permalink / raw)
  To: Hongtao Liu; +Cc: Andrew Pinski, GCC Patches

> 	PR target/100711
> 	* simplify-rtx.c (simplify_unary_operation_1):
> 	Simplify (vec_duplicate (not op)) to (not (vec_duplicate op)).

This is not a simplification.  If we want to do this we need to document
this canonicalisation (in md.texi, "Insn Canonicalizations").

> +    /* Prefer (not (vec_duplicate (nonimmedaite_operand)))
> +       to (vec_duplicate (not (nonimmedaite_operand))).  */

What Andrew said here (also, it's misspelled :-) )

> +    case VEC_DUPLICATE:
> +      if (GET_CODE (op) == NOT)
> +	return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0)));
> +      break;

If it isn't a canonicalisation you need to simplify the result, and then
only do it if it does in fact simplify.  You risk "simplification" loops
if you don't.


Segher

^ permalink raw reply	[flat|nested] 21+ messages in thread

* (no subject)
  2021-06-01 14:02                 ` Segher Boessenkool
@ 2021-06-02  5:39                   ` liuhongt
  2021-06-02  5:39                     ` [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)) liuhongt
  2021-06-02  5:49                     ` Hongtao Liu
  2021-06-02  5:41                   ` [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)) liuhongt
  1 sibling, 2 replies; 21+ messages in thread
From: liuhongt @ 2021-06-02  5:39 UTC (permalink / raw)
  To: segher; +Cc: gcc-patches, pinskia

This is the updated patch.



^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).
  2021-06-02  5:39                   ` liuhongt
@ 2021-06-02  5:39                     ` liuhongt
  2021-06-02  7:07                       ` Richard Biener
  2021-06-02  5:49                     ` Hongtao Liu
  1 sibling, 1 reply; 21+ messages in thread
From: liuhongt @ 2021-06-02  5:39 UTC (permalink / raw)
  To: segher; +Cc: gcc-patches, pinskia

For i386, it will enable below opt

from
	notl    %edi
      	vpbroadcastd    %edi, %xmm0
      	vpand   %xmm1, %xmm0, %xmm0
to
      	vpbroadcastd    %edi, %xmm0
      	vpandn   %xmm1, %xmm0, %xmm0

gcc/ChangeLog:

	PR target/100711
	* simplify-rtx.c (simplify_unary_operation_1):
	Canonicalize (vec_duplicate (not A)) to
	(not (vec_duplicate A)).
	* doc/md.texi (Insn Canonicalizations): Document
	canonicalization of vec_duplicate.

gcc/testsuite/ChangeLog:

	PR target/100711
	* gcc.target/i386/avx2-pr100711.c: New test.
	* gcc.target/i386/avx512bw-pr100711.c: New test.
---
 gcc/doc/md.texi                               |  5 ++
 gcc/simplify-rtx.c                            |  6 ++
 gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++
 .../gcc.target/i386/avx512bw-pr100711.c       | 48 ++++++++++++
 4 files changed, 132 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 0e65b3ae663..06b42901413 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -8297,6 +8297,11 @@ operand of @code{mult} is also a shift, then that is extended also.
 This transformation is only applied when it can be proven that the
 original operation had sufficient precision to prevent overflow.
 
+@cindex @code{vec_duplicate}, canonicalization of
+@item
+@code{(vec_duplicate (not @var{a}))} is converted to
+@code{(not (vec_duplicate @var{a}))}.
+
 @end itemize
 
 Further canonicalization rules are defined in the function
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 04423bbd195..171fc447d50 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -1708,6 +1708,12 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
 #endif
       break;
 
+      /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).  */
+    case VEC_DUPLICATE:
+      if (GET_CODE (op) == NOT)
+	return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0)));
+      break;
+
     default:
       break;
     }
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
new file mode 100644
index 00000000000..5b144623873
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 8 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v16qi
+f1 (char a, v16qi c)
+{
+  char b = ~a;
+  return (__extension__(v16qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32qi
+f2 (char a, v32qi c)
+{
+  char b = ~a;
+  return (__extension__(v32qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8hi
+f3 (short a, v8hi c)
+{
+  short b = ~a;
+  return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v16hi
+f4 (short a, v16hi c)
+{
+  short b = ~a;
+  return (__extension__(v16hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v4si
+f5 (int a, v4si c)
+{
+  int b = ~a;
+  return (__extension__(v4si) {b, b, b, b}) & c;
+}
+
+v8si
+f6 (int a, v8si c)
+{
+  int b = ~a;
+  return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v2di
+f7 (long long a, v2di c)
+{
+  long long b = ~a;
+  return (__extension__(v2di) {b, b}) & c;
+}
+
+v4di
+f8 (long long a, v4di c)
+{
+  long long b = ~a;
+  return (__extension__(v4di) {b, b, b, b}) & c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
new file mode 100644
index 00000000000..f0a103d0bc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef long long v8di __attribute__((vector_size(64)));
+
+v64qi
+f1 (char a, v64qi c)
+{
+  char b = ~a;
+  return (__extension__(v64qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32hi
+f2 (short a, v32hi c)
+{
+  short b = ~a;
+  return (__extension__(v32hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v16si
+f3 (int a, v16si c)
+{
+  int b = ~a;
+  return (__extension__(v16si) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8di
+f4 (long long a, v8di c)
+{
+  long long b = ~a;
+  return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).
  2021-06-01 14:02                 ` Segher Boessenkool
  2021-06-02  5:39                   ` liuhongt
@ 2021-06-02  5:41                   ` liuhongt
  1 sibling, 0 replies; 21+ messages in thread
From: liuhongt @ 2021-06-02  5:41 UTC (permalink / raw)
  To: segher; +Cc: gcc-patches, pinskia

For i386, it will enable below opt

from
	notl    %edi
      	vpbroadcastd    %edi, %xmm0
      	vpand   %xmm1, %xmm0, %xmm0
to
      	vpbroadcastd    %edi, %xmm0
      	vpandn   %xmm1, %xmm0, %xmm0

gcc/ChangeLog:

	PR target/100711
	* simplify-rtx.c (simplify_unary_operation_1):
	Canonicalize (vec_duplicate (not A)) to
	(not (vec_duplicate A)).
	* doc/md.texi (Insn Canonicalizations): Document
	canonicalization of vec_duplicate.

gcc/testsuite/ChangeLog:

	PR target/100711
	* gcc.target/i386/avx2-pr100711.c: New test.
	* gcc.target/i386/avx512bw-pr100711.c: New test.
---
 gcc/doc/md.texi                               |  5 ++
 gcc/simplify-rtx.c                            |  6 ++
 gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++
 .../gcc.target/i386/avx512bw-pr100711.c       | 48 ++++++++++++
 4 files changed, 132 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c
 create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c

diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 0e65b3ae663..06b42901413 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -8297,6 +8297,11 @@ operand of @code{mult} is also a shift, then that is extended also.
 This transformation is only applied when it can be proven that the
 original operation had sufficient precision to prevent overflow.
 
+@cindex @code{vec_duplicate}, canonicalization of
+@item
+@code{(vec_duplicate (not @var{a}))} is converted to
+@code{(not (vec_duplicate @var{a}))}.
+
 @end itemize
 
 Further canonicalization rules are defined in the function
diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
index 04423bbd195..171fc447d50 100644
--- a/gcc/simplify-rtx.c
+++ b/gcc/simplify-rtx.c
@@ -1708,6 +1708,12 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
 #endif
       break;
 
+      /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).  */
+    case VEC_DUPLICATE:
+      if (GET_CODE (op) == NOT)
+	return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0)));
+      break;
+
     default:
       break;
     }
diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
new file mode 100644
index 00000000000..5b144623873
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 8 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+typedef char v16qi __attribute__((vector_size(16)));
+typedef char v32qi __attribute__((vector_size(32)));
+typedef short v8hi __attribute__((vector_size(16)));
+typedef short v16hi __attribute__((vector_size(32)));
+typedef int v4si __attribute__((vector_size(16)));
+typedef int v8si __attribute__((vector_size(32)));
+typedef long long v2di __attribute__((vector_size(16)));
+typedef long long v4di __attribute__((vector_size(32)));
+
+v16qi
+f1 (char a, v16qi c)
+{
+  char b = ~a;
+  return (__extension__(v16qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32qi
+f2 (char a, v32qi c)
+{
+  char b = ~a;
+  return (__extension__(v32qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8hi
+f3 (short a, v8hi c)
+{
+  short b = ~a;
+  return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v16hi
+f4 (short a, v16hi c)
+{
+  short b = ~a;
+  return (__extension__(v16hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v4si
+f5 (int a, v4si c)
+{
+  int b = ~a;
+  return (__extension__(v4si) {b, b, b, b}) & c;
+}
+
+v8si
+f6 (int a, v8si c)
+{
+  int b = ~a;
+  return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c;
+}
+
+v2di
+f7 (long long a, v2di c)
+{
+  long long b = ~a;
+  return (__extension__(v2di) {b, b}) & c;
+}
+
+v4di
+f8 (long long a, v4di c)
+{
+  long long b = ~a;
+  return (__extension__(v4di) {b, b, b, b}) & c;
+}
diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
new file mode 100644
index 00000000000..f0a103d0bc2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
@@ -0,0 +1,48 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512bw -O2" } */
+/* { dg-final { scan-assembler-times "pandn" 4 } } */
+/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
+
+typedef char v64qi __attribute__((vector_size(64)));
+typedef short v32hi __attribute__((vector_size(64)));
+typedef int v16si __attribute__((vector_size(64)));
+typedef long long v8di __attribute__((vector_size(64)));
+
+v64qi
+f1 (char a, v64qi c)
+{
+  char b = ~a;
+  return (__extension__(v64qi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v32hi
+f2 (short a, v32hi c)
+{
+  short b = ~a;
+  return (__extension__(v32hi) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v16si
+f3 (int a, v16si c)
+{
+  int b = ~a;
+  return (__extension__(v16si) {b, b, b, b, b, b, b, b,
+				 b, b, b, b, b, b, b, b}) & c;
+}
+
+v8di
+f4 (long long a, v8di c)
+{
+  long long b = ~a;
+  return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c;
+}
-- 
2.18.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re:
  2021-06-02  5:39                   ` liuhongt
  2021-06-02  5:39                     ` [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)) liuhongt
@ 2021-06-02  5:49                     ` Hongtao Liu
  1 sibling, 0 replies; 21+ messages in thread
From: Hongtao Liu @ 2021-06-02  5:49 UTC (permalink / raw)
  To: liuhongt; +Cc: Segher Boessenkool, GCC Patches

Please discard this one, sorry for disturbing.
Obviously I'm new to git send-email.

On Wed, Jun 2, 2021 at 1:40 PM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> This is the updated patch.
>
>


-- 
BR,
Hongtao

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).
  2021-06-02  5:39                     ` [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)) liuhongt
@ 2021-06-02  7:07                       ` Richard Biener
  2021-06-02 20:46                         ` Segher Boessenkool
  0 siblings, 1 reply; 21+ messages in thread
From: Richard Biener @ 2021-06-02  7:07 UTC (permalink / raw)
  To: liuhongt; +Cc: Segher Boessenkool, GCC Patches

On Wed, Jun 2, 2021 at 7:41 AM liuhongt via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:
>
> For i386, it will enable below opt
>
> from
>         notl    %edi
>         vpbroadcastd    %edi, %xmm0
>         vpand   %xmm1, %xmm0, %xmm0
> to
>         vpbroadcastd    %edi, %xmm0
>         vpandn   %xmm1, %xmm0, %xmm0

There will be cases where (vec_duplicate (not A)) is better
than (not (vec_duplicate A)), so I'm not sure it is a good idea
to forcefully canonicalize unary operations.  I suppose the
simplification happens inside combine - doesn't combine
already have code to try variants of an expression and isn't
this a good candidate that can be added there, avoiding
the canonicalization?

Richard.

> gcc/ChangeLog:
>
>         PR target/100711
>         * simplify-rtx.c (simplify_unary_operation_1):
>         Canonicalize (vec_duplicate (not A)) to
>         (not (vec_duplicate A)).
>         * doc/md.texi (Insn Canonicalizations): Document
>         canonicalization of vec_duplicate.
>
> gcc/testsuite/ChangeLog:
>
>         PR target/100711
>         * gcc.target/i386/avx2-pr100711.c: New test.
>         * gcc.target/i386/avx512bw-pr100711.c: New test.
> ---
>  gcc/doc/md.texi                               |  5 ++
>  gcc/simplify-rtx.c                            |  6 ++
>  gcc/testsuite/gcc.target/i386/avx2-pr100711.c | 73 +++++++++++++++++++
>  .../gcc.target/i386/avx512bw-pr100711.c       | 48 ++++++++++++
>  4 files changed, 132 insertions(+)
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx2-pr100711.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
>
> diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
> index 0e65b3ae663..06b42901413 100644
> --- a/gcc/doc/md.texi
> +++ b/gcc/doc/md.texi
> @@ -8297,6 +8297,11 @@ operand of @code{mult} is also a shift, then that is extended also.
>  This transformation is only applied when it can be proven that the
>  original operation had sufficient precision to prevent overflow.
>
> +@cindex @code{vec_duplicate}, canonicalization of
> +@item
> +@code{(vec_duplicate (not @var{a}))} is converted to
> +@code{(not (vec_duplicate @var{a}))}.
> +
>  @end itemize
>
>  Further canonicalization rules are defined in the function
> diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c
> index 04423bbd195..171fc447d50 100644
> --- a/gcc/simplify-rtx.c
> +++ b/gcc/simplify-rtx.c
> @@ -1708,6 +1708,12 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
>  #endif
>        break;
>
> +      /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).  */
> +    case VEC_DUPLICATE:
> +      if (GET_CODE (op) == NOT)
> +       return gen_rtx_NOT (mode, gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0)));
> +      break;
> +
>      default:
>        break;
>      }
> diff --git a/gcc/testsuite/gcc.target/i386/avx2-pr100711.c b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
> new file mode 100644
> index 00000000000..5b144623873
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx2-pr100711.c
> @@ -0,0 +1,73 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -O2" } */
> +/* { dg-final { scan-assembler-times "pandn" 8 } } */
> +/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
> +typedef char v16qi __attribute__((vector_size(16)));
> +typedef char v32qi __attribute__((vector_size(32)));
> +typedef short v8hi __attribute__((vector_size(16)));
> +typedef short v16hi __attribute__((vector_size(32)));
> +typedef int v4si __attribute__((vector_size(16)));
> +typedef int v8si __attribute__((vector_size(32)));
> +typedef long long v2di __attribute__((vector_size(16)));
> +typedef long long v4di __attribute__((vector_size(32)));
> +
> +v16qi
> +f1 (char a, v16qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v16qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v32qi
> +f2 (char a, v32qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v32qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v8hi
> +f3 (short a, v8hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v8hi) {b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v16hi
> +f4 (short a, v16hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v16hi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v4si
> +f5 (int a, v4si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v4si) {b, b, b, b}) & c;
> +}
> +
> +v8si
> +f6 (int a, v8si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v8si) {b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v2di
> +f7 (long long a, v2di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v2di) {b, b}) & c;
> +}
> +
> +v4di
> +f8 (long long a, v4di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v4di) {b, b, b, b}) & c;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
> new file mode 100644
> index 00000000000..f0a103d0bc2
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/avx512bw-pr100711.c
> @@ -0,0 +1,48 @@
> +/* { dg-do compile } */
> +/* { dg-options "-mavx512bw -O2" } */
> +/* { dg-final { scan-assembler-times "pandn" 4 } } */
> +/* { dg-final { scan-assembler-not "not\[bwlq\]" } } */
> +
> +typedef char v64qi __attribute__((vector_size(64)));
> +typedef short v32hi __attribute__((vector_size(64)));
> +typedef int v16si __attribute__((vector_size(64)));
> +typedef long long v8di __attribute__((vector_size(64)));
> +
> +v64qi
> +f1 (char a, v64qi c)
> +{
> +  char b = ~a;
> +  return (__extension__(v64qi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v32hi
> +f2 (short a, v32hi c)
> +{
> +  short b = ~a;
> +  return (__extension__(v32hi) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v16si
> +f3 (int a, v16si c)
> +{
> +  int b = ~a;
> +  return (__extension__(v16si) {b, b, b, b, b, b, b, b,
> +                                b, b, b, b, b, b, b, b}) & c;
> +}
> +
> +v8di
> +f4 (long long a, v8di c)
> +{
> +  long long b = ~a;
> +  return (__extension__(v8di) {b, b, b, b, b, b, b, b}) & c;
> +}
> --
> 2.18.1
>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).
  2021-06-02  7:07                       ` Richard Biener
@ 2021-06-02 20:46                         ` Segher Boessenkool
  2021-06-03 11:03                           ` Liu, Hongtao
  0 siblings, 1 reply; 21+ messages in thread
From: Segher Boessenkool @ 2021-06-02 20:46 UTC (permalink / raw)
  To: Richard Biener; +Cc: liuhongt, GCC Patches

Hi!

On Wed, Jun 02, 2021 at 09:07:35AM +0200, Richard Biener wrote:
> On Wed, Jun 2, 2021 at 7:41 AM liuhongt via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> > For i386, it will enable below opt
> >
> > from
> >         notl    %edi
> >         vpbroadcastd    %edi, %xmm0
> >         vpand   %xmm1, %xmm0, %xmm0
> > to
> >         vpbroadcastd    %edi, %xmm0
> >         vpandn   %xmm1, %xmm0, %xmm0
> 
> There will be cases where (vec_duplicate (not A)) is better
> than (not (vec_duplicate A)), so I'm not sure it is a good idea
> to forcefully canonicalize unary operations.

It is two unaries in sequence, where the order does not matter either.
As in all such cases you either have to handle both cases everywhere, or
have a canonical order.

> I suppose the
> simplification happens inside combine

combine uses simplify-rtx for most cases (it is part of combine, but
used in quite a few other places these days).

> - doesn't combine
> already have code to try variants of an expression and isn't
> this a good candidate that can be added there, avoiding
> the canonicalization?

As I mentioned, this is done in simplify-rtx in cases that do not have a
canonical representation.  This is critical because it prevents loops.

A very typical example is how UMIN is optimised:

   case UMIN:
      if (trueop1 == CONST0_RTX (mode) && ! side_effects_p (op0))
	return op1;
      if (rtx_equal_p (trueop0, trueop1) && ! side_effects_p (op0))
	return op0;
      tem = simplify_associative_operation (code, mode, op0, op1);
      if (tem)
	return tem;
      break;

(the stuff using "tem").

Hongtao, can we do something similar here?  Does that work well?  Please
try it out :-)


Segher

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).
  2021-06-02 20:46                         ` Segher Boessenkool
@ 2021-06-03 11:03                           ` Liu, Hongtao
  2021-06-03 11:06                             ` Jakub Jelinek
  2021-06-03 19:59                             ` Segher Boessenkool
  0 siblings, 2 replies; 21+ messages in thread
From: Liu, Hongtao @ 2021-06-03 11:03 UTC (permalink / raw)
  To: Segher Boessenkool, Richard Biener; +Cc: GCC Patches



>-----Original Message-----
>From: Segher Boessenkool <segher@kernel.crashing.org>
>Sent: Thursday, June 3, 2021 4:46 AM
>To: Richard Biener <richard.guenther@gmail.com>
>Cc: Liu, Hongtao <hongtao.liu@intel.com>; GCC Patches <gcc-
>patches@gcc.gnu.org>
>Subject: Re: [PATCH] Canonicalize (vec_duplicate (not A)) to (not
>(vec_duplicate A)).
>
>Hi!
>
>On Wed, Jun 02, 2021 at 09:07:35AM +0200, Richard Biener wrote:
>> On Wed, Jun 2, 2021 at 7:41 AM liuhongt via Gcc-patches
>> <gcc-patches@gcc.gnu.org> wrote:
>> > For i386, it will enable below opt
>> >
>> > from
>> >         notl    %edi
>> >         vpbroadcastd    %edi, %xmm0
>> >         vpand   %xmm1, %xmm0, %xmm0
>> > to
>> >         vpbroadcastd    %edi, %xmm0
>> >         vpandn   %xmm1, %xmm0, %xmm0
>>
>> There will be cases where (vec_duplicate (not A)) is better than (not
>> (vec_duplicate A)), so I'm not sure it is a good idea to forcefully
>> canonicalize unary operations.
>
>It is two unaries in sequence, where the order does not matter either.
>As in all such cases you either have to handle both cases everywhere, or have
>a canonical order.
>
>> I suppose the
>> simplification happens inside combine
>
>combine uses simplify-rtx for most cases (it is part of combine, but used in
>quite a few other places these days).
>
>> - doesn't combine
>> already have code to try variants of an expression and isn't this a
>> good candidate that can be added there, avoiding the canonicalization?
>
>As I mentioned, this is done in simplify-rtx in cases that do not have a
>canonical representation.  This is critical because it prevents loops.
>
>A very typical example is how UMIN is optimised:
>
>   case UMIN:
>      if (trueop1 == CONST0_RTX (mode) && ! side_effects_p (op0))
>	return op1;
>      if (rtx_equal_p (trueop0, trueop1) && ! side_effects_p (op0))
>	return op0;
>      tem = simplify_associative_operation (code, mode, op0, op1);
>      if (tem)
>	return tem;
>      break;
>
>(the stuff using "tem").
>
>Hongtao, can we do something similar here?  Does that work well?  Please try
>it out :-)

In simplify_rtx, no simplification occurs, there is just the difference between
 (vec_duplicate (not REG)) and (not (vec_duplicate (REG))). So here tem will only be 0.
Basically we don't know it's a simplification until combine successfully splits the
3->2 instructions (not + broadcast + and to andnot + broadcast), but it's pretty awkward
to do this in combine.

Considering that andnot exists for many backends, I think a canonicalization is needed here.
Maybe we can add insn canonicalization for transforming (and (vec_duplicate (not A)) B) to
(and (not (vec_duplicate A)) B) instead of (vec_duplicate (not A)) to (not (vec_duplicate A))?

>
>
>Segher

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).
  2021-06-03 11:03                           ` Liu, Hongtao
@ 2021-06-03 11:06                             ` Jakub Jelinek
  2021-06-03 19:59                             ` Segher Boessenkool
  1 sibling, 0 replies; 21+ messages in thread
From: Jakub Jelinek @ 2021-06-03 11:06 UTC (permalink / raw)
  To: Liu, Hongtao; +Cc: Segher Boessenkool, Richard Biener, GCC Patches

On Thu, Jun 03, 2021 at 11:03:43AM +0000, Liu, Hongtao via Gcc-patches wrote:
> In simplify_rtx, no simplication occurs, there is just the difference between
>  (vec_duplicate (not REG)) and (not (vec_duplicate (REG)). So here tem will only be 0.
> Basically we don't know it's a simplication until combine successfully split the
> 3->2 instructions (not + broadcast + and to andnot + broadcast), but it's pretty awkward
> to do this in combine.
> 
> Consider andnot is existed for many backends, I think a canonicalization is needed here.
> Maybe we can add insn canonicalization for transforming (and (vect_duplicate (not A)) B) to 
> (and (not (duplicate (not A)) B) instead of (vec_duplicate (not A)) to (not (vec_duplicate A))?

For the (not (vec_duplicate)) vs. (vec_duplicate (not)) it isn't clear which
one is generally a win on major targets, so I'd say it is better to add a
combine splitter to swap it in backends that want that.

	Jakub


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).
  2021-06-03 11:03                           ` Liu, Hongtao
  2021-06-03 11:06                             ` Jakub Jelinek
@ 2021-06-03 19:59                             ` Segher Boessenkool
  2021-06-04  2:48                               ` Liu, Hongtao
  1 sibling, 1 reply; 21+ messages in thread
From: Segher Boessenkool @ 2021-06-03 19:59 UTC (permalink / raw)
  To: Liu, Hongtao; +Cc: Richard Biener, GCC Patches

On Thu, Jun 03, 2021 at 11:03:43AM +0000, Liu, Hongtao wrote:
> >A very typical example is how UMIN is optimised:
> >
> >   case UMIN:
> >      if (trueop1 == CONST0_RTX (mode) && ! side_effects_p (op0))
> >	return op1;
> >      if (rtx_equal_p (trueop0, trueop1) && ! side_effects_p (op0))
> >	return op0;
> >      tem = simplify_associative_operation (code, mode, op0, op1);
> >      if (tem)
> >	return tem;
> >      break;
> >
> >(the stuff using "tem").
> >
> >Hongtao, can we do something similar here?  Does that work well?  Please try
> >it out :-)
> 
> In simplify_rtx, no simplication occurs, there is just the difference between
>  (vec_duplicate (not REG)) and (not (vec_duplicate (REG)). So here tem will only be 0.

simplify-rtx is used by combine.  When you do and+not+splat for example
my suggestion should kick in.  Try it out, don't just dismiss it?

> Basically we don't know it's a simplication until combine successfully split the
> 3->2 instructions (not + broadcast + and to andnot + broadcast), but it's pretty awkward
> to do this in combine.

But you need to do this *before* it is split.  That is the whole point.

> Consider andnot is existed for many backends, I think a canonicalization is needed here.

Please do note that that is not as easy as you may think: you need to
make sure nothing ever creates non-canonical code.

> Maybe we can add insn canonicalization for transforming (and (vect_duplicate (not A)) B) to 
> (and (not (duplicate (not A)) B) instead of (vec_duplicate (not A)) to (not (vec_duplicate A))?

I don't understand what this means?


Segher

^ permalink raw reply	[flat|nested] 21+ messages in thread

* RE: [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).
  2021-06-03 19:59                             ` Segher Boessenkool
@ 2021-06-04  2:48                               ` Liu, Hongtao
  0 siblings, 0 replies; 21+ messages in thread
From: Liu, Hongtao @ 2021-06-04  2:48 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Richard Biener, GCC Patches



>-----Original Message-----
>From: Segher Boessenkool <segher@kernel.crashing.org>
>Sent: Friday, June 4, 2021 4:00 AM
>To: Liu, Hongtao <hongtao.liu@intel.com>
>Cc: Richard Biener <richard.guenther@gmail.com>; GCC Patches <gcc-
>patches@gcc.gnu.org>
>Subject: Re: [PATCH] Canonicalize (vec_duplicate (not A)) to (not
>(vec_duplicate A)).
>
>On Thu, Jun 03, 2021 at 11:03:43AM +0000, Liu, Hongtao wrote:
>> >A very typical example is how UMIN is optimised:
>> >
>> >   case UMIN:
>> >      if (trueop1 == CONST0_RTX (mode) && ! side_effects_p (op0))
>> >	return op1;
>> >      if (rtx_equal_p (trueop0, trueop1) && ! side_effects_p (op0))
>> >	return op0;
>> >      tem = simplify_associative_operation (code, mode, op0, op1);
>> >      if (tem)
>> >	return tem;
>> >      break;
>> >
>> >(the stuff using "tem").
>> >
>> >Hongtao, can we do something similar here?  Does that work well?
>> >Please try it out :-)
>>
>> In simplify_rtx, no simplication occurs, there is just the difference
>> between  (vec_duplicate (not REG)) and (not (vec_duplicate (REG)). So here
>tem will only be 0.
>
>simplify-rtx is used by combine.  When you do and+not+splat for example my
>suggestion should kick in.  Try it out, don't just dismiss it?
>
Forgive my obtuseness, do you mean trying the following change? If so, then nothing will "kick in":
temp will be 0, since there's no simplification here, just the difference between (vec_duplicate (not REG))
 and (not (vec_duplicate (REG))). Or maybe you mean something else?

@@ -1708,6 +1708,17 @@ simplify_context::simplify_unary_operation_1 (rtx_code code, machine_mode mode,
 #endif
       break;

+      /* Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)).  */
+    case VEC_DUPLICATE:
+      if (GET_CODE (op) == NOT)
+       {
+         rtx vec_dup = gen_rtx_VEC_DUPLICATE (mode, XEXP (op, 0));
+         temp = simplify_unary_operation (NOT, mode, vec_dup, GET_MODE (op));
+         if (temp)
+           return temp;
+       }
+      break;
+
>> Basically we don't know it's a simplication until combine successfully
>> split the
>> 3->2 instructions (not + broadcast + and to andnot + broadcast), but
>> 3->it's pretty awkward
>> to do this in combine.
>
>But you need to do this *before* it is split.  That is the whole point.
>
>> Consider andnot is existed for many backends, I think a canonicalization is
>needed here.
>
>Please do note that that is not as easy as yoou may think: you need to make
>sure nothing ever creates non-canonical code.
>
>> Maybe we can add insn canonicalization for transforming (and
>> (vect_duplicate (not A)) B) to (and (not (duplicate (not A)) B) instead of
>(vec_duplicate (not A)) to (not (vec_duplicate A))?
>
>I don't understand what this means?
I mean let's give andnot a last shot in the AND case, like below:

@@ -3702,6 +3702,16 @@ simplify_context::simplify_binary_operation_1 (rtx_code code,
       tem = simplify_associative_operation (code, mode, op0, op1);
       if (tem)
        return tem;
+
+      if (GET_CODE (op0) == VEC_DUPLICATE
+         && GET_CODE (XEXP (op0, 0)) == NOT)
+       {
+         rtx vec_dup = gen_rtx_VEC_DUPLICATE (GET_MODE (op0),
+                                              XEXP (XEXP (op0, 0), 0));
+         return simplify_gen_binary (AND, mode,
+                                     gen_rtx_NOT (mode, vec_dup),
+                                     op1);
+       }
       break;
>
>
>Segher

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2021-06-04  2:49 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-25  5:49 [PATCH][i386] Split not+broadcast+pand to broadcast+pandn Hongtao Liu
2021-05-25  6:11 ` Andrew Pinski
2021-05-25  6:23   ` Hongtao Liu
2021-05-25  6:29     ` Andrew Pinski
2021-05-25  6:34       ` Hongtao Liu
2021-05-26  1:21         ` Hongtao Liu
2021-05-26  4:12           ` Andrew Pinski
2021-05-26  5:17             ` Hongtao Liu
2021-06-01  8:32               ` Hongtao Liu
2021-06-01 13:54                 ` Segher Boessenkool
2021-06-01 14:02                 ` Segher Boessenkool
2021-06-02  5:39                   ` liuhongt
2021-06-02  5:39                     ` [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)) liuhongt
2021-06-02  7:07                       ` Richard Biener
2021-06-02 20:46                         ` Segher Boessenkool
2021-06-03 11:03                           ` Liu, Hongtao
2021-06-03 11:06                             ` Jakub Jelinek
2021-06-03 19:59                             ` Segher Boessenkool
2021-06-04  2:48                               ` Liu, Hongtao
2021-06-02  5:49                     ` Hongtao Liu
2021-06-02  5:41                   ` [PATCH] Canonicalize (vec_duplicate (not A)) to (not (vec_duplicate A)) liuhongt

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).