public inbox for gcc-cvs@sourceware.org
* [gcc r12-1773] Add x86 addsub SLP pattern
@ 2021-06-24 11:09 Richard Biener
From: Richard Biener @ 2021-06-24 11:09 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:7a6c31f0f84a7295433ebac09b94fae2d5cc2892

commit r12-1773-g7a6c31f0f84a7295433ebac09b94fae2d5cc2892
Author: Richard Biener <rguenther@suse.de>
Date:   Mon May 31 13:19:01 2021 +0200

    Add x86 addsub SLP pattern
    
    This adds SLP pattern recognition for the SSE3/AVX [v]addsubp{ds} v0, v1
    instructions which compute { v0[0] - v1[0], v0[1] + v1[1], ... },
    i.e. alternating subtract and add across lanes, starting with a subtract.
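
    For illustration, the SSE3 intrinsic exposes exactly this lane
    alternation (a minimal sketch using _mm_addsub_ps, not part of
    this change):

        #include <pmmintrin.h>

        /* r = { a[0] - b[0], a[1] + b[1], a[2] - b[2], a[3] + b[3] }  */
        __m128
        addsub (__m128 a, __m128 b)
        {
          return _mm_addsub_ps (a, b);
        }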
    
    It adds a corresponding optab and direct internal function,
    vec_addsub$a3 and renames the existing i386 backend patterns to
    the new canonical name.
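
    On GIMPLE the new internal function appears as a direct call to
    .VEC_ADDSUB, which is what the testcases below scan the slp2 dump
    for (the SSA names here are made up for illustration):

        vect__3.8_18 = .VEC_ADDSUB (vect__1.5_14, vect__2.7_16);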
    
    The SLP pattern matches the exact alternating lane sequence rather
    than trying to be clever and anticipate incoming permutes - we
    could permute the two input vectors into the required lane
    alternation, do the addsub and then permute the result vector back,
    but that is only profitable if the input permutes or the output
    permute vanish - something Tamar's refactoring of the SLP pattern
    recognition should make possible.
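
    For example the following sequence (distilled from the new
    vect-addsub-3.c testcase below) is deliberately not matched: the
    subtract and add operate on swapped lanes of the inputs, so forming
    .VEC_ADDSUB would require exactly those extra permutes:

        /* Not matched - lane 0 subtracts the inputs' lane 1 and
           lane 1 adds the inputs' lane 0.  */
        c[0] = a[1] - b[1];
        c[1] = a[0] + b[0];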
    
    2021-06-17  Richard Biener  <rguenther@suse.de>
    
            * config/i386/sse.md (avx_addsubv4df3): Rename to
            vec_addsubv4df3.
            (avx_addsubv8sf3): Rename to vec_addsubv8sf3.
            (sse3_addsubv2df3): Rename to vec_addsubv2df3.
            (sse3_addsubv4sf3): Rename to vec_addsubv4sf3.
            * config/i386/i386-builtin.def: Adjust.
            * internal-fn.def (VEC_ADDSUB): New internal optab fn.
            * optabs.def (vec_addsub_optab): New optab.
            * tree-vect-slp-patterns.c (class addsub_pattern): New.
            (slp_patterns): Add addsub_pattern.
            * tree-vect-slp.c (vect_optimize_slp): Disable propagation
            across CFN_VEC_ADDSUB.
            * tree-vectorizer.h (vect_pattern::vect_pattern): Make
            m_ops optional.
            * doc/md.texi (vec_addsub<mode>3): Document.
    
            * gcc.target/i386/vect-addsubv2df.c: New testcase.
            * gcc.target/i386/vect-addsubv4sf.c: Likewise.
            * gcc.target/i386/vect-addsubv4df.c: Likewise.
            * gcc.target/i386/vect-addsubv8sf.c: Likewise.
            * gcc.target/i386/vect-addsub-2.c: Likewise.
            * gcc.target/i386/vect-addsub-3.c: Likewise.

Diff:
---
 gcc/config/i386/i386-builtin.def                |   8 +-
 gcc/config/i386/sse.md                          |   8 +-
 gcc/doc/md.texi                                 |   8 ++
 gcc/internal-fn.def                             |   1 +
 gcc/optabs.def                                  |   1 +
 gcc/testsuite/gcc.target/i386/vect-addsub-2.c   |  21 +++++
 gcc/testsuite/gcc.target/i386/vect-addsub-3.c   |  38 +++++++++
 gcc/testsuite/gcc.target/i386/vect-addsubv2df.c |  42 ++++++++++
 gcc/testsuite/gcc.target/i386/vect-addsubv4df.c |  36 +++++++++
 gcc/testsuite/gcc.target/i386/vect-addsubv4sf.c |  46 +++++++++++
 gcc/testsuite/gcc.target/i386/vect-addsubv8sf.c |  46 +++++++++++
 gcc/tree-vect-slp-patterns.c                    | 100 ++++++++++++++++++++++++
 gcc/tree-vect-slp.c                             |   1 +
 gcc/tree-vectorizer.h                           |   3 +-
 14 files changed, 350 insertions(+), 9 deletions(-)

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 31df3a613dd..ea79e0bdda2 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -855,8 +855,8 @@ BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_mmx_subv1di3, "__
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF)
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF)
 
-BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
-BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
+BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_vec_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
+BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_vec_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
@@ -996,8 +996,8 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128
 /* AVX */
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_vec_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_vec_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5bd65dd9312..1f1db8214cc 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2410,7 +2410,7 @@
    (set_attr "prefix" "<round_saeonly_scalar_prefix>")
    (set_attr "mode" "<ssescalarmode>")])
 
-(define_insn "avx_addsubv4df3"
+(define_insn "vec_addsubv4df3"
   [(set (match_operand:V4DF 0 "register_operand" "=x")
 	(vec_merge:V4DF
 	  (minus:V4DF
@@ -2424,7 +2424,7 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
-(define_insn "sse3_addsubv2df3"
+(define_insn "vec_addsubv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x,x")
 	(vec_merge:V2DF
 	  (minus:V2DF
@@ -2442,7 +2442,7 @@
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "V2DF")])
 
-(define_insn "avx_addsubv8sf3"
+(define_insn "vec_addsubv8sf3"
   [(set (match_operand:V8SF 0 "register_operand" "=x")
 	(vec_merge:V8SF
 	  (minus:V8SF
@@ -2456,7 +2456,7 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
 
-(define_insn "sse3_addsubv4sf3"
+(define_insn "vec_addsubv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
 	(vec_merge:V4SF
 	  (minus:V4SF
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 00caf3844cc..1b918144330 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5682,6 +5682,14 @@ signed/unsigned elements of size S@.  Subtract the high/low elements of 2 from
 1 and widen the resulting elements. Put the N/2 results of size 2*S in the
 output vector (operand 0).
 
+@cindex @code{vec_addsub@var{m}3} instruction pattern
+@item @samp{vec_addsub@var{m}3}
+Alternating subtract, add with even lanes doing subtract and odd
+lanes doing addition.  Operands 1 and 2 and the output operand are vectors
+with mode @var{m}.
+
+These instructions are not allowed to @code{FAIL}.
+
 @cindex @code{mulhisi3} instruction pattern
 @item @samp{mulhisi3}
 Multiply operands 1 and 2, which have mode @code{HImode}, and store
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index b2f414d2131..c3b8e730960 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -281,6 +281,7 @@ DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_ADDSUB, ECF_CONST, vec_addsub, binary)
 
 
 /* FP scales.  */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index b192a9d070b..41ab2598eb6 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -407,6 +407,7 @@ OPTAB_D (vec_widen_usubl_hi_optab, "vec_widen_usubl_hi_$a")
 OPTAB_D (vec_widen_usubl_lo_optab, "vec_widen_usubl_lo_$a")
 OPTAB_D (vec_widen_uaddl_hi_optab, "vec_widen_uaddl_hi_$a")
 OPTAB_D (vec_widen_uaddl_lo_optab, "vec_widen_uaddl_lo_$a")
+OPTAB_D (vec_addsub_optab, "vec_addsub$a3")
 
 OPTAB_D (sync_add_optab, "sync_add$I$a")
 OPTAB_D (sync_and_optab, "sync_and$I$a")
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsub-2.c b/gcc/testsuite/gcc.target/i386/vect-addsub-2.c
new file mode 100644
index 00000000000..a6b941461e8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsub-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-options "-O3 -msse3" } */
+
+float a[1024], b[1024];
+
+void foo()
+{
+  for (int i = 0; i < 256; i++)
+    {
+      a[4*i+0] = a[4*i+0] - b[4*i+0];
+      a[4*i+1] = a[4*i+1] + b[4*i+1];
+      a[4*i+2] = a[4*i+2] - b[4*i+2];
+      a[4*i+3] = a[4*i+3] + b[4*i+3];
+    }
+}
+
+/* We should be able to vectorize this with SLP using the addsub
+   SLP pattern.  */
+/* { dg-final { scan-assembler "addsubps" } } */
+/* { dg-final { scan-assembler-not "shuf" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsub-3.c b/gcc/testsuite/gcc.target/i386/vect-addsub-3.c
new file mode 100644
index 00000000000..b27ee56bd73
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsub-3.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-options "-O3 -msse3" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse3-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse3_test
+#endif
+
+#include CHECK_H
+
+double a[2], b[2], c[2];
+
+void __attribute__((noipa))
+foo ()
+{
+  /* When we want to use addsubpd we have to keep permuting both
+     loads; if instead we blend the result of an add and a sub we
+     can combine the blend with the permute.  Both are similar in
+     cost; verify we did not wrongly apply both.  */
+  double tem0 = a[1] - b[1];
+  double tem1 = a[0] + b[0];
+  c[0] = tem0;
+  c[1] = tem1;
+}
+
+static void
+TEST (void)
+{
+  a[0] = 1.; a[1] = 2.;
+  b[0] = 2.; b[1] = 4.;
+  foo ();
+  if (c[0] != -2. || c[1] != 3.)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsubv2df.c b/gcc/testsuite/gcc.target/i386/vect-addsubv2df.c
new file mode 100644
index 00000000000..547485d5519
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsubv2df.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-options "-O3 -msse3 -fdump-tree-slp2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse3-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse3_test
+#endif
+
+#include CHECK_H
+
+double x[2], y[2], z[2];
+void __attribute__((noipa)) foo ()
+{
+  x[0] = y[0] - z[0];
+  x[1] = y[1] + z[1];
+}
+void __attribute__((noipa)) bar ()
+{
+  x[0] = y[0] + z[0];
+  x[1] = y[1] - z[1];
+}
+static void
+TEST (void)
+{
+  for (int i = 0; i < 2; ++i)
+    {
+      y[i] = i + 1;
+      z[i] = 2 * i + 1;
+    }
+  foo ();
+  if (x[0] != 0 || x[1] != 5)
+    __builtin_abort ();
+  bar ();
+  if (x[0] != 2 || x[1] != -1)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "ADDSUB" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsubv4df.c b/gcc/testsuite/gcc.target/i386/vect-addsubv4df.c
new file mode 100644
index 00000000000..e0a1b3d9d00
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsubv4df.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target avx_runtime } } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx -fdump-tree-slp2" } */
+
+double x[4], y[4], z[4];
+void __attribute__((noipa)) foo ()
+{
+  x[0] = y[0] - z[0];
+  x[1] = y[1] + z[1];
+  x[2] = y[2] - z[2];
+  x[3] = y[3] + z[3];
+}
+void __attribute__((noipa)) bar ()
+{
+  x[0] = y[0] + z[0];
+  x[1] = y[1] - z[1];
+  x[2] = y[2] + z[2];
+  x[3] = y[3] - z[3];
+}
+int main()
+{
+  for (int i = 0; i < 4; ++i)
+    {
+      y[i] = i + 1;
+      z[i] = 2 * i + 1;
+    }
+  foo ();
+  if (x[0] != 0 || x[1] != 5 || x[2] != -2 || x[3] != 11)
+    __builtin_abort ();
+  bar ();
+  if (x[0] != 2 || x[1] != -1 || x[2] != 8 || x[3] != -3)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "ADDSUB" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsubv4sf.c b/gcc/testsuite/gcc.target/i386/vect-addsubv4sf.c
new file mode 100644
index 00000000000..b524f0c35a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsubv4sf.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-options "-O3 -msse3 -fdump-tree-slp2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse3-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse3_test
+#endif
+
+#include CHECK_H
+
+float x[4], y[4], z[4];
+void __attribute__((noipa)) foo ()
+{
+  x[0] = y[0] - z[0];
+  x[1] = y[1] + z[1];
+  x[2] = y[2] - z[2];
+  x[3] = y[3] + z[3];
+}
+void __attribute__((noipa)) bar ()
+{
+  x[0] = y[0] + z[0];
+  x[1] = y[1] - z[1];
+  x[2] = y[2] + z[2];
+  x[3] = y[3] - z[3];
+}
+static void
+TEST (void)
+{
+  for (int i = 0; i < 4; ++i)
+    {
+      y[i] = i + 1;
+      z[i] = 2 * i + 1;
+    }
+  foo ();
+  if (x[0] != 0 || x[1] != 5 || x[2] != -2 || x[3] != 11)
+    __builtin_abort ();
+  bar ();
+  if (x[0] != 2 || x[1] != -1 || x[2] != 8 || x[3] != -3)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "ADDSUB" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsubv8sf.c b/gcc/testsuite/gcc.target/i386/vect-addsubv8sf.c
new file mode 100644
index 00000000000..0eed33b6531
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsubv8sf.c
@@ -0,0 +1,46 @@
+/* { dg-do run { target avx_runtime } } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx -fdump-tree-slp2" } */
+
+float x[8], y[8], z[8];
+void __attribute__((noipa)) foo ()
+{
+  x[0] = y[0] - z[0];
+  x[1] = y[1] + z[1];
+  x[2] = y[2] - z[2];
+  x[3] = y[3] + z[3];
+  x[4] = y[4] - z[4];
+  x[5] = y[5] + z[5];
+  x[6] = y[6] - z[6];
+  x[7] = y[7] + z[7];
+}
+void __attribute__((noipa)) bar ()
+{
+  x[0] = y[0] + z[0];
+  x[1] = y[1] - z[1];
+  x[2] = y[2] + z[2];
+  x[3] = y[3] - z[3];
+  x[4] = y[4] + z[4];
+  x[5] = y[5] - z[5];
+  x[6] = y[6] + z[6];
+  x[7] = y[7] - z[7];
+}
+int main()
+{
+  for (int i = 0; i < 8; ++i)
+    {
+      y[i] = i + 1;
+      z[i] = 2 * i + 1;
+    }
+  foo ();
+  if (x[0] != 0 || x[1] != 5 || x[2] != -2 || x[3] != 11
+      || x[4] != -4 || x[5] != 17 || x[6] != -6 || x[7] != 23)
+    __builtin_abort ();
+  bar ();
+  if (x[0] != 2 || x[1] != -1 || x[2] != 8 || x[3] != -3
+      || x[4] != 14 || x[5] != -5 || x[6] != 20 || x[7] != -7)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "ADDSUB" 1 "slp2" } } */
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 2ed49cd9edc..d536494a1bd 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -1490,6 +1490,105 @@ complex_operations_pattern::build (vec_info * /* vinfo */)
   gcc_unreachable ();
 }
 
+
+/* The addsub_pattern.  */
+
+class addsub_pattern : public vect_pattern
+{
+  public:
+    addsub_pattern (slp_tree *node)
+	: vect_pattern (node, NULL, IFN_VEC_ADDSUB) {};
+
+    void build (vec_info *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+};
+
+vect_pattern *
+addsub_pattern::recognize (slp_tree_to_load_perm_map_t *, slp_tree *node_)
+{
+  slp_tree node = *node_;
+  if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
+      || SLP_TREE_CHILDREN (node).length () != 2)
+    return NULL;
+
+  /* Match a blend of a plus and a minus op with the same number of plus and
+     minus lanes on the same operands.  */
+  slp_tree sub = SLP_TREE_CHILDREN (node)[0];
+  slp_tree add = SLP_TREE_CHILDREN (node)[1];
+  bool swapped_p = false;
+  if (vect_match_expression_p (sub, PLUS_EXPR))
+    {
+      std::swap (add, sub);
+      swapped_p = true;
+    }
+  if (!(vect_match_expression_p (add, PLUS_EXPR)
+	&& vect_match_expression_p (sub, MINUS_EXPR)))
+    return NULL;
+  if (!((SLP_TREE_CHILDREN (sub)[0] == SLP_TREE_CHILDREN (add)[0]
+	 && SLP_TREE_CHILDREN (sub)[1] == SLP_TREE_CHILDREN (add)[1])
+	|| (SLP_TREE_CHILDREN (sub)[0] == SLP_TREE_CHILDREN (add)[1]
+	    && SLP_TREE_CHILDREN (sub)[1] == SLP_TREE_CHILDREN (add)[0])))
+    return NULL;
+
+  for (unsigned i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
+    {
+      std::pair<unsigned, unsigned> perm = SLP_TREE_LANE_PERMUTATION (node)[i];
+      if (swapped_p)
+	perm.first = perm.first == 0 ? 1 : 0;
+      /* It has to be alternating -, +, -, ...
+	 While we could permute the .ADDSUB inputs and the .ADDSUB output
+	 that's only profitable over the add + sub + blend if at least
+	 one of the permutes is optimized, which we can't determine here.  */
+      if (perm.first != (i & 1)
+	  || perm.second != i)
+	return NULL;
+    }
+
+  if (!vect_pattern_validate_optab (IFN_VEC_ADDSUB, node))
+    return NULL;
+
+  return new addsub_pattern (node_);
+}
+
+void
+addsub_pattern::build (vec_info *vinfo)
+{
+  slp_tree node = *m_node;
+
+  slp_tree sub = SLP_TREE_CHILDREN (node)[0];
+  slp_tree add = SLP_TREE_CHILDREN (node)[1];
+  if (vect_match_expression_p (sub, PLUS_EXPR))
+    std::swap (add, sub);
+
+  /* Modify the blend node in-place.  */
+  SLP_TREE_CHILDREN (node)[0] = SLP_TREE_CHILDREN (sub)[0];
+  SLP_TREE_CHILDREN (node)[1] = SLP_TREE_CHILDREN (sub)[1];
+  SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (node)[0])++;
+  SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (node)[1])++;
+
+  /* Build IFN_VEC_ADDSUB from the sub representative operands.  */
+  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (sub);
+  gcall *call = gimple_build_call_internal (IFN_VEC_ADDSUB, 2,
+					    gimple_assign_rhs1 (rep->stmt),
+					    gimple_assign_rhs2 (rep->stmt));
+  gimple_call_set_lhs (call, make_ssa_name
+			       (TREE_TYPE (gimple_assign_lhs (rep->stmt))));
+  gimple_call_set_nothrow (call, true);
+  gimple_set_bb (call, gimple_bb (rep->stmt));
+  SLP_TREE_REPRESENTATIVE (node) = vinfo->add_pattern_stmt (call, rep);
+  STMT_VINFO_RELEVANT (SLP_TREE_REPRESENTATIVE (node)) = vect_used_in_scope;
+  STMT_SLP_TYPE (SLP_TREE_REPRESENTATIVE (node)) = pure_slp;
+  STMT_VINFO_VECTYPE (SLP_TREE_REPRESENTATIVE (node)) = SLP_TREE_VECTYPE (node);
+  STMT_VINFO_SLP_VECT_ONLY_PATTERN (SLP_TREE_REPRESENTATIVE (node)) = true;
+  SLP_TREE_CODE (node) = ERROR_MARK;
+  SLP_TREE_LANE_PERMUTATION (node).release ();
+
+  vect_free_slp_tree (sub);
+  vect_free_slp_tree (add);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/
@@ -1502,6 +1601,7 @@ vect_pattern_decl_t slp_patterns[]
      overlap in what they can detect.  */
 
   SLP_PATTERN (complex_operations_pattern),
+  SLP_PATTERN (addsub_pattern)
 };
 #undef SLP_PATTERN
 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 69ee8faed09..227d6aa3ee8 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -3705,6 +3705,7 @@ vect_optimize_slp (vec_info *vinfo)
 	      case CFN_COMPLEX_ADD_ROT270:
 	      case CFN_COMPLEX_MUL:
 	      case CFN_COMPLEX_MUL_CONJ:
+	      case CFN_VEC_ADDSUB:
 		continue;
 	      default:;
 	      }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 5c71fbc487f..fa28336d429 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2100,7 +2100,8 @@ class vect_pattern
       this->m_ifn = ifn;
       this->m_node = node;
       this->m_ops.create (0);
-      this->m_ops.safe_splice (*m_ops);
+      if (m_ops)
+	this->m_ops.safe_splice (*m_ops);
     }
 
   public:

