public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r12-5130] AArch64: do not keep negated mask and inverse mask live at the same time
@ 2021-11-10 16:03 Tamar Christina
  0 siblings, 0 replies; only message in thread
From: Tamar Christina @ 2021-11-10 16:03 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:86ffc845b2d0bff59832dcf3cf6518f1358e30ac

commit r12-5130-g86ffc845b2d0bff59832dcf3cf6518f1358e30ac
Author: Tamar Christina <tamar.christina@arm.com>
Date:   Wed Nov 10 15:59:26 2021 +0000

    AArch64: do not keep negated mask and inverse mask live at the same time
    
    The following example:
    
    void f11(double * restrict z, double * restrict w, double * restrict x,
             double * restrict y, int n)
    {
        for (int i = 0; i < n; i++) {
            z[i] = (w[i] > 0) ? w[i] : y[i];
        }
    }
    
    Currently generates:
    
            ptrue   p2.b, all
            ld1d    z0.d, p0/z, [x1, x2, lsl 3]
            fcmgt   p1.d, p2/z, z0.d, #0.0
            bic     p3.b, p2/z, p0.b, p1.b
            ld1d    z1.d, p3/z, [x3, x2, lsl 3]
    
    and after the previous patches generates:
    
            ptrue   p3.b, all
            ld1d    z0.d, p0/z, [x1, x2, lsl 3]
            fcmgt   p1.d, p0/z, z0.d, #0.0
            fcmgt   p2.d, p3/z, z0.d, #0.0
            not     p1.b, p0/z, p1.b
            ld1d    z1.d, p1/z, [x3, x2, lsl 3]
    
    where a duplicate comparison is performed for w[i] > 0.
    
    This is because in the vectorizer we're emitting a comparison for both a and ~a
    where we just need to emit one of them and invert the other.  After this patch
    we generate:
    
            ld1d    z0.d, p0/z, [x1, x2, lsl 3]
            fcmgt   p1.d, p0/z, z0.d, #0.0
            mov     p2.b, p1.b
            not     p1.b, p0/z, p1.b
            ld1d    z1.d, p1/z, [x3, x2, lsl 3]
    
    In order to perform the check I have to fully expand the NOT stmts when
    recording them, as the SSA names for the top-level expressions differ but
    their arguments don't, e.g. in _31 = ~_34 the value of _34 differs but not
    the operands in _34.
    
    But we only do this when the operation is an ordered one because mixing
    ordered and unordered expressions can lead to de-optimized code.
    
    Note: This patch series is working incrementally towards generating the most
          efficient code for this and other loops in small steps. The mov is
          created by postreload when it does a late CSE.
    
    gcc/ChangeLog:
    
            * tree-vectorizer.h (struct scalar_cond_masked_key): Add inverted_p.
            (default_hash_traits<scalar_cond_masked_key>): Likewise.
            * tree-vect-stmts.c (vectorizable_condition): Check if inverse of mask
            is live.
            * tree-vectorizer.c (scalar_cond_masked_key::get_cond_ops_from_tree):
            Register mask inverses.
    
    gcc/testsuite/ChangeLog:
    
            * gcc.target/aarch64/sve/pred-not-gen-1.c: Update testcase.
            * gcc.target/aarch64/sve/pred-not-gen-2.c: Update testcase.
            * gcc.target/aarch64/sve/pred-not-gen-3.c: Update testcase.
            * gcc.target/aarch64/sve/pred-not-gen-4.c: Update testcase.

Diff:
---
 gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c |  5 +++--
 gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c |  5 +++--
 gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c |  5 +++--
 gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c |  5 +++--
 gcc/tree-vect-stmts.c                                 | 17 +++++++++++++++++
 gcc/tree-vectorizer.c                                 | 19 +++++++++++++++++++
 gcc/tree-vectorizer.h                                 | 10 +++++++---
 7 files changed, 55 insertions(+), 11 deletions(-)

diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
index 2c06564186c..c9a8b82c48a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-1.c
@@ -1,5 +1,5 @@
-/* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O3 --save-temps" } */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
 
 /*
 ** f10:
@@ -21,3 +21,4 @@ void f10(double * restrict z, double * restrict w, double * restrict x, double *
 
 /* { dg-final { scan-assembler-not {\tbic\t} } } */
 /* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-9]+/z, z[0-9]+\.d, #0} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c
index 0c3b78d4c67..8d78f476364 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-2.c
@@ -1,5 +1,5 @@
-/* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O3 --save-temps" } */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
 
 /*
 ** f11:
@@ -21,3 +21,4 @@ void f11(double * restrict z, double * restrict w, double * restrict x, double *
 
 /* { dg-final { scan-assembler-not {\tbic\t} } } */
 /* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmgt\tp[0-9]+\.d, p[0-9]+/z, z[0-9]+\.d, #0.0} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c
index 248f8ab5719..38fbfef2b7a 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-3.c
@@ -1,5 +1,5 @@
-/* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O3 --save-temps" } */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
 
 /*
 ** f12:
@@ -19,3 +19,4 @@ void f12(int * restrict z, int * restrict w, int * restrict x, int * restrict y,
 
 /* { dg-final { scan-assembler-not {\tbic\t} } } */
 /* { dg-final { scan-assembler-not {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} } } */
+/* { dg-final { scan-assembler-times {\tcmple\tp[0-9]+\.s, p[0-9]+/z, z[0-9]+\.s, #0} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
index 96200309880..0001dd3fc21 100644
--- a/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
+++ b/gcc/testsuite/gcc.target/aarch64/sve/pred-not-gen-4.c
@@ -1,5 +1,5 @@
-/* { dg-do assemble { target aarch64_asm_sve_ok } } */
-/* { dg-options "-O3 --save-temps" } */
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
 
 #include <math.h>
 
@@ -12,3 +12,4 @@ void f13(double * restrict z, double * restrict w, double * restrict x, double *
 
 /* { dg-final { scan-assembler-not {\tbic\t} } } */
 /* { dg-final { scan-assembler-times {\tnot\tp[0-9]+\.b, p[0-9]+/z, p[0-9]+\.b\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfcmuo\tp[0-9]+\.d, p[0-9]+/z, z[0-9]+\.d, z[0-9]+\.d} 1 } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 03cc7267cf8..2284ad069e4 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -10344,6 +10344,7 @@ vectorizable_condition (vec_info *vinfo,
 	  else
 	    {
 	      bool honor_nans = HONOR_NANS (TREE_TYPE (cond.op0));
+	      tree_code orig_code = cond.code;
 	      cond.code = invert_tree_comparison (cond.code, honor_nans);
 	      if (loop_vinfo->scalar_cond_masked_set.contains (cond))
 		{
@@ -10351,6 +10352,22 @@ vectorizable_condition (vec_info *vinfo,
 		  cond_code = cond.code;
 		  swap_cond_operands = true;
 		}
+	      else
+		{
+		  /* Try the inverse of the current mask.  We check if the
+		     inverse mask is live and if so we generate a negate of
+		     the current mask such that we still honor NaNs.  */
+		  cond.inverted_p = true;
+		  cond.code = orig_code;
+		  if (loop_vinfo->scalar_cond_masked_set.contains (cond))
+		    {
+		      bitop1 = orig_code;
+		      bitop2 = BIT_NOT_EXPR;
+		      masks = &LOOP_VINFO_MASKS (loop_vinfo);
+		      cond_code = cond.code;
+		      swap_cond_operands = true;
+		    }
+		}
 	    }
 	}
     }
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 3247c9af23b..f493d63d055 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -1678,6 +1678,7 @@ scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
       this->code = TREE_CODE (t);
       this->op0 = TREE_OPERAND (t, 0);
       this->op1 = TREE_OPERAND (t, 1);
+      this->inverted_p = false;
       return;
     }
 
@@ -1690,13 +1691,31 @@ scalar_cond_masked_key::get_cond_ops_from_tree (tree t)
 	    this->code = code;
 	    this->op0 = gimple_assign_rhs1 (stmt);
 	    this->op1 = gimple_assign_rhs2 (stmt);
+	    this->inverted_p = false;
 	    return;
 	  }
+	else if (code == BIT_NOT_EXPR)
+	  {
+	    tree n_op = gimple_assign_rhs1 (stmt);
+	    if ((stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (n_op))))
+	      {
+		code = gimple_assign_rhs_code (stmt);
+		if (TREE_CODE_CLASS (code) == tcc_comparison)
+		  {
+		    this->code = code;
+		    this->op0 = gimple_assign_rhs1 (stmt);
+		    this->op1 = gimple_assign_rhs2 (stmt);
+		    this->inverted_p = true;
+		    return;
+		  }
+	      }
+	  }
       }
 
   this->code = NE_EXPR;
   this->op0 = t;
   this->op1 = build_zero_cst (TREE_TYPE (t));
+  this->inverted_p = false;
 }
 
 /* See the comment above the declaration for details.  */
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index f8f30641512..bd6f334d15f 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -266,6 +266,7 @@ struct scalar_cond_masked_key
   void get_cond_ops_from_tree (tree);
 
   unsigned ncopies;
+  bool inverted_p;
   tree_code code;
   tree op0;
   tree op1;
@@ -285,6 +286,7 @@ struct default_hash_traits<scalar_cond_masked_key>
     inchash::add_expr (v.op0, h, 0);
     inchash::add_expr (v.op1, h, 0);
     h.add_int (v.ncopies);
+    h.add_flag (v.inverted_p);
     return h.end ();
   }
 
@@ -292,9 +294,10 @@ struct default_hash_traits<scalar_cond_masked_key>
   equal (value_type existing, value_type candidate)
   {
     return (existing.ncopies == candidate.ncopies
-           && existing.code == candidate.code
-           && operand_equal_p (existing.op0, candidate.op0, 0)
-           && operand_equal_p (existing.op1, candidate.op1, 0));
+	    && existing.code == candidate.code
+	    && existing.inverted_p == candidate.inverted_p
+	    && operand_equal_p (existing.op0, candidate.op0, 0)
+	    && operand_equal_p (existing.op1, candidate.op1, 0));
   }
 
   static const bool empty_zero_p = true;
@@ -303,6 +306,7 @@ struct default_hash_traits<scalar_cond_masked_key>
   mark_empty (value_type &v)
   {
     v.ncopies = 0;
+    v.inverted_p = false;
   }
 
   static inline bool


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-11-10 16:03 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-10 16:03 [gcc r12-5130] AArch64: do not keep negated mask and inverse mask live at the same time Tamar Christina

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).