public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* Remove redundant AND from count reduction loop
@ 2015-06-23 15:42 Richard Sandiford
  2015-06-23 21:36 ` Marc Glisse
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Sandiford @ 2015-06-23 15:42 UTC (permalink / raw)
  To: gcc-patches

We vectorise:

int
f (int *a, int n)
{
  int count = 0;
  for (int i = 0; i < n; ++i)
    if (a[i] < 255)
      count += 1;
  return count;
}

using an add reduction of a VEC_COND_EXPR <COND, {1, 1, ...}, {0, 0, ...}>.
This leads to the main loop having an AND with a loop invariant {1, 1, ...}.
E.g. on aarch64:

        movi    v2.4s, 0x1
.L4:
        lsl     x5, x4, 4
        add     x4, x4, 1
        cmp     w2, w4
        ldr     q1, [x0, x5]
        cmge    v1.4s, v3.4s, v1.4s
        and     v1.16b, v2.16b, v1.16b
        add     v0.4s, v0.4s, v1.4s
        bhi     .L4

This patch converts an ADD of that VEC_COND_EXPR into a SUB of COND:

.L4:
        lsl     x5, x4, 4
        add     x4, x4, 1
        cmp     w2, w4
        ldr     q1, [x0, x5]
        cmge    v1.4s, v2.4s, v1.4s
        sub     v0.4s, v0.4s, v1.4s
        bhi     .L4

At the moment the simplification is done during forwprop4, after the last
dce pass, and so the VEC_COND_EXPR survives until expand.  Richi says
this is a known problem.  Of course, the expression gets deleted by
rtl dce, but it means that a scan-tree-dump of the *.optimized output
can't easily tell that the optimisation has triggered.  I've therefore
added a scan-assembler test instead.

Bootstrapped & regression-tested on x86_64-linux-gnu.  Also tested
on aarch64-elf.  OK to install?

Thanks,
Richard


gcc/
	* match.pd: Add patterns for vec_conds between -1 and 0, and
	between 1 and 0.

gcc/testsuite/
	* gcc.target/aarch64/vect-add-sub-cond.c: New test.

Index: gcc/match.pd
===================================================================
--- gcc/match.pd	2015-06-23 11:42:23.644645975 +0100
+++ gcc/match.pd	2015-06-23 11:42:23.760644655 +0100
@@ -973,6 +973,36 @@ along with GCC; see the file COPYING3.
   (cnd @0 @2 @1)))
 
 
+/* Vector comparisons are defined to produce all-one or all-zero results.  */
+(simplify
+ (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+   (convert @0)))
+
+/* We could instead convert all instances of the vec_cond to negate,
+   but that isn't necessarily a win on its own.  */
+(simplify
+ (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (minus @3 (convert @0))))
+
+(simplify
+ (plus:c @3 (view_convert_expr
+ 	     (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (minus @3 (convert @0))))
+
+(simplify
+ (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (plus @3 (convert @0))))
+
+(simplify
+ (minus @3 (view_convert_expr
+ 	    (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (plus @3 (convert @0))))
+
 /* Simplifications of comparisons.  */
 
 /* We can simplify a logical negation of a comparison to the
Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c
===================================================================
--- /dev/null	2015-06-02 17:27:28.541944012 +0100
+++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c	2015-06-23 12:06:27.120203685 +0100
@@ -0,0 +1,94 @@
+/* Make sure that vector comparison results are not unnecessarily ANDed
+   with vectors of 1.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define COUNT1(X) if (X) count += 1
+#define COUNT2(X) if (X) count -= 1
+#define COUNT3(X) count += (X)
+#define COUNT4(X) count -= (X)
+
+#define COND1(X) (X)
+#define COND2(X) ((X) ? 1 : 0)
+#define COND3(X) ((X) ? -1 : 0)
+#define COND4(X) ((X) ? 0 : 1)
+#define COND5(X) ((X) ? 0 : -1)
+
+#define TEST_LT(X, Y) ((X) < (Y))
+#define TEST_LE(X, Y) ((X) <= (Y))
+#define TEST_GT(X, Y) ((X) > (Y))
+#define TEST_GE(X, Y) ((X) >= (Y))
+#define TEST_EQ(X, Y) ((X) == (Y))
+#define TEST_NE(X, Y) ((X) != (Y))
+
+#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \
+  TYPE \
+  reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    TYPE count = 0; \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      COUNT (TEST (CMP_ARRAY[i], x)); \
+    return count; \
+  }
+
+#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \
+  void \
+  plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \
+  } \
+  void \
+  plusc_##ID (void) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \
+  } \
+  void \
+  minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \
+  } \
+  void \
+  minusc_##ID (void) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 1)); \
+  }
+
+#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \
+  typedef __typeof__(ARRAY[0]) ID##_type; \
+  COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \
+  COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \
+  COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \
+  COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \
+  COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \
+  COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \
+  COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \
+  COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \
+  COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5)
+
+signed int asi[1024] __attribute__ ((aligned (16)));
+unsigned int aui[1024] __attribute__ ((aligned (16)));
+signed long long asl[1024] __attribute__ ((aligned (16)));
+unsigned long long aul[1024] __attribute__ ((aligned (16)));
+float af[1024] __attribute__ ((aligned (16)));
+double ad[1024] __attribute__ ((aligned (16)));
+
+ALL_LOOPS (si_si, aui, asi, TEST_LT)
+ALL_LOOPS (ui_si, aui, asi, TEST_LE)
+ALL_LOOPS (si_ui, aui, asi, TEST_GT)
+ALL_LOOPS (ui_ui, aui, asi, TEST_GE)
+ALL_LOOPS (sl_sl, asl, asl, TEST_NE)
+ALL_LOOPS (ul_ul, aul, aul, TEST_NE)
+ALL_LOOPS (si_f, asi, af, TEST_LE)
+ALL_LOOPS (ui_f, aui, af, TEST_GT)
+ALL_LOOPS (sl_d, asl, ad, TEST_GE)
+ALL_LOOPS (ul_d, aul, ad, TEST_GT)
+
+/* { dg-final { scan-assembler-not "\tand\t" } } */
+/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */
+/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */
+/* { dg-final { scan-assembler "\tldr\tq" } } */
+/* { dg-final { scan-assembler "\tstr\tq" } } */

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2015-06-29  9:14 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-23 15:42 Remove redundant AND from count reduction loop Richard Sandiford
2015-06-23 21:36 ` Marc Glisse
2015-06-24  9:25   ` Richard Biener
2015-06-24  9:59     ` Richard Sandiford
2015-06-24 10:22       ` Richard Biener
2015-06-24 11:29         ` Richard Sandiford
2015-06-24 11:56           ` Richard Biener
2015-06-24 12:37             ` Richard Sandiford
2015-06-24 13:11               ` Richard Biener
2015-06-24 13:53                 ` Richard Sandiford
2015-06-24 14:09                   ` Richard Biener
2015-06-25  8:19                     ` Richard Sandiford
2015-06-25 10:39                       ` Richard Biener
2015-06-25 11:52                         ` Richard Sandiford
2015-06-25 13:17                           ` Richard Biener
2015-06-24 16:42             ` Jeff Law
2015-06-25 22:48         ` Marc Glisse
2015-06-26  9:59           ` Richard Biener
2015-06-28 14:09             ` Marc Glisse
2015-06-29  9:16               ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).