public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* Remove redundant AND from count reduction loop
@ 2015-06-23 15:42 Richard Sandiford
  2015-06-23 21:36 ` Marc Glisse
  0 siblings, 1 reply; 20+ messages in thread
From: Richard Sandiford @ 2015-06-23 15:42 UTC (permalink / raw)
  To: gcc-patches

We vectorise:

int
f (int *a, int n)
{
  int count = 0;
  for (int i = 0; i < n; ++i)
    if (a[i] < 255)
      count += 1;
  return count;
}

using an add reduction of a VEC_COND_EXPR <COND, {1, 1, ...}, {0, 0, ...}>.
This leads to the main loop having an AND with a loop invariant {1, 1, ...}.
E.g. on aarch64:

        movi    v2.4s, 0x1
.L4:
        lsl     x5, x4, 4
        add     x4, x4, 1
        cmp     w2, w4
        ldr     q1, [x0, x5]
        cmge    v1.4s, v3.4s, v1.4s
        and     v1.16b, v2.16b, v1.16b
        add     v0.4s, v0.4s, v1.4s
        bhi     .L4

This patch converts an ADD of that VEC_COND_EXPR into a SUB of COND:

.L4:
        lsl     x5, x4, 4
        add     x4, x4, 1
        cmp     w2, w4
        ldr     q1, [x0, x5]
        cmge    v1.4s, v2.4s, v1.4s
        sub     v0.4s, v0.4s, v1.4s
        bhi     .L4

At the moment the simplification is done during forwprop4, after the last
dce pass, and so the VEC_COND_EXPR survives until expand.  Richi says
this is a known problem.  Of course, the expression gets deleted by
rtl dce, but it means that a scan-tree-dump of the *.optimized output
can't easily tell that the optimisation has triggered.  I've therefore
added a scan-assembler test instead.

Bootstrapped & regression-tested on x86_64-linux-gnu.  Also tested
on aarch64-elf.  OK to install?

Thanks,
Richard


gcc/
	* match.pd: Add patterns for vec_conds between -1 and 0, and
	between 1 and 0.

gcc/testsuite/
	* gcc.target/aarch64/vect-add-sub-cond.c: New test.

Index: gcc/match.pd
===================================================================
--- gcc/match.pd	2015-06-23 11:42:23.644645975 +0100
+++ gcc/match.pd	2015-06-23 11:42:23.760644655 +0100
@@ -973,6 +973,36 @@ along with GCC; see the file COPYING3.
   (cnd @0 @2 @1)))
 
 
+/* Vector comparisons are defined to produce all-one or all-zero results.  */
+(simplify
+ (vec_cond @0 integer_all_onesp@1 integer_zerop@2)
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+   (convert @0)))
+
+/* We could instead convert all instances of the vec_cond to negate,
+   but that isn't necessarily a win on its own.  */
+(simplify
+ (plus:c @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (minus @3 (convert @0))))
+
+(simplify
+ (plus:c @3 (view_convert_expr
+ 	     (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (minus @3 (convert @0))))
+
+(simplify
+ (minus @3 (vec_cond @0 integer_each_onep@1 integer_zerop@2))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (plus @3 (convert @0))))
+
+(simplify
+ (minus @3 (view_convert_expr
+ 	    (vec_cond @0 integer_each_onep@1 integer_zerop@2)))
+ (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+  (plus @3 (convert @0))))
+
 /* Simplifications of comparisons.  */
 
 /* We can simplify a logical negation of a comparison to the
Index: gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c
===================================================================
--- /dev/null	2015-06-02 17:27:28.541944012 +0100
+++ gcc/testsuite/gcc.target/aarch64/vect-add-sub-cond.c	2015-06-23 12:06:27.120203685 +0100
@@ -0,0 +1,94 @@
+/* Make sure that vector comparison results are not unnecessarily ANDed
+   with vectors of 1.  */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define COUNT1(X) if (X) count += 1
+#define COUNT2(X) if (X) count -= 1
+#define COUNT3(X) count += (X)
+#define COUNT4(X) count -= (X)
+
+#define COND1(X) (X)
+#define COND2(X) ((X) ? 1 : 0)
+#define COND3(X) ((X) ? -1 : 0)
+#define COND4(X) ((X) ? 0 : 1)
+#define COND5(X) ((X) ? 0 : -1)
+
+#define TEST_LT(X, Y) ((X) < (Y))
+#define TEST_LE(X, Y) ((X) <= (Y))
+#define TEST_GT(X, Y) ((X) > (Y))
+#define TEST_GE(X, Y) ((X) >= (Y))
+#define TEST_EQ(X, Y) ((X) == (Y))
+#define TEST_NE(X, Y) ((X) != (Y))
+
+#define COUNT_LOOP(ID, TYPE, CMP_ARRAY, TEST, COUNT) \
+  TYPE \
+  reduc_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    TYPE count = 0; \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      COUNT (TEST (CMP_ARRAY[i], x)); \
+    return count; \
+  }
+
+#define COND_LOOP(ID, ARRAY, CMP_ARRAY, TEST, COND) \
+  void \
+  plus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], x)); \
+  } \
+  void \
+  plusc_##ID (void) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 10)); \
+  } \
+  void \
+  minus_##ID (__typeof__ (CMP_ARRAY[0]) x) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] -= COND (TEST (CMP_ARRAY[i], x)); \
+  } \
+  void \
+  minusc_##ID (void) \
+  { \
+    for (unsigned int i = 0; i < 1024; ++i) \
+      ARRAY[i] += COND (TEST (CMP_ARRAY[i], 1)); \
+  }
+
+#define ALL_LOOPS(ID, ARRAY, CMP_ARRAY, TEST) \
+  typedef __typeof__(ARRAY[0]) ID##_type; \
+  COUNT_LOOP (ID##_1, ID##_type, CMP_ARRAY, TEST, COUNT1) \
+  COUNT_LOOP (ID##_2, ID##_type, CMP_ARRAY, TEST, COUNT2) \
+  COUNT_LOOP (ID##_3, ID##_type, CMP_ARRAY, TEST, COUNT3) \
+  COUNT_LOOP (ID##_4, ID##_type, CMP_ARRAY, TEST, COUNT4) \
+  COND_LOOP (ID##_1, ARRAY, CMP_ARRAY, TEST, COND1) \
+  COND_LOOP (ID##_2, ARRAY, CMP_ARRAY, TEST, COND2) \
+  COND_LOOP (ID##_3, ARRAY, CMP_ARRAY, TEST, COND3) \
+  COND_LOOP (ID##_4, ARRAY, CMP_ARRAY, TEST, COND4) \
+  COND_LOOP (ID##_5, ARRAY, CMP_ARRAY, TEST, COND5)
+
+signed int asi[1024] __attribute__ ((aligned (16)));
+unsigned int aui[1024] __attribute__ ((aligned (16)));
+signed long long asl[1024] __attribute__ ((aligned (16)));
+unsigned long long aul[1024] __attribute__ ((aligned (16)));
+float af[1024] __attribute__ ((aligned (16)));
+double ad[1024] __attribute__ ((aligned (16)));
+
+ALL_LOOPS (si_si, aui, asi, TEST_LT)
+ALL_LOOPS (ui_si, aui, asi, TEST_LE)
+ALL_LOOPS (si_ui, aui, asi, TEST_GT)
+ALL_LOOPS (ui_ui, aui, asi, TEST_GE)
+ALL_LOOPS (sl_sl, asl, asl, TEST_NE)
+ALL_LOOPS (ul_ul, aul, aul, TEST_NE)
+ALL_LOOPS (si_f, asi, af, TEST_LE)
+ALL_LOOPS (ui_f, aui, af, TEST_GT)
+ALL_LOOPS (sl_d, asl, ad, TEST_GE)
+ALL_LOOPS (ul_d, aul, ad, TEST_GT)
+
+/* { dg-final { scan-assembler-not "\tand\t" } } */
+/* { dg-final { scan-assembler-not "\tld\[^\t\]*\t\[wx\]" } } */
+/* { dg-final { scan-assembler-not "\tst\[^\t\]*\t\[wx\]" } } */
+/* { dg-final { scan-assembler "\tldr\tq" } } */
+/* { dg-final { scan-assembler "\tstr\tq" } } */

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2015-06-29  9:14 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-23 15:42 Remove redundant AND from count reduction loop Richard Sandiford
2015-06-23 21:36 ` Marc Glisse
2015-06-24  9:25   ` Richard Biener
2015-06-24  9:59     ` Richard Sandiford
2015-06-24 10:22       ` Richard Biener
2015-06-24 11:29         ` Richard Sandiford
2015-06-24 11:56           ` Richard Biener
2015-06-24 12:37             ` Richard Sandiford
2015-06-24 13:11               ` Richard Biener
2015-06-24 13:53                 ` Richard Sandiford
2015-06-24 14:09                   ` Richard Biener
2015-06-25  8:19                     ` Richard Sandiford
2015-06-25 10:39                       ` Richard Biener
2015-06-25 11:52                         ` Richard Sandiford
2015-06-25 13:17                           ` Richard Biener
2015-06-24 16:42             ` Jeff Law
2015-06-25 22:48         ` Marc Glisse
2015-06-26  9:59           ` Richard Biener
2015-06-28 14:09             ` Marc Glisse
2015-06-29  9:16               ` Richard Biener

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).