public inbox for gcc-patches@gcc.gnu.org
* [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
@ 2020-11-11 10:17 Philipp Tomsich
  2020-11-11 10:30 ` Jakub Jelinek
  0 siblings, 1 reply; 11+ messages in thread
From: Philipp Tomsich @ 2020-11-11 10:17 UTC (permalink / raw)
  To: gcc-patches; +Cc: Philipp Tomsich

From: Philipp Tomsich <prt@gnu.org>

The function
    long f(long a)
    {
    	return(a & 0xFFFFFFFFull) << 3;
    }
is folded into
    _1 = a_2(D) << 3;
    _3 = _1 & 34359738360;
whereas the construction
    return (a & 0xFFFFFFFFull) * 8;
results in
    _1 = a_2(D) & 4294967295;
    _3 = _1 * 8;

This leads to suboptimal code generation for RISC-V (-march=rv64g), as
the shifted constant needs to be expanded into 3 RTX, plus 2 RTX (one
each for the LSHIFT_EXPR and the BIT_AND_EXPR), which overwhelms the
combine pass (sequences of 5 RTX are not considered):
	li	a5,1		# tmp78,	# 23	[c=4 l=4]  *movdi_64bit/1
	slli	a5,a5,35	#, tmp79, tmp78	# 24	[c=4 l=4]  ashldi3
	addi	a5,a5,-8	#, tmp77, tmp79	# 9	[c=4 l=4]  adddi3/1
	slli	a0,a0,3		#, tmp76, tmp80	# 6	[c=4 l=4]  ashldi3
	and	a0,a0,a5	# tmp77,, tmp76	# 15	[c=4 l=4]  anddi3/0
	ret			# 28	[c=0 l=4]  simple_return
instead of:
	slli	a0,a0,32	#, tmp76, tmp79	# 26	[c=4 l=4]  ashldi3
	srli	a0,a0,29	#,, tmp76	# 27	[c=4 l=4]  lshrdi3
	ret			    		# 24	[c=0 l=4]  simple_return

We address this by adding a simplification for
   (a << s) & M, where ((M >> s) << s) == M
to
   (a & M_unshifted) << s, where M_unshifted := (M >> s)
which undistributes the LSHIFT.
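
Expressed as plain C, the rewrite and its guard look roughly like this
(a hand-written sketch for illustration, not the match.pd rule itself;
the names are made up):

    /* The guard mirrors the ((M >> s) << s) == M condition above; the
       patch additionally restricts the rewrite to masks where
       (M >> s) + 1 is a power of two.  */
    unsigned long
    undistribute_shift (unsigned long a, unsigned s, unsigned long m)
    {
      if (((m >> s) << s) == m)       /* no mask bits are lost by unshifting */
        return (a & (m >> s)) << s;   /* (a & M_unshifted) << s */
      return (a << s) & m;            /* the original form */
    }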

Signed-off-by: Philipp Tomsich <prt@gnu.org>
---
 gcc/match.pd                            | 11 +++++++++--
 gcc/testsuite/gcc.target/riscv/zextws.c | 18 ++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/riscv/zextws.c

diff --git a/gcc/match.pd b/gcc/match.pd
index 349eab6..6bb9535 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3079,6 +3079,12 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 	     }
 	 }
      }
+    (if (GIMPLE && (((mask >> shiftc) << shiftc) == mask)
+	        && (exact_log2((mask >> shiftc) + 1) >= 0)
+	        && (shift == LSHIFT_EXPR))
+	 (with
+	  { tree newmaskt = build_int_cst_type(TREE_TYPE (@2), mask >> shiftc); }
+	  (shift (convert (bit_and:shift_type (convert @0) { newmaskt; })) @1))
      /* ((X << 16) & 0xff00) is (X, 0).  */
      (if ((mask & zerobits) == mask)
       { build_int_cst (type, 0); }
@@ -3100,7 +3106,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
 	   (if (!tree_int_cst_equal (newmaskt, @2))
 	    (if (shift_type != TREE_TYPE (@3))
 	     (bit_and (convert (shift:shift_type (convert @3) @1)) { newmaskt; })
-	     (bit_and @4 { newmaskt; })))))))))))))
+	     (bit_and @4 { newmaskt; }))))))))))))))
 
 /* Fold (X {&,^,|} C2) << C1 into (X << C1) {&,^,|} (C2 << C1)
    (X {&,^,|} C2) >> C1 into (X >> C1) & (C2 >> C1).  */
@@ -3108,7 +3114,8 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
  (for bit_op (bit_and bit_xor bit_ior)
   (simplify
    (shift (convert?:s (bit_op:s @0 INTEGER_CST@2)) INTEGER_CST@1)
-   (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
+   (if (tree_nop_conversion_p (type, TREE_TYPE (@0))
+        && !wi::exact_log2(wi::to_wide(@2) + 1))
     (with { tree mask = int_const_binop (shift, fold_convert (type, @2), @1); }
      (bit_op (shift (convert @0) @1) { mask; }))))))
 
diff --git a/gcc/testsuite/gcc.target/riscv/zextws.c b/gcc/testsuite/gcc.target/riscv/zextws.c
new file mode 100644
index 0000000..8ac93f6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/riscv/zextws.c
@@ -0,0 +1,18 @@
+/* { dg-do compile } */
+/* { dg-options "-march=rv64g -mabi=lp64 -O2" } */
+
+/* Test for
+     (a << s) & M, where ((M >> s) << s) == M
+   being undistributed into
+     (a & M_unshifted) << s, where M_unshifted := (M >> s)
+   to produce the sequence (or similar)
+     slli	a0,a0,32
+     srli	a0,a0,29
+*/
+long
+zextws_mask (long i)
+{
+  return (i & 0xffffffffULL) << 3;
+}
+/* { dg-final { scan-assembler "slli" } } */
+/* { dg-final { scan-assembler "srli" } } */
-- 
1.8.3.1



* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 10:17 [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1) Philipp Tomsich
@ 2020-11-11 10:30 ` Jakub Jelinek
  2020-11-11 10:43   ` Philipp Tomsich
  0 siblings, 1 reply; 11+ messages in thread
From: Jakub Jelinek @ 2020-11-11 10:30 UTC (permalink / raw)
  To: Philipp Tomsich; +Cc: gcc-patches, Philipp Tomsich

On Wed, Nov 11, 2020 at 11:17:32AM +0100, Philipp Tomsich wrote:
> From: Philipp Tomsich <prt@gnu.org>
> 
> The function
>     long f(long a)
>     {
>     	return(a & 0xFFFFFFFFull) << 3;
>     }
> is folded into
>     _1 = a_2(D) << 3;
>     _3 = _1 & 34359738360;
> whereas the construction
>     return (a & 0xFFFFFFFFull) * 8;
> results in
>     _1 = a_2(D) & 4294967295;
>     _3 = _1 * 8;
> 
> This leads to suboptimal code generation for RISC-V (-march=rv64g), as
> the shifted constant needs to be expanded into 3 RTX, plus 2 RTX (one
> each for the LSHIFT_EXPR and the BIT_AND_EXPR), which overwhelms the
> combine pass (sequences of 5 RTX are not considered):
> 	li	a5,1		# tmp78,	# 23	[c=4 l=4]  *movdi_64bit/1
> 	slli	a5,a5,35	#, tmp79, tmp78	# 24	[c=4 l=4]  ashldi3
> 	addi	a5,a5,-8	#, tmp77, tmp79	# 9	[c=4 l=4]  adddi3/1
> 	slli	a0,a0,3		#, tmp76, tmp80	# 6	[c=4 l=4]  ashldi3
> 	and	a0,a0,a5	# tmp77,, tmp76	# 15	[c=4 l=4]  anddi3/0
> 	ret			# 28	[c=0 l=4]  simple_return
> instead of:
> 	slli	a0,a0,32	#, tmp76, tmp79	# 26	[c=4 l=4]  ashldi3
> 	srli	a0,a0,29	#,, tmp76	# 27	[c=4 l=4]  lshrdi3
> 	ret			    		# 24	[c=0 l=4]  simple_return
> 
> We address this by adding a simplification for
>    (a << s) & M, where ((M >> s) << s) == M
> to
>    (a & M_unshifted) << s, where M_unshifted := (M >> s)
> which undistributes the LSHIFT.

This is problematic, we have another rule that goes against this:
/* Fold (X {&,^,|} C2) << C1 into (X << C1) {&,^,|} (C2 << C1)
   (X {&,^,|} C2) >> C1 into (X >> C1) & (C2 >> C1).  */
(for shift (lshift rshift)
 (for bit_op (bit_and bit_xor bit_ior)
  (simplify
   (shift (convert?:s (bit_op:s @0 INTEGER_CST@2)) INTEGER_CST@1)
   (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
    (with { tree mask = int_const_binop (shift, fold_convert (type, @2), @1); }
     (bit_op (shift (convert @0) @1) { mask; }))))))
and we don't want the two rules to keep fighting against each other.
It is better to have one form as canonical and only right before expansion
(isel pass) or during expansion decide e.g. based on target costs
whether that (X << C1) & (C2 << C1) is better expanded like that,
or as (X & C2) << C1, or as (X << C3) >> C4.
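
For the example at hand, the three shapes would be (hand-written here
just to illustrate the choice; with C1 = 3 and C2 = 0xffffffff, the
two-shift variant uses C3 = 32 and C4 = 29 on a 64-bit type):

unsigned long long
shift_then_mask (unsigned long long x)
{
  return (x << 3) & 0x7fffffff8ULL;   /* (X << C1) & (C2 << C1) */
}

unsigned long long
mask_then_shift (unsigned long long x)
{
  return (x & 0xffffffffULL) << 3;    /* (X & C2) << C1 */
}

unsigned long long
two_shifts (unsigned long long x)
{
  return (x << 32) >> 29;             /* (X << C3) >> C4 */
}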

	Jakub



* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 10:30 ` Jakub Jelinek
@ 2020-11-11 10:43   ` Philipp Tomsich
  2020-11-11 10:55     ` Jakub Jelinek
  0 siblings, 1 reply; 11+ messages in thread
From: Philipp Tomsich @ 2020-11-11 10:43 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Philipp Tomsich, gcc-patches, Philipp Tomsich

Jakub,

On Wed, 11 Nov 2020 at 11:31, Jakub Jelinek <jakub@redhat.com> wrote:
>
> On Wed, Nov 11, 2020 at 11:17:32AM +0100, Philipp Tomsich wrote:
> > From: Philipp Tomsich <prt@gnu.org>
> >
> > The function
> >     long f(long a)
> >     {
> >       return(a & 0xFFFFFFFFull) << 3;
> >     }
> > is folded into
> >     _1 = a_2(D) << 3;
> >     _3 = _1 & 34359738360;
> > whereas the construction
> >     return (a & 0xFFFFFFFFull) * 8;
> > results in
> >     _1 = a_2(D) & 4294967295;
> >     _3 = _1 * 8;
> >
> > This leads to suboptimal code generation for RISC-V (-march=rv64g), as
> > the shifted constant needs to be expanded into 3 RTX, plus 2 RTX (one
> > each for the LSHIFT_EXPR and the BIT_AND_EXPR), which overwhelms the
> > combine pass (sequences of 5 RTX are not considered):
> >       li      a5,1            # tmp78,        # 23    [c=4 l=4]  *movdi_64bit/1
> >       slli    a5,a5,35        #, tmp79, tmp78 # 24    [c=4 l=4]  ashldi3
> >       addi    a5,a5,-8        #, tmp77, tmp79 # 9     [c=4 l=4]  adddi3/1
> >       slli    a0,a0,3         #, tmp76, tmp80 # 6     [c=4 l=4]  ashldi3
> >       and     a0,a0,a5        # tmp77,, tmp76 # 15    [c=4 l=4]  anddi3/0
> >       ret                     # 28    [c=0 l=4]  simple_return
> > instead of:
> >       slli    a0,a0,32        #, tmp76, tmp79 # 26    [c=4 l=4]  ashldi3
> >       srli    a0,a0,29        #,, tmp76       # 27    [c=4 l=4]  lshrdi3
> >       ret                                     # 24    [c=0 l=4]  simple_return
> >
> > We address this by adding a simplification for
> >    (a << s) & M, where ((M >> s) << s) == M
> > to
> >    (a & M_unshifted) << s, where M_unshifted := (M >> s)
> > which undistributes the LSHIFT.
>
> This is problematic, we have another rule that goes against this:
> /* Fold (X {&,^,|} C2) << C1 into (X << C1) {&,^,|} (C2 << C1)
>    (X {&,^,|} C2) >> C1 into (X >> C1) & (C2 >> C1).  */
> (for shift (lshift rshift)
>  (for bit_op (bit_and bit_xor bit_ior)
>   (simplify
>    (shift (convert?:s (bit_op:s @0 INTEGER_CST@2)) INTEGER_CST@1)
>    (if (tree_nop_conversion_p (type, TREE_TYPE (@0)))
>     (with { tree mask = int_const_binop (shift, fold_convert (type, @2), @1); }
>      (bit_op (shift (convert @0) @1) { mask; }))))))
> and we don't want the two rules to keep fighting against each other.
> It is better to have one form as canonical and only right before expansion
> (isel pass) or during expansion decide e.g. based on target costs
> whether that (X << C1) & (C2 << C1) is better expanded like that,
> or as (X & C2) << C1, or as (X << C3) >> C4.


The patch addresses this by disallowing that rule, if an exact power-of-2 is
seen as C1.  The reason why I would prefer to have this canonicalised the
same way the (X & C1) * C2 is canonicalised, is that cleaning this up during
combine is more difficult on some architectures that require multiple insns
to represent the shifted constant (i.e. C1 << C2).

Given that this maps back to another (already used) canonical form, it seems
straightforward enough — assuming that the additional complexity (i.e. an
additional rule and an additional condition for one of the other
rules) is acceptable.

Philipp.


* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 10:43   ` Philipp Tomsich
@ 2020-11-11 10:55     ` Jakub Jelinek
  2020-11-11 18:17       ` Jeff Law
  2020-11-11 21:43       ` Jim Wilson
  0 siblings, 2 replies; 11+ messages in thread
From: Jakub Jelinek @ 2020-11-11 10:55 UTC (permalink / raw)
  To: Philipp Tomsich; +Cc: Philipp Tomsich, gcc-patches, Philipp Tomsich

On Wed, Nov 11, 2020 at 11:43:34AM +0100, Philipp Tomsich wrote:
> The patch addresses this by disallowing that rule, if an exact power-of-2 is
> seen as C1.  The reason why I would prefer to have this canonicalised the
> same way the (X & C1) * C2 is canonicalised, is that cleaning this up during
> combine is more difficult on some architectures that require multiple insns
> to represent the shifted constant (i.e. C1 << C2).

It is bad to have many exceptions for the canonicalization
and it is unclear why exactly these were chosen, and it doesn't really deal
with say:
(x & 0xabcdef12ULL) << 13
being less expensive on some targets than
(x << 13) & (0xabcdef12ULL << 13).
(x & 0x7ffff) << 3 vs. (x << 3) & 0x3ffff8 on the other side is a wash on
many targets.
As I said, it is better to decide which one is better before or during
expansion based on target costs, sure, combine can't catch everything.

Also, the patch formatting was incorrect in several ways (indentation,
missing space before ( when calling functions, etc.).

	Jakub



* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 10:55     ` Jakub Jelinek
@ 2020-11-11 18:17       ` Jeff Law
  2020-11-11 19:53         ` Philipp Tomsich
  2020-11-11 21:43       ` Jim Wilson
  1 sibling, 1 reply; 11+ messages in thread
From: Jeff Law @ 2020-11-11 18:17 UTC (permalink / raw)
  To: Jakub Jelinek, Philipp Tomsich
  Cc: gcc-patches, Philipp Tomsich, Philipp Tomsich


On 11/11/20 3:55 AM, Jakub Jelinek via Gcc-patches wrote:
> On Wed, Nov 11, 2020 at 11:43:34AM +0100, Philipp Tomsich wrote:
>> The patch addresses this by disallowing that rule, if an exact power-of-2 is
>> seen as C1.  The reason why I would prefer to have this canonicalised the
>> same way the (X & C1) * C2 is canonicalised, is that cleaning this up during
>> combine is more difficult on some architectures that require multiple insns
>> to represent the shifted constant (i.e. C1 << C2).
> It is bad to have many exceptions for the canonicalization
> and it is unclear why exactly these were chosen, and it doesn't really deal
> with say:
> (x & 0xabcdef12ULL) << 13
> being less expensive on some targets than
> (x << 13) & (0xabcdef12ULL << 13).
> (x & 0x7ffff) << 3 vs. (x << 3) & 0x3ffff8 on the other side is a wash on
> many targets.
> As I said, it is better to decide which one is better before or during
> expansion based on target costs, sure, combine can't catch everything.

I think Jakub is hitting a key point here.  Gimple should canonicalize
on what is simpler from a gimple standpoint, not what is better for some
set of targets.   Target dependencies like this shouldn't be introduced
until expansion time.


Jeff




* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 18:17       ` Jeff Law
@ 2020-11-11 19:53         ` Philipp Tomsich
  2020-11-11 19:59           ` Jakub Jelinek
  0 siblings, 1 reply; 11+ messages in thread
From: Philipp Tomsich @ 2020-11-11 19:53 UTC (permalink / raw)
  To: Jeff Law; +Cc: Jakub Jelinek, Philipp Tomsich, gcc-patches, Philipp Tomsich

On Wed, 11 Nov 2020 at 19:17, Jeff Law <law@redhat.com> wrote:
>
>
> On 11/11/20 3:55 AM, Jakub Jelinek via Gcc-patches wrote:
> > On Wed, Nov 11, 2020 at 11:43:34AM +0100, Philipp Tomsich wrote:
> >> The patch addresses this by disallowing that rule, if an exact power-of-2 is
> >> seen as C1.  The reason why I would prefer to have this canonicalised the
> >> same way the (X & C1) * C2 is canonicalised, is that cleaning this up during
> >> combine is more difficult on some architectures that require multiple insns
> >> to represent the shifted constant (i.e. C1 << C2).
> > It is bad to have many exceptions for the canonicalization
> > and it is unclear why exactly these were chosen, and it doesn't really deal
> > with say:
> > (x & 0xabcdef12ULL) << 13
> > being less expensive on some targets than
> > (x << 13) & (0xabcdef12ULL << 13).
> > (x & 0x7ffff) << 3 vs. (x << 3) & 0x3ffff8 on the other side is a wash on
> > many targets.
> > As I said, it is better to decide which one is better before or during
> > expansion based on target costs, sure, combine can't catch everything.
>
> I think Jakub is hitting a key point here.  Gimple should canonicalize
> on what is simpler from a gimple standpoint, not what is better for some
> set of targets.   Target dependencies like this shouldn't be introduced
> until expansion time.

The simplification that distributes the shift (i.e. the one that Jakub referred
to as fighting the new rule) is also run after GIMPLE has been expanded to
RTX.  In my understanding, this still implies that even if we have a cost-aware
expansion, this existing rule will nonetheless distribute the shift.

Philipp.


* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 19:53         ` Philipp Tomsich
@ 2020-11-11 19:59           ` Jakub Jelinek
  2020-11-11 20:04             ` Philipp Tomsich
  0 siblings, 1 reply; 11+ messages in thread
From: Jakub Jelinek @ 2020-11-11 19:59 UTC (permalink / raw)
  To: Philipp Tomsich; +Cc: Jeff Law, Philipp Tomsich, gcc-patches, Philipp Tomsich

On Wed, Nov 11, 2020 at 08:53:56PM +0100, Philipp Tomsich wrote:
> > On 11/11/20 3:55 AM, Jakub Jelinek via Gcc-patches wrote:
> > > On Wed, Nov 11, 2020 at 11:43:34AM +0100, Philipp Tomsich wrote:
> > >> The patch addresses this by disallowing that rule, if an exact power-of-2 is
> > >> seen as C1.  The reason why I would prefer to have this canonicalised the
> > >> same way the (X & C1) * C2 is canonicalised, is that cleaning this up during
> > >> combine is more difficult on some architectures that require multiple insns
> > >> to represent the shifted constant (i.e. C1 << C2).
> > > It is bad to have many exceptions for the canonicalization
> > > and it is unclear why exactly these were chosen, and it doesn't really deal
> > > with say:
> > > (x & 0xabcdef12ULL) << 13
> > > being less expensive on some targets than
> > > (x << 13) & (0xabcdef12ULL << 13).
> > > (x & 0x7ffff) << 3 vs. (x << 3) & 0x3ffff8 on the other side is a wash on
> > > many targets.
> > > As I said, it is better to decide which one is better before or during
> > > expansion based on target costs, sure, combine can't catch everything.
> >
> > I think Jakub is hitting a key point here.  Gimple should canonicalize
> > on what is simpler from a gimple standpoint, not what is better for some
> > set of targets.   Target dependencies like this shouldn't be introduced
> > until expansion time.
> 
> The simplification that distributes the shift (i.e. the one that Jakub referred
> to as fighting the new rule) is also run after GIMPLE has been expanded to
> RTX.  In my understanding, this still implies that even if we have a cost-aware
> expansion, this existing rule will nonetheless distribute the shift.

At the RTL level, such simplifications should not happen if it is
against costs (e.g. combine but various other passes too check costs and
punt if the new code would be more costly than the old one).

	Jakub



* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 19:59           ` Jakub Jelinek
@ 2020-11-11 20:04             ` Philipp Tomsich
  2020-11-11 20:13               ` Jakub Jelinek
  0 siblings, 1 reply; 11+ messages in thread
From: Philipp Tomsich @ 2020-11-11 20:04 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Jeff Law, Philipp Tomsich, gcc-patches, Philipp Tomsich

On Wed, 11 Nov 2020 at 20:59, Jakub Jelinek <jakub@redhat.com> wrote:
> >
> > The simplification that distributes the shift (i.e. the one that Jakub referred
> > to as fighting the new rule) is also run after GIMPLE has been expanded to
> > RTX.  In my understanding, this still implies that even if we have a cost-aware
> > expansion, this existing rule will nonetheless distribute the shift.
>
> At the RTL level, such simplifications should not happen if it is
> against costs (e.g. combine but various other passes too check costs and
> punt if the new code would be more costly than the old one).

I agree.
Let me go back and investigate if the cost-model is misreading things, before we
continue the discussion.

Philipp.


* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 20:04             ` Philipp Tomsich
@ 2020-11-11 20:13               ` Jakub Jelinek
  0 siblings, 0 replies; 11+ messages in thread
From: Jakub Jelinek @ 2020-11-11 20:13 UTC (permalink / raw)
  To: Philipp Tomsich; +Cc: Jeff Law, Philipp Tomsich, gcc-patches, Philipp Tomsich

On Wed, Nov 11, 2020 at 09:04:28PM +0100, Philipp Tomsich wrote:
> On Wed, 11 Nov 2020 at 20:59, Jakub Jelinek <jakub@redhat.com> wrote:
> > >
> > > The simplification that distributes the shift (i.e. the one that Jakub referred
> > > to as fighting the new rule) is also run after GIMPLE has been expanded to
> > > RTX.  In my understanding, this still implies that even if we have a cost-aware
> > > expansion, this existing rule will nonetheless distribute the shift.
> >
> > At the RTL level, such simplifications should not happen if it is
> > against costs (e.g. combine but various other passes too check costs and
> > punt if the new code would be more costly than the old one).
> 
> I agree.
> Let me go back and investigate if the cost-model is misreading things, before we
> continue the discussion.

If it is on some targets, surely it should be fixed.
E.g. x86_64 certainly has different costs for constants that fit into
instruction's immediates and for constants that need to be loaded into
registers first.
I know various other targets have much more complex sequences to compute
certain constants in registers though.
sparc costs surprise me:
    case CONST_INT:
      if (SMALL_INT (x))
        *total = 0;
      else
        *total = 2;
because for the else branch I'd have expected better analysis of the constant
to see how many instructions are needed for it (I think the maximum is 6).

	Jakub



* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 10:55     ` Jakub Jelinek
  2020-11-11 18:17       ` Jeff Law
@ 2020-11-11 21:43       ` Jim Wilson
  2020-11-11 22:06         ` Jakub Jelinek
  1 sibling, 1 reply; 11+ messages in thread
From: Jim Wilson @ 2020-11-11 21:43 UTC (permalink / raw)
  To: Jakub Jelinek
  Cc: Philipp Tomsich, GCC Patches, Philipp Tomsich, Philipp Tomsich

On Wed, Nov 11, 2020 at 2:55 AM Jakub Jelinek via Gcc-patches
<gcc-patches@gcc.gnu.org> wrote:

> On Wed, Nov 11, 2020 at 11:43:34AM +0100, Philipp Tomsich wrote:
> > The patch addresses this by disallowing that rule, if an exact
> power-of-2 is
> > seen as C1.  The reason why I would prefer to have this canonicalised the
> > same way the (X & C1) * C2 is canonicalised, is that cleaning this up
> during
> > combine is more difficult on some architectures that require multiple
> insns
> > to represent the shifted constant (i.e. C1 << C2).
>
> As I said, it is better to decide which one is better before or during
> expansion based on target costs, sure, combine can't catch everything.
>

It could be fixed in combine if we allowed 4 instructions to be split into 3.
We allow combinations of 4 insns, but we only allow splits into 2 insns as
far as I know.

Trying 7, 8, 9 -> 10:
    7: r80:DI=0x1
    8: r81:DI=r80:DI<<0x23
      REG_DEAD r80:DI
      REG_EQUAL 0x800000000
    9: r79:DI=r81:DI-0x8
      REG_DEAD r81:DI
      REG_EQUAL 0x7fffffff8
   10: r77:DI=r78:DI&r79:DI
      REG_DEAD r79:DI
      REG_DEAD r78:DI
Failed to match this instruction:
(set (reg:DI 77)
    (and:DI (reg:DI 78)
        (const_int 34359738360 [0x7fffffff8])))

The AND operation can be implemented with 3 shifts: a left shift to clear
the upper bits, a right shift to clear the lower bits, and then another
left shift to move it back into position.  We are then left with 4 shifts,
and we can have a combiner pattern to match those 4 shifts and reduce them
to 2.  But this would require combine.c changes to work.  Unless maybe we
don't split the pattern into 3 insns and accept it as is, but there is a
risk that this could result in worse code.
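
To make the three-shift idea concrete (a sketch assuming the mask
0x7fffffff8 from above, i.e. bits 3..34 set; the shift counts are
specific to that mask):

unsigned long long
and_via_shifts (unsigned long long x)
{
  x <<= 29;        /* left shift: clear everything above bit 34 */
  x >>= 32;        /* right shift: clear what were bits 0..2    */
  return x << 3;   /* left shift: move the kept bits back       */
}

Feeding the (x << 3) from the original expression into this gives four
shifts in a row, which fold down to the two-shift slli/srli sequence.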

Or alternatively, maybe we could have an ANDDI3 expander which accepts mask
constants like this and emits the 3 shifts directly instead of forcing
the constant to a register.  Then we just need to be able to recognize that
these 3 shifts plus the fourth one can be combined into 2 shifts which
might work already, and if not it should be a simple combiner pattern.
This doesn't help if the same RTL can be created after initial RTL
expansion.

With the draft B extension we do this operation with a single instruction,
but we should get this right for systems without the B extension also.

Jim


* Re: [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1)
  2020-11-11 21:43       ` Jim Wilson
@ 2020-11-11 22:06         ` Jakub Jelinek
  0 siblings, 0 replies; 11+ messages in thread
From: Jakub Jelinek @ 2020-11-11 22:06 UTC (permalink / raw)
  To: Jim Wilson; +Cc: GCC Patches, Philipp Tomsich, Philipp Tomsich, Philipp Tomsich

On Wed, Nov 11, 2020 at 01:43:00PM -0800, Jim Wilson wrote:
> > On Wed, Nov 11, 2020 at 11:43:34AM +0100, Philipp Tomsich wrote:
> > > The patch addresses this by disallowing that rule, if an exact
> > power-of-2 is
> > > seen as C1.  The reason why I would prefer to have this canonicalised the
> > > same way the (X & C1) * C2 is canonicalised, is that cleaning this up
> > during
> > > combine is more difficult on some architectures that require multiple
> > insns
> > > to represent the shifted constant (i.e. C1 << C2).
> >
> > As I said, it is better to decide which one is better before or during
> > expansion based on target costs, sure, combine can't catch everything.
> >
> 
> it could be fixed in combine if we allowed 4 instructions to split into 3.
> We allow combinations of 4 insns, but we only allow splits into 2 insns as
> far as I know.

If the combiner can do it, good, but I think it would still be helpful for
many targets to decide it during expansion.  We have TER, so during
expansion of a BIT_AND_EXPR with a constant second operand we can check
whether the other operand is a left shift and then, based on the exact
mask, decide whether it is useful to try different possibilities and ask
for their costs (try (x & c1) << c2, (x << c3) & c4 and perhaps
(x << c5) >> c6).
We already have other cases where during expansion we try multiple things
and compare their costs.  E.g. for division and modulo, if we know that the
most significant bit is clear on both of the operands, we can expand as both
signed or unsigned division/modulo and so we try both and pick the one
which is less costly on the target.  Similarly, I think we could do it
for right shifts, if we know the most significant bit is clear, we can
expand it as both arithmetic and logical right shift, so we can ask the
target what it prefers cost-wise.

E.g. on x86_64
unsigned long long
foo (unsigned long long x)
{
  return (x << 47) >> 17;
}

unsigned long long
bar (unsigned long long x)
{
  return (x & 0x1ffff) << 30;
}

unsigned long long
baz (unsigned long long x)
{
  unsigned long long y;
  __asm ("" : "=r" (y) : "0" (x & 0x1ffff));
  return y << 30;
}

unsigned long long
qux (unsigned long long x)
{
  return (x << 30) & (0x1ffffULL << 30);
}
foo is 4 instructions / 12 bytes, baz is 4 instructions / 13 bytes, and
bar/qux are 5 instructions / 21 bytes.  Which of foo or baz is faster
would need to be benchmarked, and I have no idea what the rtx costs would
say, but bar/qux certainly look more costly and I'm quite sure the rtx
costs would say that too.

The reason for the match.pd canonicalization is, I bet, that it wants to
put multiple shifts adjacent to each other (and similarly the masks), so
that something like
(((x & c1) << c2) & c3) << c4
can be simplified into just one shift and one masking; having the
canonicalization go one way for some constant/shift pairs and another way
for others wouldn't achieve that.
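
As a concrete (made-up) instance of that collapse:

unsigned long long
chained (unsigned long long x)
{
  return (((x & 0xff) << 4) & 0xf00) << 8;   /* two masks, two shifts */
}

unsigned long long
collapsed (unsigned long long x)
{
  return (x << 12) & 0xf0000;                /* one shift, one mask */
}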

	Jakub



end of thread

Thread overview: 11+ messages
2020-11-11 10:17 [PATCH] match.pd: undistribute (a << s) & C, when C = (M << s) and exact_log2(M - 1) Philipp Tomsich
2020-11-11 10:30 ` Jakub Jelinek
2020-11-11 10:43   ` Philipp Tomsich
2020-11-11 10:55     ` Jakub Jelinek
2020-11-11 18:17       ` Jeff Law
2020-11-11 19:53         ` Philipp Tomsich
2020-11-11 19:59           ` Jakub Jelinek
2020-11-11 20:04             ` Philipp Tomsich
2020-11-11 20:13               ` Jakub Jelinek
2020-11-11 21:43       ` Jim Wilson
2020-11-11 22:06         ` Jakub Jelinek
