public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] combine: Narrow comparison of memory and constant
@ 2023-06-12  7:57 Stefan Schulze Frielinghaus
  2023-06-12 21:29 ` Jeff Law
  0 siblings, 1 reply; 11+ messages in thread
From: Stefan Schulze Frielinghaus @ 2023-06-12  7:57 UTC (permalink / raw)
  To: gcc-patches; +Cc: Stefan Schulze Frielinghaus

Comparisons between memory and constants might be done in a smaller mode
resulting in smaller constants which might finally end up as immediates
instead of in the literal pool.

For example, on s390x a non-symmetric comparison like
  x <= 0x3fffffffffffffff
results in the constant being spilled to the literal pool and an 8 byte
memory comparison is emitted.  Ideally, an equivalent comparison
  x0 <= 0x3f
where x0 is the most significant byte of x, is emitted where the
constant is smaller and more likely to materialize as an immediate.

Similarly, comparisons of the form
  x >= 0x4000000000000000
can be shortened into x0 >= 0x40.

I'm not entirely sure whether combine is the right place to implement
something like this.  In my first try I implemented it in
TARGET_CANONICALIZE_COMPARISON but then thought other targets might
profit from it, too.  simplify_context::simplify_relational_operation_1
seems to be the wrong place since code/mode may change.  Any opinions?

gcc/ChangeLog:

	* combine.cc (simplify_compare_const): Narrow comparison of
	memory and constant.
	(try_combine): Adapt new function signature.
	(simplify_comparison): Adapt new function signature.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/cmp-mem-const-1.c: New test.
	* gcc.target/s390/cmp-mem-const-2.c: New test.
---
 gcc/combine.cc                                | 82 ++++++++++++++-
 .../gcc.target/s390/cmp-mem-const-1.c         | 99 +++++++++++++++++++
 .../gcc.target/s390/cmp-mem-const-2.c         | 23 +++++
 3 files changed, 200 insertions(+), 4 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
 create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-2.c

diff --git a/gcc/combine.cc b/gcc/combine.cc
index 5aa0ec5c45a..6ad1600dc1b 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx,
 static int recog_for_combine (rtx *, rtx_insn *, rtx *);
 static rtx gen_lowpart_for_combine (machine_mode, rtx);
 static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode,
-					     rtx, rtx *);
+					     rtx *, rtx *);
 static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *);
 static void update_table_tick (rtx);
 static void record_value_for_reg (rtx, rtx_insn *, rtx);
@@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
 	  compare_code = orig_compare_code = GET_CODE (*cc_use_loc);
 	  if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode))
 	    compare_code = simplify_compare_const (compare_code, mode,
-						   op0, &op1);
+						   &op0, &op1);
 	  target_canonicalize_comparison (&compare_code, &op0, &op1, 1);
 	}
 
@@ -11800,9 +11800,10 @@ gen_lowpart_for_combine (machine_mode omode, rtx x)
 
 static enum rtx_code
 simplify_compare_const (enum rtx_code code, machine_mode mode,
-			rtx op0, rtx *pop1)
+			rtx *pop0, rtx *pop1)
 {
   scalar_int_mode int_mode;
+  rtx op0 = *pop0;
   HOST_WIDE_INT const_op = INTVAL (*pop1);
 
   /* Get the constant we are comparing against and turn off all bits
@@ -11987,6 +11988,79 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
       break;
     }
 
+  /* Narrow non-symmetric comparison of memory and constant as e.g.
+     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
+     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
+     x0 >= 0x40.  */
+  if ((code == LEU || code == LTU || code == GEU || code == GTU)
+      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
+      && MEM_P (op0)
+      && !MEM_VOLATILE_P (op0)
+      && (unsigned HOST_WIDE_INT)const_op > 0xff)
+    {
+      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT)const_op;
+      enum rtx_code adjusted_code = code;
+
+      /* If the least significant bit is already zero, then adjust the
+	 comparison in the hope that we hit cases like
+	   op0  <= 0x3ffffdfffffffffe
+	 where the adjusted comparison
+	   op0  <  0x3ffffdffffffffff
+	 can be shortened into
+	   op0' <  0x3ffffd.  */
+      if (code == LEU && (n & 1) == 0)
+	{
+	  ++n;
+	  adjusted_code = LTU;
+	}
+      /* or e.g. op0 < 0x4020000000000000  */
+      else if (code == LTU && (n & 1) == 0)
+	{
+	  --n;
+	  adjusted_code = LEU;
+	}
+      /* or op0 >= 0x4000000000000001  */
+      else if (code == GEU && (n & 1) == 1)
+	{
+	  --n;
+	  adjusted_code = GTU;
+	}
+      /* or op0 > 0x3fffffffffffffff.  */
+      else if (code == GTU && (n & 1) == 1)
+	{
+	  ++n;
+	  adjusted_code = GEU;
+	}
+
+      scalar_int_mode narrow_mode_iter;
+      bool lower_p = code == LEU || code == LTU;
+      bool greater_p = !lower_p;
+      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
+	{
+	  unsigned nbits = GET_MODE_PRECISION (int_mode)
+			  - GET_MODE_PRECISION (narrow_mode_iter);
+	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
+	  unsigned HOST_WIDE_INT lower_bits = n & mask;
+	  if ((lower_p && lower_bits == mask)
+	      || (greater_p && lower_bits == 0))
+	    {
+	      n >>= nbits;
+	      break;
+	    }
+	}
+
+      if (narrow_mode_iter < int_mode)
+	{
+	  poly_int64 offset = BYTES_BIG_ENDIAN
+				? 0
+				: GET_MODE_SIZE (int_mode)
+				  - GET_MODE_SIZE (narrow_mode_iter);
+	  *pop0 = adjust_address (op0, narrow_mode_iter, offset);
+	  *pop1 = GEN_INT (n);
+	  return adjusted_code;
+	}
+    }
+
   *pop1 = GEN_INT (const_op);
   return code;
 }
@@ -12179,7 +12253,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1)
 
       /* Try to simplify the compare to constant, possibly changing the
 	 comparison op, and/or changing op1 to zero.  */
-      code = simplify_compare_const (code, raw_mode, op0, &op1);
+      code = simplify_compare_const (code, raw_mode, &op0, &op1);
       const_op = INTVAL (op1);
 
       /* Compute some predicates to simplify code below.  */
diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
new file mode 100644
index 00000000000..b90c2a8c224
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
@@ -0,0 +1,99 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -march=z13 -mzarch" } */
+/* { dg-final { scan-assembler-not {\tclc\t} } } */
+
+int
+ge_8b (unsigned long *x)
+{
+  return *x >= 0x4000000000000000;
+}
+
+int
+ge_8b_adjust (unsigned long *x)
+{
+  return *x >= 0x4000000000000001;
+}
+
+int
+ge_6b (unsigned long *x)
+{
+  return *x >= 0x400000000000;
+}
+
+int
+ge_6b_adjust (unsigned long *x)
+{
+  return *x >= 0x400000000001;
+}
+
+int
+gt_8b (unsigned long *x)
+{
+  return *x > 0x4000000000000000;
+}
+
+int
+gt_8b_adjust (unsigned long *x)
+{
+  return *x > 0x3fffffffffffffff;
+}
+
+int
+gt_6b (unsigned long *x)
+{
+  return *x > 0x400000000000;
+}
+
+int
+gt_6b_adjust (unsigned long *x)
+{
+  return *x > 0x3fffffffffff;
+}
+
+int
+le_8b (unsigned long *x)
+{
+  return *x <= 0x3ff7ffffffffffff;
+}
+
+int
+le_6b (unsigned long *x)
+{
+  return *x <= 0x3f7fffffffff;
+}
+
+int
+le_8b_adjust (unsigned long *x)
+{
+  return *x <= 0x3ffffdfffffffffe;
+}
+
+int
+le_6b_adjust (unsigned long *x)
+{
+  return *x <= 0x3ffffffffffe;
+}
+
+int
+lt_8b (unsigned long *x)
+{
+  return *x < 0x3fffffffffffffff;
+}
+
+int
+lt_6b (unsigned long *x)
+{
+  return *x < 0x3fffffffffff;
+}
+
+int
+lt_8b_adjust (unsigned long *x)
+{
+  return *x < 0x4020000000000000;
+}
+
+int
+lt_6b_adjust (unsigned long *x)
+{
+  return *x < 0x400000000000;
+}
diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-2.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-2.c
new file mode 100644
index 00000000000..e3951870d94
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-2.c
@@ -0,0 +1,23 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -march=z13 -mzarch" } */
+/* { dg-final { scan-assembler-not {\tclc\t} } } */
+
+struct s
+{
+  long a;
+  unsigned b : 1;
+  unsigned c : 1;
+};
+
+int foo (struct s *x)
+{
+  /* Expression
+       x->b || x->c
+     is transformed into
+       _1 = BIT_FIELD_REF <*x_4(D), 64, 64>;
+       _2 = _1 > 0x3FFFFFFFFFFFFFFF;
+     where the constant may materialize in the literal pool and an 8 byte CLC
+     may be emitted.  Ensure this is not the case.
+  */
+  return x->b || x->c;
+}
-- 
2.39.2


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] combine: Narrow comparison of memory and constant
  2023-06-12  7:57 [PATCH] combine: Narrow comparison of memory and constant Stefan Schulze Frielinghaus
@ 2023-06-12 21:29 ` Jeff Law
  2023-06-19 14:19   ` Stefan Schulze Frielinghaus
  0 siblings, 1 reply; 11+ messages in thread
From: Jeff Law @ 2023-06-12 21:29 UTC (permalink / raw)
  To: Stefan Schulze Frielinghaus, gcc-patches



On 6/12/23 01:57, Stefan Schulze Frielinghaus via Gcc-patches wrote:
> Comparisons between memory and constants might be done in a smaller mode
> resulting in smaller constants which might finally end up as immediates
> instead of in the literal pool.
> 
> For example, on s390x a non-symmetric comparison like
>    x <= 0x3fffffffffffffff
> results in the constant being spilled to the literal pool and an 8 byte
> memory comparison is emitted.  Ideally, an equivalent comparison
>    x0 <= 0x3f
> where x0 is the most significant byte of x, is emitted where the
> constant is smaller and more likely to materialize as an immediate.
> 
> Similarly, comparisons of the form
>    x >= 0x4000000000000000
> can be shortened into x0 >= 0x40.
> 
> I'm not entirely sure whether combine is the right place to implement
> something like this.  In my first try I implemented it in
> TARGET_CANONICALIZE_COMPARISON but then thought other targets might
> profit from it, too.  simplify_context::simplify_relational_operation_1
> seems to be the wrong place since code/mode may change.  Any opinions?
> 
> gcc/ChangeLog:
> 
> 	* combine.cc (simplify_compare_const): Narrow comparison of
> 	memory and constant.
> 	(try_combine): Adapt new function signature.
> 	(simplify_comparison): Adapt new function signature.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/s390/cmp-mem-const-1.c: New test.
> 	* gcc.target/s390/cmp-mem-const-2.c: New test.
This does seem more general than we'd want to do in the canonicalization 
hook.  So thanks for going the extra mile and doing a generic 
implementation.




> @@ -11987,6 +11988,79 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
>         break;
>       }
>   
> +  /* Narrow non-symmetric comparison of memory and constant as e.g.
> +     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
> +     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
> +     x0 >= 0x40.  */
> +  if ((code == LEU || code == LTU || code == GEU || code == GTU)
> +      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
> +      && MEM_P (op0)
> +      && !MEM_VOLATILE_P (op0)
> +      && (unsigned HOST_WIDE_INT)const_op > 0xff)
> +    {
> +      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT)const_op;
> +      enum rtx_code adjusted_code = code;
> +
> +      /* If the least significant bit is already zero, then adjust the
> +	 comparison in the hope that we hit cases like
> +	   op0  <= 0x3ffffdfffffffffe
> +	 where the adjusted comparison
> +	   op0  <  0x3ffffdffffffffff
> +	 can be shortened into
> +	   op0' <  0x3ffffd.  */
> +      if (code == LEU && (n & 1) == 0)
> +	{
> +	  ++n;
> +	  adjusted_code = LTU;
> +	}
> +      /* or e.g. op0 < 0x4020000000000000  */
> +      else if (code == LTU && (n & 1) == 0)
> +	{
> +	  --n;
> +	  adjusted_code = LEU;
> +	}
> +      /* or op0 >= 0x4000000000000001  */
> +      else if (code == GEU && (n & 1) == 1)
> +	{
> +	  --n;
> +	  adjusted_code = GTU;
> +	}
> +      /* or op0 > 0x3fffffffffffffff.  */
> +      else if (code == GTU && (n & 1) == 1)
> +	{
> +	  ++n;
> +	  adjusted_code = GEU;
> +	}
> +
> +      scalar_int_mode narrow_mode_iter;
> +      bool lower_p = code == LEU || code == LTU;
> +      bool greater_p = !lower_p;
> +      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
> +	{
> +	  unsigned nbits = GET_MODE_PRECISION (int_mode)
> +			  - GET_MODE_PRECISION (narrow_mode_iter);
> +	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
> +	  unsigned HOST_WIDE_INT lower_bits = n & mask;
> +	  if ((lower_p && lower_bits == mask)
> +	      || (greater_p && lower_bits == 0))
> +	    {
> +	      n >>= nbits;
> +	      break;
> +	    }
> +	}
> +
> +      if (narrow_mode_iter < int_mode)
> +	{
> +	  poly_int64 offset = BYTES_BIG_ENDIAN
> +				? 0
> +				: GET_MODE_SIZE (int_mode)
> +				  - GET_MODE_SIZE (narrow_mode_iter);
Go ahead and add some parenthesis here.  I'd add one pair around the 
whole RHS of that assignment.  The '?' and ':' would line up under the 
'B' in that case.  Similarly add them around the false arm of the 
ternary.  The '-' will line up under the 'G'.

Going to trust you got the little endian adjustment correct here ;-)


>   
>         /* Compute some predicates to simplify code below.  */
> diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> new file mode 100644
> index 00000000000..b90c2a8c224
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> @@ -0,0 +1,99 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -march=z13 -mzarch" } */
> +/* { dg-final { scan-assembler-not {\tclc\t} } } */
> +
> +int
> +ge_8b (unsigned long *x)
> +{
> +  return *x >= 0x4000000000000000;
> +}
Would it be possible to add some debugging output in 
simplify_compare_const so that you could search for that debugging 
output and make these tests generic?

Jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] combine: Narrow comparison of memory and constant
  2023-06-12 21:29 ` Jeff Law
@ 2023-06-19 14:19   ` Stefan Schulze Frielinghaus
  2023-06-19 14:23     ` [PATCH v2] " Stefan Schulze Frielinghaus
  0 siblings, 1 reply; 11+ messages in thread
From: Stefan Schulze Frielinghaus @ 2023-06-19 14:19 UTC (permalink / raw)
  To: Jeff Law; +Cc: gcc-patches

On Mon, Jun 12, 2023 at 03:29:00PM -0600, Jeff Law wrote:
> 
> 
> On 6/12/23 01:57, Stefan Schulze Frielinghaus via Gcc-patches wrote:
> > Comparisons between memory and constants might be done in a smaller mode
> > resulting in smaller constants which might finally end up as immediates
> > instead of in the literal pool.
> > 
> > For example, on s390x a non-symmetric comparison like
> >    x <= 0x3fffffffffffffff
> > results in the constant being spilled to the literal pool and an 8 byte
> > memory comparison is emitted.  Ideally, an equivalent comparison
> >    x0 <= 0x3f
> > where x0 is the most significant byte of x, is emitted where the
> > constant is smaller and more likely to materialize as an immediate.
> > 
> > Similarly, comparisons of the form
> >    x >= 0x4000000000000000
> > can be shortened into x0 >= 0x40.
> > 
> > I'm not entirely sure whether combine is the right place to implement
> > something like this.  In my first try I implemented it in
> > TARGET_CANONICALIZE_COMPARISON but then thought other targets might
> > profit from it, too.  simplify_context::simplify_relational_operation_1
> > seems to be the wrong place since code/mode may change.  Any opinions?
> > 
> > gcc/ChangeLog:
> > 
> > 	* combine.cc (simplify_compare_const): Narrow comparison of
> > 	memory and constant.
> > 	(try_combine): Adapt new function signature.
> > 	(simplify_comparison): Adapt new function signature.
> > 
> > gcc/testsuite/ChangeLog:
> > 
> > 	* gcc.target/s390/cmp-mem-const-1.c: New test.
> > 	* gcc.target/s390/cmp-mem-const-2.c: New test.
> This does seem more general than we'd want to do in the canonicalization
> hook.  So thanks for going the extra mile and doing a generic
> implementation.
> 
> 
> 
> 
> > @@ -11987,6 +11988,79 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
> >         break;
> >       }
> > +  /* Narrow non-symmetric comparison of memory and constant as e.g.
> > +     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
> > +     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
> > +     x0 >= 0x40.  */
> > +  if ((code == LEU || code == LTU || code == GEU || code == GTU)
> > +      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
> > +      && MEM_P (op0)
> > +      && !MEM_VOLATILE_P (op0)
> > +      && (unsigned HOST_WIDE_INT)const_op > 0xff)
> > +    {
> > +      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT)const_op;
> > +      enum rtx_code adjusted_code = code;
> > +
> > +      /* If the least significant bit is already zero, then adjust the
> > +	 comparison in the hope that we hit cases like
> > +	   op0  <= 0x3ffffdfffffffffe
> > +	 where the adjusted comparison
> > +	   op0  <  0x3ffffdffffffffff
> > +	 can be shortened into
> > +	   op0' <  0x3ffffd.  */
> > +      if (code == LEU && (n & 1) == 0)
> > +	{
> > +	  ++n;
> > +	  adjusted_code = LTU;
> > +	}
> > +      /* or e.g. op0 < 0x4020000000000000  */
> > +      else if (code == LTU && (n & 1) == 0)
> > +	{
> > +	  --n;
> > +	  adjusted_code = LEU;
> > +	}
> > +      /* or op0 >= 0x4000000000000001  */
> > +      else if (code == GEU && (n & 1) == 1)
> > +	{
> > +	  --n;
> > +	  adjusted_code = GTU;
> > +	}
> > +      /* or op0 > 0x3fffffffffffffff.  */
> > +      else if (code == GTU && (n & 1) == 1)
> > +	{
> > +	  ++n;
> > +	  adjusted_code = GEU;
> > +	}
> > +
> > +      scalar_int_mode narrow_mode_iter;
> > +      bool lower_p = code == LEU || code == LTU;
> > +      bool greater_p = !lower_p;
> > +      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
> > +	{
> > +	  unsigned nbits = GET_MODE_PRECISION (int_mode)
> > +			  - GET_MODE_PRECISION (narrow_mode_iter);
> > +	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
> > +	  unsigned HOST_WIDE_INT lower_bits = n & mask;
> > +	  if ((lower_p && lower_bits == mask)
> > +	      || (greater_p && lower_bits == 0))
> > +	    {
> > +	      n >>= nbits;
> > +	      break;
> > +	    }
> > +	}
> > +
> > +      if (narrow_mode_iter < int_mode)
> > +	{
> > +	  poly_int64 offset = BYTES_BIG_ENDIAN
> > +				? 0
> > +				: GET_MODE_SIZE (int_mode)
> > +				  - GET_MODE_SIZE (narrow_mode_iter);
> Go ahead and add some parenthesis here.  I'd add one pair around the whole
> RHS of that assignment.  The '?' and ':' would line up under the 'B' in that
> case.  Similarly add them around the false arm of the ternary.  The '-' will
> line up under the 'G'.

Done.

> 
> Going to trust you got the little endian adjustment correct here ;-)

Sadly I gave it a try on x64, aarch64, and powerpc64le and in all cases
the resulting instructions were rejected either because the costs were
higher or because the new instructions failed to match.  Thus currently I
have tested this only thoroughly on s390x.

> 
> 
> >         /* Compute some predicates to simplify code below.  */
> > diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> > new file mode 100644
> > index 00000000000..b90c2a8c224
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> > @@ -0,0 +1,99 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -march=z13 -mzarch" } */
> > +/* { dg-final { scan-assembler-not {\tclc\t} } } */
> > +
> > +int
> > +ge_8b (unsigned long *x)
> > +{
> > +  return *x >= 0x4000000000000000;
> > +}
> Would it be possible to add some debugging output in simplify_compare_const
> so that you could search for that debugging output and make these tests
> generic?

I very much like this idea, i.e., made those tests generic by adding
some debug output.

Beside that I cleaned up the code a bit and will send a V2 shortly.

Cheers,
Stefan

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH v2] combine: Narrow comparison of memory and constant
  2023-06-19 14:19   ` Stefan Schulze Frielinghaus
@ 2023-06-19 14:23     ` Stefan Schulze Frielinghaus
  2023-07-31 13:26       ` Stefan Schulze Frielinghaus
                         ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Stefan Schulze Frielinghaus @ 2023-06-19 14:23 UTC (permalink / raw)
  To: gcc-patches; +Cc: Stefan Schulze Frielinghaus

Comparisons between memory and constants might be done in a smaller mode
resulting in smaller constants which might finally end up as immediates
instead of in the literal pool.

For example, on s390x a non-symmetric comparison like
  x <= 0x3fffffffffffffff
results in the constant being spilled to the literal pool and an 8 byte
memory comparison is emitted.  Ideally, an equivalent comparison
  x0 <= 0x3f
where x0 is the most significant byte of x, is emitted where the
constant is smaller and more likely to materialize as an immediate.

Similarly, comparisons of the form
  x >= 0x4000000000000000
can be shortened into x0 >= 0x40.

Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
Note, the new tests show that for the mentioned little-endian targets
the optimization does not materialize since either the costs of the new
instructions are higher or they do not match.  Still ok for mainline?

gcc/ChangeLog:

	* combine.cc (simplify_compare_const): Narrow comparison of
	memory and constant.
	(try_combine): Adapt new function signature.
	(simplify_comparison): Adapt new function signature.

gcc/testsuite/ChangeLog:

	* gcc.dg/cmp-mem-const-1.c: New test.
	* gcc.dg/cmp-mem-const-2.c: New test.
	* gcc.dg/cmp-mem-const-3.c: New test.
	* gcc.dg/cmp-mem-const-4.c: New test.
	* gcc.dg/cmp-mem-const-5.c: New test.
	* gcc.dg/cmp-mem-const-6.c: New test.
	* gcc.target/s390/cmp-mem-const-1.c: New test.
---
 gcc/combine.cc                                | 79 +++++++++++++++++--
 gcc/testsuite/gcc.dg/cmp-mem-const-1.c        | 17 ++++
 gcc/testsuite/gcc.dg/cmp-mem-const-2.c        | 17 ++++
 gcc/testsuite/gcc.dg/cmp-mem-const-3.c        | 17 ++++
 gcc/testsuite/gcc.dg/cmp-mem-const-4.c        | 17 ++++
 gcc/testsuite/gcc.dg/cmp-mem-const-5.c        | 17 ++++
 gcc/testsuite/gcc.dg/cmp-mem-const-6.c        | 17 ++++
 .../gcc.target/s390/cmp-mem-const-1.c         | 24 ++++++
 8 files changed, 200 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-1.c
 create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-2.c
 create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-3.c
 create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-4.c
 create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-5.c
 create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-6.c
 create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c

diff --git a/gcc/combine.cc b/gcc/combine.cc
index 5aa0ec5c45a..56e15a93409 100644
--- a/gcc/combine.cc
+++ b/gcc/combine.cc
@@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx,
 static int recog_for_combine (rtx *, rtx_insn *, rtx *);
 static rtx gen_lowpart_for_combine (machine_mode, rtx);
 static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode,
-					     rtx, rtx *);
+					     rtx *, rtx *);
 static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *);
 static void update_table_tick (rtx);
 static void record_value_for_reg (rtx, rtx_insn *, rtx);
@@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
 	  compare_code = orig_compare_code = GET_CODE (*cc_use_loc);
 	  if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode))
 	    compare_code = simplify_compare_const (compare_code, mode,
-						   op0, &op1);
+						   &op0, &op1);
 	  target_canonicalize_comparison (&compare_code, &op0, &op1, 1);
 	}
 
@@ -11796,13 +11796,14 @@ gen_lowpart_for_combine (machine_mode omode, rtx x)
    (CODE OP0 const0_rtx) form.
 
    The result is a possibly different comparison code to use.
-   *POP1 may be updated.  */
+   *POP0 and *POP1 may be updated.  */
 
 static enum rtx_code
 simplify_compare_const (enum rtx_code code, machine_mode mode,
-			rtx op0, rtx *pop1)
+			rtx *pop0, rtx *pop1)
 {
   scalar_int_mode int_mode;
+  rtx op0 = *pop0;
   HOST_WIDE_INT const_op = INTVAL (*pop1);
 
   /* Get the constant we are comparing against and turn off all bits
@@ -11987,6 +11988,74 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
       break;
     }
 
+  /* Narrow non-symmetric comparison of memory and constant as e.g.
+     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
+     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
+     x0 >= 0x40.  */
+  if ((code == LEU || code == LTU || code == GEU || code == GTU)
+      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
+      && MEM_P (op0)
+      && !MEM_VOLATILE_P (op0)
+      /* The optimization makes only sense for constants which are big enough
+	 so that we have a chance to chop off something at all.  */
+      && (unsigned HOST_WIDE_INT) const_op > 0xff
+      /* Ensure that we do not overflow during normalization.  */
+      && (code != GTU || (unsigned HOST_WIDE_INT) const_op < HOST_WIDE_INT_M1U))
+    {
+      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT) const_op;
+      enum rtx_code adjusted_code;
+
+      /* Normalize code to either LEU or GEU.  */
+      if (code == LTU)
+	{
+	  --n;
+	  adjusted_code = LEU;
+	}
+      else if (code == GTU)
+	{
+	  ++n;
+	  adjusted_code = GEU;
+	}
+      else
+	adjusted_code = code;
+
+      scalar_int_mode narrow_mode_iter;
+      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
+	{
+	  unsigned nbits = GET_MODE_PRECISION (int_mode)
+			   - GET_MODE_PRECISION (narrow_mode_iter);
+	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
+	  unsigned HOST_WIDE_INT lower_bits = n & mask;
+	  if ((adjusted_code == LEU && lower_bits == mask)
+	      || (adjusted_code == GEU && lower_bits == 0))
+	    {
+	      n >>= nbits;
+	      break;
+	    }
+	}
+
+      if (narrow_mode_iter < int_mode)
+	{
+	  if (dump_file && (dump_flags & TDF_DETAILS))
+	    {
+	      fprintf (
+		dump_file, "narrow comparison from mode %s to %s: (MEM %s "
+		HOST_WIDE_INT_PRINT_HEX ") to (MEM %s "
+		HOST_WIDE_INT_PRINT_HEX ").\n", GET_MODE_NAME (int_mode),
+		GET_MODE_NAME (narrow_mode_iter), GET_RTX_NAME (code),
+		(unsigned HOST_WIDE_INT)const_op, GET_RTX_NAME (adjusted_code),
+		n);
+	    }
+	  poly_int64 offset = (BYTES_BIG_ENDIAN
+			       ? 0
+			       : (GET_MODE_SIZE (int_mode)
+				  - GET_MODE_SIZE (narrow_mode_iter)));
+	  *pop0 = adjust_address_nv (op0, narrow_mode_iter, offset);
+	  *pop1 = GEN_INT (n);
+	  return adjusted_code;
+	}
+    }
+
   *pop1 = GEN_INT (const_op);
   return code;
 }
@@ -12179,7 +12248,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1)
 
       /* Try to simplify the compare to constant, possibly changing the
 	 comparison op, and/or changing op1 to zero.  */
-      code = simplify_compare_const (code, raw_mode, op0, &op1);
+      code = simplify_compare_const (code, raw_mode, &op0, &op1);
       const_op = INTVAL (op1);
 
       /* Compute some predicates to simplify code below.  */
diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-1.c b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c
new file mode 100644
index 00000000000..263ad98af79
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -fdump-rtl-combine-details" } */
+/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
+
+typedef __UINT64_TYPE__ uint64_t;
+
+int
+le_1byte_a (uint64_t *x)
+{
+  return *x <= 0x3fffffffffffffff;
+}
+
+int
+le_1byte_b (uint64_t *x)
+{
+  return *x < 0x4000000000000000;
+}
diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-2.c b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c
new file mode 100644
index 00000000000..a7cc5348295
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -fdump-rtl-combine-details" } */
+/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
+
+typedef __UINT64_TYPE__ uint64_t;
+
+int
+ge_1byte_a (uint64_t *x)
+{
+  return *x > 0x3fffffffffffffff;
+}
+
+int
+ge_1byte_b (uint64_t *x)
+{
+  return *x >= 0x4000000000000000;
+}
diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-3.c b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c
new file mode 100644
index 00000000000..06f80bf72d8
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -fdump-rtl-combine-details" } */
+/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */
+
+typedef __UINT64_TYPE__ uint64_t;
+
+int
+le_2bytes_a (uint64_t *x)
+{
+  return *x <= 0x3ffdffffffffffff;
+}
+
+int
+le_2bytes_b (uint64_t *x)
+{
+  return *x < 0x3ffe000000000000;
+}
diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-4.c b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c
new file mode 100644
index 00000000000..407999abf7e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -fdump-rtl-combine-details" } */
+/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */
+
+typedef __UINT64_TYPE__ uint64_t;
+
+int
+ge_2bytes_a (uint64_t *x)
+{
+  return *x > 0x400cffffffffffff;
+}
+
+int
+ge_2bytes_b (uint64_t *x)
+{
+  return *x >= 0x400d000000000000;
+}
diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-5.c b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c
new file mode 100644
index 00000000000..e16773f5bcf
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -fdump-rtl-combine-details" } */
+/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */
+
+typedef __UINT64_TYPE__ uint64_t;
+
+int
+le_4bytes_a (uint64_t *x)
+{
+  return *x <= 0x3ffffdffffffffff;
+}
+
+int
+le_4bytes_b (uint64_t *x)
+{
+  return *x < 0x3ffffe0000000000;
+}
diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-6.c b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c
new file mode 100644
index 00000000000..8f53b5678bd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -fdump-rtl-combine-details" } */
+/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */
+
+typedef __UINT64_TYPE__ uint64_t;
+
+int
+ge_4bytes_a (uint64_t *x)
+{
+  return *x > 0x4000cfffffffffff;
+}
+
+int
+ge_4bytes_b (uint64_t *x)
+{
+  return *x >= 0x4000d00000000000;
+}
diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
new file mode 100644
index 00000000000..309aafbec01
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
@@ -0,0 +1,24 @@
+/* { dg-do compile { target { lp64 } } } */
+/* { dg-options "-O1 -march=z13 -mzarch -fdump-rtl-combine-details" } */
+/* { dg-final { scan-assembler-not {\tclc\t} } } */
+/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
+
+struct s
+{
+  long a;
+  unsigned b : 1;
+  unsigned c : 1;
+};
+
+int foo (struct s *x)
+{
+  /* Expression
+       x->b || x->c
+     is transformed into
+       _1 = BIT_FIELD_REF <*x_4(D), 64, 64>;
+       _2 = _1 > 0x3FFFFFFFFFFFFFFF;
+     where the constant may materialize in the literal pool and an 8 byte CLC
+     may be emitted.  Ensure this is not the case.
+  */
+  return x->b || x->c;
+}
-- 
2.39.2


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] combine: Narrow comparison of memory and constant
  2023-06-19 14:23     ` [PATCH v2] " Stefan Schulze Frielinghaus
@ 2023-07-31 13:26       ` Stefan Schulze Frielinghaus
  2023-07-31 13:59       ` Jeff Law
  2023-07-31 21:43       ` Prathamesh Kulkarni
  2 siblings, 0 replies; 11+ messages in thread
From: Stefan Schulze Frielinghaus @ 2023-07-31 13:26 UTC (permalink / raw)
  To: gcc-patches

ping

On Mon, Jun 19, 2023 at 04:23:57PM +0200, Stefan Schulze Frielinghaus wrote:
> Comparisons between memory and constants might be done in a smaller mode
> resulting in smaller constants which might finally end up as immediates
> instead of in the literal pool.
> 
> For example, on s390x a non-symmetric comparison like
>   x <= 0x3fffffffffffffff
> results in the constant being spilled to the literal pool and an 8 byte
> memory comparison is emitted.  Ideally, an equivalent comparison
>   x0 <= 0x3f
> where x0 is the most significant byte of x, is emitted where the
> constant is smaller and more likely to materialize as an immediate.
> 
> Similarly, comparisons of the form
>   x >= 0x4000000000000000
> can be shortened into x0 >= 0x40.
> 
> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> Note, the new tests show that for the mentioned little-endian targets
> the optimization does not materialize since either the costs of the new
> instructions are higher or they do not match.  Still ok for mainline?
> 
> gcc/ChangeLog:
> 
> 	* combine.cc (simplify_compare_const): Narrow comparison of
> 	memory and constant.
> 	(try_combine): Adapt new function signature.
> 	(simplify_comparison): Adapt new function signature.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/cmp-mem-const-1.c: New test.
> 	* gcc.dg/cmp-mem-const-2.c: New test.
> 	* gcc.dg/cmp-mem-const-3.c: New test.
> 	* gcc.dg/cmp-mem-const-4.c: New test.
> 	* gcc.dg/cmp-mem-const-5.c: New test.
> 	* gcc.dg/cmp-mem-const-6.c: New test.
> 	* gcc.target/s390/cmp-mem-const-1.c: New test.
> ---
>  gcc/combine.cc                                | 79 +++++++++++++++++--
>  gcc/testsuite/gcc.dg/cmp-mem-const-1.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-2.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-3.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-4.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-5.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-6.c        | 17 ++++
>  .../gcc.target/s390/cmp-mem-const-1.c         | 24 ++++++
>  8 files changed, 200 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-4.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-5.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-6.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> 
> diff --git a/gcc/combine.cc b/gcc/combine.cc
> index 5aa0ec5c45a..56e15a93409 100644
> --- a/gcc/combine.cc
> +++ b/gcc/combine.cc
> @@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx,
>  static int recog_for_combine (rtx *, rtx_insn *, rtx *);
>  static rtx gen_lowpart_for_combine (machine_mode, rtx);
>  static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode,
> -					     rtx, rtx *);
> +					     rtx *, rtx *);
>  static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *);
>  static void update_table_tick (rtx);
>  static void record_value_for_reg (rtx, rtx_insn *, rtx);
> @@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>  	  compare_code = orig_compare_code = GET_CODE (*cc_use_loc);
>  	  if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode))
>  	    compare_code = simplify_compare_const (compare_code, mode,
> -						   op0, &op1);
> +						   &op0, &op1);
>  	  target_canonicalize_comparison (&compare_code, &op0, &op1, 1);
>  	}
>  
> @@ -11796,13 +11796,14 @@ gen_lowpart_for_combine (machine_mode omode, rtx x)
>     (CODE OP0 const0_rtx) form.
>  
>     The result is a possibly different comparison code to use.
> -   *POP1 may be updated.  */
> +   *POP0 and *POP1 may be updated.  */
>  
>  static enum rtx_code
>  simplify_compare_const (enum rtx_code code, machine_mode mode,
> -			rtx op0, rtx *pop1)
> +			rtx *pop0, rtx *pop1)
>  {
>    scalar_int_mode int_mode;
> +  rtx op0 = *pop0;
>    HOST_WIDE_INT const_op = INTVAL (*pop1);
>  
>    /* Get the constant we are comparing against and turn off all bits
> @@ -11987,6 +11988,74 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
>        break;
>      }
>  
> +  /* Narrow non-symmetric comparison of memory and constant as e.g.
> +     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
> +     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
> +     x0 >= 0x40.  */
> +  if ((code == LEU || code == LTU || code == GEU || code == GTU)
> +      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
> +      && MEM_P (op0)
> +      && !MEM_VOLATILE_P (op0)
> +      /* The optimization makes only sense for constants which are big enough
> +	 so that we have a chance to chop off something at all.  */
> +      && (unsigned HOST_WIDE_INT) const_op > 0xff
> +      /* Ensure that we do not overflow during normalization.  */
> +      && (code != GTU || (unsigned HOST_WIDE_INT) const_op < HOST_WIDE_INT_M1U))
> +    {
> +      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT) const_op;
> +      enum rtx_code adjusted_code;
> +
> +      /* Normalize code to either LEU or GEU.  */
> +      if (code == LTU)
> +	{
> +	  --n;
> +	  adjusted_code = LEU;
> +	}
> +      else if (code == GTU)
> +	{
> +	  ++n;
> +	  adjusted_code = GEU;
> +	}
> +      else
> +	adjusted_code = code;
> +
> +      scalar_int_mode narrow_mode_iter;
> +      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
> +	{
> +	  unsigned nbits = GET_MODE_PRECISION (int_mode)
> +			   - GET_MODE_PRECISION (narrow_mode_iter);
> +	  unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
> +	  unsigned HOST_WIDE_INT lower_bits = n & mask;
> +	  if ((adjusted_code == LEU && lower_bits == mask)
> +	      || (adjusted_code == GEU && lower_bits == 0))
> +	    {
> +	      n >>= nbits;
> +	      break;
> +	    }
> +	}
> +
> +      if (narrow_mode_iter < int_mode)
> +	{
> +	  if (dump_file && (dump_flags & TDF_DETAILS))
> +	    {
> +	      fprintf (
> +		dump_file, "narrow comparison from mode %s to %s: (MEM %s "
> +		HOST_WIDE_INT_PRINT_HEX ") to (MEM %s "
> +		HOST_WIDE_INT_PRINT_HEX ").\n", GET_MODE_NAME (int_mode),
> +		GET_MODE_NAME (narrow_mode_iter), GET_RTX_NAME (code),
> +		(unsigned HOST_WIDE_INT)const_op, GET_RTX_NAME (adjusted_code),
> +		n);
> +	    }
> +	  poly_int64 offset = (BYTES_BIG_ENDIAN
> +			       ? 0
> +			       : (GET_MODE_SIZE (int_mode)
> +				  - GET_MODE_SIZE (narrow_mode_iter)));
> +	  *pop0 = adjust_address_nv (op0, narrow_mode_iter, offset);
> +	  *pop1 = GEN_INT (n);
> +	  return adjusted_code;
> +	}
> +    }
> +
>    *pop1 = GEN_INT (const_op);
>    return code;
>  }
> @@ -12179,7 +12248,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1)
>  
>        /* Try to simplify the compare to constant, possibly changing the
>  	 comparison op, and/or changing op1 to zero.  */
> -      code = simplify_compare_const (code, raw_mode, op0, &op1);
> +      code = simplify_compare_const (code, raw_mode, &op0, &op1);
>        const_op = INTVAL (op1);
>  
>        /* Compute some predicates to simplify code below.  */
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-1.c b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c
> new file mode 100644
> index 00000000000..263ad98af79
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +le_1byte_a (uint64_t *x)
> +{
> +  return *x <= 0x3fffffffffffffff;
> +}
> +
> +int
> +le_1byte_b (uint64_t *x)
> +{
> +  return *x < 0x4000000000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-2.c b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c
> new file mode 100644
> index 00000000000..a7cc5348295
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +ge_1byte_a (uint64_t *x)
> +{
> +  return *x > 0x3fffffffffffffff;
> +}
> +
> +int
> +ge_1byte_b (uint64_t *x)
> +{
> +  return *x >= 0x4000000000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-3.c b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c
> new file mode 100644
> index 00000000000..06f80bf72d8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +le_2bytes_a (uint64_t *x)
> +{
> +  return *x <= 0x3ffdffffffffffff;
> +}
> +
> +int
> +le_2bytes_b (uint64_t *x)
> +{
> +  return *x < 0x3ffe000000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-4.c b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c
> new file mode 100644
> index 00000000000..407999abf7e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +ge_2bytes_a (uint64_t *x)
> +{
> +  return *x > 0x400cffffffffffff;
> +}
> +
> +int
> +ge_2bytes_b (uint64_t *x)
> +{
> +  return *x >= 0x400d000000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-5.c b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c
> new file mode 100644
> index 00000000000..e16773f5bcf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +le_4bytes_a (uint64_t *x)
> +{
> +  return *x <= 0x3ffffdffffffffff;
> +}
> +
> +int
> +le_4bytes_b (uint64_t *x)
> +{
> +  return *x < 0x3ffffe0000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-6.c b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c
> new file mode 100644
> index 00000000000..8f53b5678bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +ge_4bytes_a (uint64_t *x)
> +{
> +  return *x > 0x4000cfffffffffff;
> +}
> +
> +int
> +ge_4bytes_b (uint64_t *x)
> +{
> +  return *x >= 0x4000d00000000000;
> +}
> diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> new file mode 100644
> index 00000000000..309aafbec01
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -march=z13 -mzarch -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-assembler-not {\tclc\t} } } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> +
> +struct s
> +{
> +  long a;
> +  unsigned b : 1;
> +  unsigned c : 1;
> +};
> +
> +int foo (struct s *x)
> +{
> +  /* Expression
> +       x->b || x->c
> +     is transformed into
> +       _1 = BIT_FIELD_REF <*x_4(D), 64, 64>;
> +       _2 = _1 > 0x3FFFFFFFFFFFFFFF;
> +     where the constant may materialize in the literal pool and an 8 byte CLC
> +     may be emitted.  Ensure this is not the case.
> +  */
> +  return x->b || x->c;
> +}
> -- 
> 2.39.2
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] combine: Narrow comparison of memory and constant
  2023-06-19 14:23     ` [PATCH v2] " Stefan Schulze Frielinghaus
  2023-07-31 13:26       ` Stefan Schulze Frielinghaus
@ 2023-07-31 13:59       ` Jeff Law
  2023-07-31 21:43       ` Prathamesh Kulkarni
  2 siblings, 0 replies; 11+ messages in thread
From: Jeff Law @ 2023-07-31 13:59 UTC (permalink / raw)
  To: Stefan Schulze Frielinghaus, gcc-patches



On 6/19/23 08:23, Stefan Schulze Frielinghaus via Gcc-patches wrote:
> Comparisons between memory and constants might be done in a smaller mode
> resulting in smaller constants which might finally end up as immediates
> instead of in the literal pool.
> 
> For example, on s390x a non-symmetric comparison like
>    x <= 0x3fffffffffffffff
> results in the constant being spilled to the literal pool and an 8 byte
> memory comparison is emitted.  Ideally, an equivalent comparison
>    x0 <= 0x3f
> where x0 is the most significant byte of x, is emitted where the
> constant is smaller and more likely to materialize as an immediate.
> 
> Similarly, comparisons of the form
>    x >= 0x4000000000000000
> can be shortened into x0 >= 0x40.
> 
> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> Note, the new tests show that for the mentioned little-endian targets
> the optimization does not materialize since either the costs of the new
> instructions are higher or they do not match.  Still ok for mainline?
> 
> gcc/ChangeLog:
> 
> 	* combine.cc (simplify_compare_const): Narrow comparison of
> 	memory and constant.
> 	(try_combine): Adapt new function signature.
> 	(simplify_comparison): Adapt new function signature.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.dg/cmp-mem-const-1.c: New test.
> 	* gcc.dg/cmp-mem-const-2.c: New test.
> 	* gcc.dg/cmp-mem-const-3.c: New test.
> 	* gcc.dg/cmp-mem-const-4.c: New test.
> 	* gcc.dg/cmp-mem-const-5.c: New test.
> 	* gcc.dg/cmp-mem-const-6.c: New test.
> 	* gcc.target/s390/cmp-mem-const-1.c: New test.
Sorry.  I'd looked at this a while back, wanted to take another looksie 
and totally forgot about it.

OK for the trunk.  Thanks for your patience.

jeff


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] combine: Narrow comparison of memory and constant
  2023-06-19 14:23     ` [PATCH v2] " Stefan Schulze Frielinghaus
  2023-07-31 13:26       ` Stefan Schulze Frielinghaus
  2023-07-31 13:59       ` Jeff Law
@ 2023-07-31 21:43       ` Prathamesh Kulkarni
  2023-07-31 21:46         ` Prathamesh Kulkarni
  2023-07-31 23:50         ` Jeff Law
  2 siblings, 2 replies; 11+ messages in thread
From: Prathamesh Kulkarni @ 2023-07-31 21:43 UTC (permalink / raw)
  To: Stefan Schulze Frielinghaus; +Cc: gcc-patches

On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via
Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>
> Comparisons between memory and constants might be done in a smaller mode
> resulting in smaller constants which might finally end up as immediates
> instead of in the literal pool.
>
> For example, on s390x a non-symmetric comparison like
>   x <= 0x3fffffffffffffff
> results in the constant being spilled to the literal pool and an 8 byte
> memory comparison is emitted.  Ideally, an equivalent comparison
>   x0 <= 0x3f
> where x0 is the most significant byte of x, is emitted where the
> constant is smaller and more likely to materialize as an immediate.
>
> Similarly, comparisons of the form
>   x >= 0x4000000000000000
> can be shortened into x0 >= 0x40.
>
> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> Note, the new tests show that for the mentioned little-endian targets
> the optimization does not materialize since either the costs of the new
> instructions are higher or they do not match.  Still ok for mainline?
Hi Stefan,
Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c)
caused the following ICE on armv8l-unknown-linux-gnu:
during RTL pass: combine
../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’:
../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in
decompose, at rtl.h:2297
  210 | }
      | ^
0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode>
>::decompose(long long*, unsigned int, std::pair<rtx_def*,
machine_mode> const&)
        ../../gcc/gcc/rtl.h:2297
0xaf5ab3 wide_int_ref_storage<false,
true>::wide_int_ref_storage<std::pair<rtx_def*, machine_mode>
>(std::pair<rtx_def*, machine_mode> const&)
        ../../gcc/gcc/wide-int.h:1030
0xaf5023 generic_wide_int<wide_int_ref_storage<false, true>
>::generic_wide_int<std::pair<rtx_def*, machine_mode>
>(std::pair<rtx_def*, machine_mode> const&)
        ../../gcc/gcc/wide-int.h:788
0xf916f9 simplify_const_unary_operation(rtx_code, machine_mode,
rtx_def*, machine_mode)
        ../../gcc/gcc/simplify-rtx.cc:2131
0xf8bad5 simplify_context::simplify_unary_operation(rtx_code,
machine_mode, rtx_def*, machine_mode)
        ../../gcc/gcc/simplify-rtx.cc:889
0xf8a591 simplify_context::simplify_gen_unary(rtx_code, machine_mode,
rtx_def*, machine_mode)
        ../../gcc/gcc/simplify-rtx.cc:360
0x9bd1b7 simplify_gen_unary(rtx_code, machine_mode, rtx_def*, machine_mode)
        ../../gcc/gcc/rtl.h:3520
0x1bd5677 simplify_comparison
        ../../gcc/gcc/combine.cc:13125
0x1bc2b2b simplify_set
        ../../gcc/gcc/combine.cc:6848
0x1bc1647 combine_simplify_rtx
        ../../gcc/gcc/combine.cc:6353
0x1bbf97f subst
        ../../gcc/gcc/combine.cc:5609
0x1bb864b try_combine
        ../../gcc/gcc/combine.cc:3302
0x1bb30fb combine_instructions
        ../../gcc/gcc/combine.cc:1264
0x1bd8d25 rest_of_handle_combine
        ../../gcc/gcc/combine.cc:15059
0x1bd8dd5 execute
        ../../gcc/gcc/combine.cc:15103
Please submit a full bug report, with preprocessed source (by using
-freport-bug).
Please include the complete backtrace with any bug report.
See <https://gcc.gnu.org/bugs/> for instructions.

Could you please take a look ?

Thanks,
Prathamesh
>
> gcc/ChangeLog:
>
>         * combine.cc (simplify_compare_const): Narrow comparison of
>         memory and constant.
>         (try_combine): Adapt new function signature.
>         (simplify_comparison): Adapt new function signature.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.dg/cmp-mem-const-1.c: New test.
>         * gcc.dg/cmp-mem-const-2.c: New test.
>         * gcc.dg/cmp-mem-const-3.c: New test.
>         * gcc.dg/cmp-mem-const-4.c: New test.
>         * gcc.dg/cmp-mem-const-5.c: New test.
>         * gcc.dg/cmp-mem-const-6.c: New test.
>         * gcc.target/s390/cmp-mem-const-1.c: New test.
> ---
>  gcc/combine.cc                                | 79 +++++++++++++++++--
>  gcc/testsuite/gcc.dg/cmp-mem-const-1.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-2.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-3.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-4.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-5.c        | 17 ++++
>  gcc/testsuite/gcc.dg/cmp-mem-const-6.c        | 17 ++++
>  .../gcc.target/s390/cmp-mem-const-1.c         | 24 ++++++
>  8 files changed, 200 insertions(+), 5 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-1.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-2.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-3.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-4.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-5.c
>  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-6.c
>  create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
>
> diff --git a/gcc/combine.cc b/gcc/combine.cc
> index 5aa0ec5c45a..56e15a93409 100644
> --- a/gcc/combine.cc
> +++ b/gcc/combine.cc
> @@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx,
>  static int recog_for_combine (rtx *, rtx_insn *, rtx *);
>  static rtx gen_lowpart_for_combine (machine_mode, rtx);
>  static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode,
> -                                            rtx, rtx *);
> +                                            rtx *, rtx *);
>  static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *);
>  static void update_table_tick (rtx);
>  static void record_value_for_reg (rtx, rtx_insn *, rtx);
> @@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
>           compare_code = orig_compare_code = GET_CODE (*cc_use_loc);
>           if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode))
>             compare_code = simplify_compare_const (compare_code, mode,
> -                                                  op0, &op1);
> +                                                  &op0, &op1);
>           target_canonicalize_comparison (&compare_code, &op0, &op1, 1);
>         }
>
> @@ -11796,13 +11796,14 @@ gen_lowpart_for_combine (machine_mode omode, rtx x)
>     (CODE OP0 const0_rtx) form.
>
>     The result is a possibly different comparison code to use.
> -   *POP1 may be updated.  */
> +   *POP0 and *POP1 may be updated.  */
>
>  static enum rtx_code
>  simplify_compare_const (enum rtx_code code, machine_mode mode,
> -                       rtx op0, rtx *pop1)
> +                       rtx *pop0, rtx *pop1)
>  {
>    scalar_int_mode int_mode;
> +  rtx op0 = *pop0;
>    HOST_WIDE_INT const_op = INTVAL (*pop1);
>
>    /* Get the constant we are comparing against and turn off all bits
> @@ -11987,6 +11988,74 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
>        break;
>      }
>
> +  /* Narrow non-symmetric comparison of memory and constant as e.g.
> +     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
> +     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
> +     x0 >= 0x40.  */
> +  if ((code == LEU || code == LTU || code == GEU || code == GTU)
> +      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
> +      && MEM_P (op0)
> +      && !MEM_VOLATILE_P (op0)
> +      /* The optimization makes only sense for constants which are big enough
> +        so that we have a chance to chop off something at all.  */
> +      && (unsigned HOST_WIDE_INT) const_op > 0xff
> +      /* Ensure that we do not overflow during normalization.  */
> +      && (code != GTU || (unsigned HOST_WIDE_INT) const_op < HOST_WIDE_INT_M1U))
> +    {
> +      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT) const_op;
> +      enum rtx_code adjusted_code;
> +
> +      /* Normalize code to either LEU or GEU.  */
> +      if (code == LTU)
> +       {
> +         --n;
> +         adjusted_code = LEU;
> +       }
> +      else if (code == GTU)
> +       {
> +         ++n;
> +         adjusted_code = GEU;
> +       }
> +      else
> +       adjusted_code = code;
> +
> +      scalar_int_mode narrow_mode_iter;
> +      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
> +       {
> +         unsigned nbits = GET_MODE_PRECISION (int_mode)
> +                          - GET_MODE_PRECISION (narrow_mode_iter);
> +         unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
> +         unsigned HOST_WIDE_INT lower_bits = n & mask;
> +         if ((adjusted_code == LEU && lower_bits == mask)
> +             || (adjusted_code == GEU && lower_bits == 0))
> +           {
> +             n >>= nbits;
> +             break;
> +           }
> +       }
> +
> +      if (narrow_mode_iter < int_mode)
> +       {
> +         if (dump_file && (dump_flags & TDF_DETAILS))
> +           {
> +             fprintf (
> +               dump_file, "narrow comparison from mode %s to %s: (MEM %s "
> +               HOST_WIDE_INT_PRINT_HEX ") to (MEM %s "
> +               HOST_WIDE_INT_PRINT_HEX ").\n", GET_MODE_NAME (int_mode),
> +               GET_MODE_NAME (narrow_mode_iter), GET_RTX_NAME (code),
> +               (unsigned HOST_WIDE_INT)const_op, GET_RTX_NAME (adjusted_code),
> +               n);
> +           }
> +         poly_int64 offset = (BYTES_BIG_ENDIAN
> +                              ? 0
> +                              : (GET_MODE_SIZE (int_mode)
> +                                 - GET_MODE_SIZE (narrow_mode_iter)));
> +         *pop0 = adjust_address_nv (op0, narrow_mode_iter, offset);
> +         *pop1 = GEN_INT (n);
> +         return adjusted_code;
> +       }
> +    }
> +
>    *pop1 = GEN_INT (const_op);
>    return code;
>  }
> @@ -12179,7 +12248,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1)
>
>        /* Try to simplify the compare to constant, possibly changing the
>          comparison op, and/or changing op1 to zero.  */
> -      code = simplify_compare_const (code, raw_mode, op0, &op1);
> +      code = simplify_compare_const (code, raw_mode, &op0, &op1);
>        const_op = INTVAL (op1);
>
>        /* Compute some predicates to simplify code below.  */
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-1.c b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c
> new file mode 100644
> index 00000000000..263ad98af79
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +le_1byte_a (uint64_t *x)
> +{
> +  return *x <= 0x3fffffffffffffff;
> +}
> +
> +int
> +le_1byte_b (uint64_t *x)
> +{
> +  return *x < 0x4000000000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-2.c b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c
> new file mode 100644
> index 00000000000..a7cc5348295
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +ge_1byte_a (uint64_t *x)
> +{
> +  return *x > 0x3fffffffffffffff;
> +}
> +
> +int
> +ge_1byte_b (uint64_t *x)
> +{
> +  return *x >= 0x4000000000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-3.c b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c
> new file mode 100644
> index 00000000000..06f80bf72d8
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +le_2bytes_a (uint64_t *x)
> +{
> +  return *x <= 0x3ffdffffffffffff;
> +}
> +
> +int
> +le_2bytes_b (uint64_t *x)
> +{
> +  return *x < 0x3ffe000000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-4.c b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c
> new file mode 100644
> index 00000000000..407999abf7e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +ge_2bytes_a (uint64_t *x)
> +{
> +  return *x > 0x400cffffffffffff;
> +}
> +
> +int
> +ge_2bytes_b (uint64_t *x)
> +{
> +  return *x >= 0x400d000000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-5.c b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c
> new file mode 100644
> index 00000000000..e16773f5bcf
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +le_4bytes_a (uint64_t *x)
> +{
> +  return *x <= 0x3ffffdffffffffff;
> +}
> +
> +int
> +le_4bytes_b (uint64_t *x)
> +{
> +  return *x < 0x3ffffe0000000000;
> +}
> diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-6.c b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c
> new file mode 100644
> index 00000000000..8f53b5678bd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */
> +
> +typedef __UINT64_TYPE__ uint64_t;
> +
> +int
> +ge_4bytes_a (uint64_t *x)
> +{
> +  return *x > 0x4000cfffffffffff;
> +}
> +
> +int
> +ge_4bytes_b (uint64_t *x)
> +{
> +  return *x >= 0x4000d00000000000;
> +}
> diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> new file mode 100644
> index 00000000000..309aafbec01
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> @@ -0,0 +1,24 @@
> +/* { dg-do compile { target { lp64 } } } */
> +/* { dg-options "-O1 -march=z13 -mzarch -fdump-rtl-combine-details" } */
> +/* { dg-final { scan-assembler-not {\tclc\t} } } */
> +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> +
> +struct s
> +{
> +  long a;
> +  unsigned b : 1;
> +  unsigned c : 1;
> +};
> +
> +int foo (struct s *x)
> +{
> +  /* Expression
> +       x->b || x->c
> +     is transformed into
> +       _1 = BIT_FIELD_REF <*x_4(D), 64, 64>;
> +       _2 = _1 > 0x3FFFFFFFFFFFFFFF;
> +     where the constant may materialize in the literal pool and an 8 byte CLC
> +     may be emitted.  Ensure this is not the case.
> +  */
> +  return x->b || x->c;
> +}
> --
> 2.39.2
>

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] combine: Narrow comparison of memory and constant
  2023-07-31 21:43       ` Prathamesh Kulkarni
@ 2023-07-31 21:46         ` Prathamesh Kulkarni
  2023-07-31 23:50         ` Jeff Law
  1 sibling, 0 replies; 11+ messages in thread
From: Prathamesh Kulkarni @ 2023-07-31 21:46 UTC (permalink / raw)
  To: Stefan Schulze Frielinghaus; +Cc: gcc-patches

On Tue, 1 Aug 2023 at 03:13, Prathamesh Kulkarni
<prathamesh.kulkarni@linaro.org> wrote:
>
> On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via
> Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> >
> > Comparisons between memory and constants might be done in a smaller mode
> > resulting in smaller constants which might finally end up as immediates
> > instead of in the literal pool.
> >
> > For example, on s390x a non-symmetric comparison like
> >   x <= 0x3fffffffffffffff
> > results in the constant being spilled to the literal pool and an 8 byte
> > memory comparison is emitted.  Ideally, an equivalent comparison
> >   x0 <= 0x3f
> > where x0 is the most significant byte of x, is emitted where the
> > constant is smaller and more likely to materialize as an immediate.
> >
> > Similarly, comparisons of the form
> >   x >= 0x4000000000000000
> > can be shortened into x0 >= 0x40.
> >
> > Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> > Note, the new tests show that for the mentioned little-endian targets
> > the optimization does not materialize since either the costs of the new
> > instructions are higher or they do not match.  Still ok for mainline?
> Hi Stefan,
> Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c)
> caused the following ICE on armv8l-unknown-linux-gnu:
Sorry I meant armv8l-unknown-linux-gnueabihf.
> during RTL pass: combine
> ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’:
> ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in
> decompose, at rtl.h:2297
>   210 | }
>       | ^
> 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode>
> >::decompose(long long*, unsigned int, std::pair<rtx_def*,
> machine_mode> const&)
>         ../../gcc/gcc/rtl.h:2297
> 0xaf5ab3 wide_int_ref_storage<false,
> true>::wide_int_ref_storage<std::pair<rtx_def*, machine_mode>
> >(std::pair<rtx_def*, machine_mode> const&)
>         ../../gcc/gcc/wide-int.h:1030
> 0xaf5023 generic_wide_int<wide_int_ref_storage<false, true>
> >::generic_wide_int<std::pair<rtx_def*, machine_mode>
> >(std::pair<rtx_def*, machine_mode> const&)
>         ../../gcc/gcc/wide-int.h:788
> 0xf916f9 simplify_const_unary_operation(rtx_code, machine_mode,
> rtx_def*, machine_mode)
>         ../../gcc/gcc/simplify-rtx.cc:2131
> 0xf8bad5 simplify_context::simplify_unary_operation(rtx_code,
> machine_mode, rtx_def*, machine_mode)
>         ../../gcc/gcc/simplify-rtx.cc:889
> 0xf8a591 simplify_context::simplify_gen_unary(rtx_code, machine_mode,
> rtx_def*, machine_mode)
>         ../../gcc/gcc/simplify-rtx.cc:360
> 0x9bd1b7 simplify_gen_unary(rtx_code, machine_mode, rtx_def*, machine_mode)
>         ../../gcc/gcc/rtl.h:3520
> 0x1bd5677 simplify_comparison
>         ../../gcc/gcc/combine.cc:13125
> 0x1bc2b2b simplify_set
>         ../../gcc/gcc/combine.cc:6848
> 0x1bc1647 combine_simplify_rtx
>         ../../gcc/gcc/combine.cc:6353
> 0x1bbf97f subst
>         ../../gcc/gcc/combine.cc:5609
> 0x1bb864b try_combine
>         ../../gcc/gcc/combine.cc:3302
> 0x1bb30fb combine_instructions
>         ../../gcc/gcc/combine.cc:1264
> 0x1bd8d25 rest_of_handle_combine
>         ../../gcc/gcc/combine.cc:15059
> 0x1bd8dd5 execute
>         ../../gcc/gcc/combine.cc:15103
> Please submit a full bug report, with preprocessed source (by using
> -freport-bug).
> Please include the complete backtrace with any bug report.
> See <https://gcc.gnu.org/bugs/> for instructions.
>
> Could you please take a look ?
>
> Thanks,
> Prathamesh
> >
> > gcc/ChangeLog:
> >
> >         * combine.cc (simplify_compare_const): Narrow comparison of
> >         memory and constant.
> >         (try_combine): Adapt new function signature.
> >         (simplify_comparison): Adapt new function signature.
> >
> > gcc/testsuite/ChangeLog:
> >
> >         * gcc.dg/cmp-mem-const-1.c: New test.
> >         * gcc.dg/cmp-mem-const-2.c: New test.
> >         * gcc.dg/cmp-mem-const-3.c: New test.
> >         * gcc.dg/cmp-mem-const-4.c: New test.
> >         * gcc.dg/cmp-mem-const-5.c: New test.
> >         * gcc.dg/cmp-mem-const-6.c: New test.
> >         * gcc.target/s390/cmp-mem-const-1.c: New test.
> > ---
> >  gcc/combine.cc                                | 79 +++++++++++++++++--
> >  gcc/testsuite/gcc.dg/cmp-mem-const-1.c        | 17 ++++
> >  gcc/testsuite/gcc.dg/cmp-mem-const-2.c        | 17 ++++
> >  gcc/testsuite/gcc.dg/cmp-mem-const-3.c        | 17 ++++
> >  gcc/testsuite/gcc.dg/cmp-mem-const-4.c        | 17 ++++
> >  gcc/testsuite/gcc.dg/cmp-mem-const-5.c        | 17 ++++
> >  gcc/testsuite/gcc.dg/cmp-mem-const-6.c        | 17 ++++
> >  .../gcc.target/s390/cmp-mem-const-1.c         | 24 ++++++
> >  8 files changed, 200 insertions(+), 5 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-1.c
> >  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-2.c
> >  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-3.c
> >  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-4.c
> >  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-5.c
> >  create mode 100644 gcc/testsuite/gcc.dg/cmp-mem-const-6.c
> >  create mode 100644 gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> >
> > diff --git a/gcc/combine.cc b/gcc/combine.cc
> > index 5aa0ec5c45a..56e15a93409 100644
> > --- a/gcc/combine.cc
> > +++ b/gcc/combine.cc
> > @@ -460,7 +460,7 @@ static rtx simplify_shift_const (rtx, enum rtx_code, machine_mode, rtx,
> >  static int recog_for_combine (rtx *, rtx_insn *, rtx *);
> >  static rtx gen_lowpart_for_combine (machine_mode, rtx);
> >  static enum rtx_code simplify_compare_const (enum rtx_code, machine_mode,
> > -                                            rtx, rtx *);
> > +                                            rtx *, rtx *);
> >  static enum rtx_code simplify_comparison (enum rtx_code, rtx *, rtx *);
> >  static void update_table_tick (rtx);
> >  static void record_value_for_reg (rtx, rtx_insn *, rtx);
> > @@ -3185,7 +3185,7 @@ try_combine (rtx_insn *i3, rtx_insn *i2, rtx_insn *i1, rtx_insn *i0,
> >           compare_code = orig_compare_code = GET_CODE (*cc_use_loc);
> >           if (is_a <scalar_int_mode> (GET_MODE (i2dest), &mode))
> >             compare_code = simplify_compare_const (compare_code, mode,
> > -                                                  op0, &op1);
> > +                                                  &op0, &op1);
> >           target_canonicalize_comparison (&compare_code, &op0, &op1, 1);
> >         }
> >
> > @@ -11796,13 +11796,14 @@ gen_lowpart_for_combine (machine_mode omode, rtx x)
> >     (CODE OP0 const0_rtx) form.
> >
> >     The result is a possibly different comparison code to use.
> > -   *POP1 may be updated.  */
> > +   *POP0 and *POP1 may be updated.  */
> >
> >  static enum rtx_code
> >  simplify_compare_const (enum rtx_code code, machine_mode mode,
> > -                       rtx op0, rtx *pop1)
> > +                       rtx *pop0, rtx *pop1)
> >  {
> >    scalar_int_mode int_mode;
> > +  rtx op0 = *pop0;
> >    HOST_WIDE_INT const_op = INTVAL (*pop1);
> >
> >    /* Get the constant we are comparing against and turn off all bits
> > @@ -11987,6 +11988,74 @@ simplify_compare_const (enum rtx_code code, machine_mode mode,
> >        break;
> >      }
> >
> > +  /* Narrow non-symmetric comparison of memory and constant as e.g.
> > +     x0...x7 <= 0x3fffffffffffffff into x0 <= 0x3f where x0 is the most
> > +     significant byte.  Likewise, transform x0...x7 >= 0x4000000000000000 into
> > +     x0 >= 0x40.  */
> > +  if ((code == LEU || code == LTU || code == GEU || code == GTU)
> > +      && is_a <scalar_int_mode> (GET_MODE (op0), &int_mode)
> > +      && MEM_P (op0)
> > +      && !MEM_VOLATILE_P (op0)
> > +      /* The optimization makes only sense for constants which are big enough
> > +        so that we have a chance to chop off something at all.  */
> > +      && (unsigned HOST_WIDE_INT) const_op > 0xff
> > +      /* Ensure that we do not overflow during normalization.  */
> > +      && (code != GTU || (unsigned HOST_WIDE_INT) const_op < HOST_WIDE_INT_M1U))
> > +    {
> > +      unsigned HOST_WIDE_INT n = (unsigned HOST_WIDE_INT) const_op;
> > +      enum rtx_code adjusted_code;
> > +
> > +      /* Normalize code to either LEU or GEU.  */
> > +      if (code == LTU)
> > +       {
> > +         --n;
> > +         adjusted_code = LEU;
> > +       }
> > +      else if (code == GTU)
> > +       {
> > +         ++n;
> > +         adjusted_code = GEU;
> > +       }
> > +      else
> > +       adjusted_code = code;
> > +
> > +      scalar_int_mode narrow_mode_iter;
> > +      FOR_EACH_MODE_UNTIL (narrow_mode_iter, int_mode)
> > +       {
> > +         unsigned nbits = GET_MODE_PRECISION (int_mode)
> > +                          - GET_MODE_PRECISION (narrow_mode_iter);
> > +         unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << nbits) - 1;
> > +         unsigned HOST_WIDE_INT lower_bits = n & mask;
> > +         if ((adjusted_code == LEU && lower_bits == mask)
> > +             || (adjusted_code == GEU && lower_bits == 0))
> > +           {
> > +             n >>= nbits;
> > +             break;
> > +           }
> > +       }
> > +
> > +      if (narrow_mode_iter < int_mode)
> > +       {
> > +         if (dump_file && (dump_flags & TDF_DETAILS))
> > +           {
> > +             fprintf (
> > +               dump_file, "narrow comparison from mode %s to %s: (MEM %s "
> > +               HOST_WIDE_INT_PRINT_HEX ") to (MEM %s "
> > +               HOST_WIDE_INT_PRINT_HEX ").\n", GET_MODE_NAME (int_mode),
> > +               GET_MODE_NAME (narrow_mode_iter), GET_RTX_NAME (code),
> > +               (unsigned HOST_WIDE_INT)const_op, GET_RTX_NAME (adjusted_code),
> > +               n);
> > +           }
> > +         poly_int64 offset = (BYTES_BIG_ENDIAN
> > +                              ? 0
> > +                              : (GET_MODE_SIZE (int_mode)
> > +                                 - GET_MODE_SIZE (narrow_mode_iter)));
> > +         *pop0 = adjust_address_nv (op0, narrow_mode_iter, offset);
> > +         *pop1 = GEN_INT (n);
> > +         return adjusted_code;
> > +       }
> > +    }
> > +
> >    *pop1 = GEN_INT (const_op);
> >    return code;
> >  }
> > @@ -12179,7 +12248,7 @@ simplify_comparison (enum rtx_code code, rtx *pop0, rtx *pop1)
> >
> >        /* Try to simplify the compare to constant, possibly changing the
> >          comparison op, and/or changing op1 to zero.  */
> > -      code = simplify_compare_const (code, raw_mode, op0, &op1);
> > +      code = simplify_compare_const (code, raw_mode, &op0, &op1);
> >        const_op = INTVAL (op1);
> >
> >        /* Compute some predicates to simplify code below.  */
> > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-1.c b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c
> > new file mode 100644
> > index 00000000000..263ad98af79
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-1.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> > +
> > +typedef __UINT64_TYPE__ uint64_t;
> > +
> > +int
> > +le_1byte_a (uint64_t *x)
> > +{
> > +  return *x <= 0x3fffffffffffffff;
> > +}
> > +
> > +int
> > +le_1byte_b (uint64_t *x)
> > +{
> > +  return *x < 0x4000000000000000;
> > +}
> > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-2.c b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c
> > new file mode 100644
> > index 00000000000..a7cc5348295
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-2.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> > +
> > +typedef __UINT64_TYPE__ uint64_t;
> > +
> > +int
> > +ge_1byte_a (uint64_t *x)
> > +{
> > +  return *x > 0x3fffffffffffffff;
> > +}
> > +
> > +int
> > +ge_1byte_b (uint64_t *x)
> > +{
> > +  return *x >= 0x4000000000000000;
> > +}
> > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-3.c b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c
> > new file mode 100644
> > index 00000000000..06f80bf72d8
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-3.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */
> > +
> > +typedef __UINT64_TYPE__ uint64_t;
> > +
> > +int
> > +le_2bytes_a (uint64_t *x)
> > +{
> > +  return *x <= 0x3ffdffffffffffff;
> > +}
> > +
> > +int
> > +le_2bytes_b (uint64_t *x)
> > +{
> > +  return *x < 0x3ffe000000000000;
> > +}
> > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-4.c b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c
> > new file mode 100644
> > index 00000000000..407999abf7e
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-4.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to HI" "combine" } } */
> > +
> > +typedef __UINT64_TYPE__ uint64_t;
> > +
> > +int
> > +ge_2bytes_a (uint64_t *x)
> > +{
> > +  return *x > 0x400cffffffffffff;
> > +}
> > +
> > +int
> > +ge_2bytes_b (uint64_t *x)
> > +{
> > +  return *x >= 0x400d000000000000;
> > +}
> > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-5.c b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c
> > new file mode 100644
> > index 00000000000..e16773f5bcf
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-5.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */
> > +
> > +typedef __UINT64_TYPE__ uint64_t;
> > +
> > +int
> > +le_4bytes_a (uint64_t *x)
> > +{
> > +  return *x <= 0x3ffffdffffffffff;
> > +}
> > +
> > +int
> > +le_4bytes_b (uint64_t *x)
> > +{
> > +  return *x < 0x3ffffe0000000000;
> > +}
> > diff --git a/gcc/testsuite/gcc.dg/cmp-mem-const-6.c b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c
> > new file mode 100644
> > index 00000000000..8f53b5678bd
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/cmp-mem-const-6.c
> > @@ -0,0 +1,17 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -fdump-rtl-combine-details" } */
> > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to SI" "combine" } } */
> > +
> > +typedef __UINT64_TYPE__ uint64_t;
> > +
> > +int
> > +ge_4bytes_a (uint64_t *x)
> > +{
> > +  return *x > 0x4000cfffffffffff;
> > +}
> > +
> > +int
> > +ge_4bytes_b (uint64_t *x)
> > +{
> > +  return *x >= 0x4000d00000000000;
> > +}
> > diff --git a/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> > new file mode 100644
> > index 00000000000..309aafbec01
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/s390/cmp-mem-const-1.c
> > @@ -0,0 +1,24 @@
> > +/* { dg-do compile { target { lp64 } } } */
> > +/* { dg-options "-O1 -march=z13 -mzarch -fdump-rtl-combine-details" } */
> > +/* { dg-final { scan-assembler-not {\tclc\t} } } */
> > +/* { dg-final { scan-rtl-dump "narrow comparison from mode DI to QI" "combine" } } */
> > +
> > +struct s
> > +{
> > +  long a;
> > +  unsigned b : 1;
> > +  unsigned c : 1;
> > +};
> > +
> > +int foo (struct s *x)
> > +{
> > +  /* Expression
> > +       x->b || x->c
> > +     is transformed into
> > +       _1 = BIT_FIELD_REF <*x_4(D), 64, 64>;
> > +       _2 = _1 > 0x3FFFFFFFFFFFFFFF;
> > +     where the constant may materialize in the literal pool and an 8 byte CLC
> > +     may be emitted.  Ensure this is not the case.
> > +  */
> > +  return x->b || x->c;
> > +}
> > --
> > 2.39.2
> >

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] combine: Narrow comparison of memory and constant
  2023-07-31 21:43       ` Prathamesh Kulkarni
  2023-07-31 21:46         ` Prathamesh Kulkarni
@ 2023-07-31 23:50         ` Jeff Law
  2023-08-01  8:22           ` Prathamesh Kulkarni
  1 sibling, 1 reply; 11+ messages in thread
From: Jeff Law @ 2023-07-31 23:50 UTC (permalink / raw)
  To: Prathamesh Kulkarni, Stefan Schulze Frielinghaus; +Cc: gcc-patches



On 7/31/23 15:43, Prathamesh Kulkarni via Gcc-patches wrote:
> On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via
> Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
>>
>> Comparisons between memory and constants might be done in a smaller mode
>> resulting in smaller constants which might finally end up as immediates
>> instead of in the literal pool.
>>
>> For example, on s390x a non-symmetric comparison like
>>    x <= 0x3fffffffffffffff
>> results in the constant being spilled to the literal pool and an 8 byte
>> memory comparison is emitted.  Ideally, an equivalent comparison
>>    x0 <= 0x3f
>> where x0 is the most significant byte of x, is emitted where the
>> constant is smaller and more likely to materialize as an immediate.
>>
>> Similarly, comparisons of the form
>>    x >= 0x4000000000000000
>> can be shortened into x0 >= 0x40.
>>
>> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
>> Note, the new tests show that for the mentioned little-endian targets
>> the optimization does not materialize since either the costs of the new
>> instructions are higher or they do not match.  Still ok for mainline?
> Hi Stefan,
> Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c)
> caused the following ICE on armv8l-unknown-linux-gnu:
> during RTL pass: combine
> ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’:
> ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in
> decompose, at rtl.h:2297
>    210 | }
>        | ^
> 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode>
>> ::decompose(long long*, unsigned int, std::pair<rtx_def*,
> machine_mode> const&)
>          ../../gcc/gcc/rtl.h:2297
[ ... ]
Yea, we're seeing something very similar on nios2-linux-gnu building the 
kernel.

Prathamesh, can you extract the .i file for fixed-bit on armv8 and open 
a bug for this issue, attaching the .i file as well as the right command 
line options necessary to reproduce the failure.  THat way Stefan can 
tackle it with a cross compiler.

Thanks,
jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] combine: Narrow comparison of memory and constant
  2023-07-31 23:50         ` Jeff Law
@ 2023-08-01  8:22           ` Prathamesh Kulkarni
  2023-08-01  9:36             ` Stefan Schulze Frielinghaus
  0 siblings, 1 reply; 11+ messages in thread
From: Prathamesh Kulkarni @ 2023-08-01  8:22 UTC (permalink / raw)
  To: Jeff Law; +Cc: Stefan Schulze Frielinghaus, gcc-patches

On Tue, 1 Aug 2023 at 05:20, Jeff Law <jeffreyalaw@gmail.com> wrote:
>
>
>
> On 7/31/23 15:43, Prathamesh Kulkarni via Gcc-patches wrote:
> > On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via
> > Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> >>
> >> Comparisons between memory and constants might be done in a smaller mode
> >> resulting in smaller constants which might finally end up as immediates
> >> instead of in the literal pool.
> >>
> >> For example, on s390x a non-symmetric comparison like
> >>    x <= 0x3fffffffffffffff
> >> results in the constant being spilled to the literal pool and an 8 byte
> >> memory comparison is emitted.  Ideally, an equivalent comparison
> >>    x0 <= 0x3f
> >> where x0 is the most significant byte of x, is emitted where the
> >> constant is smaller and more likely to materialize as an immediate.
> >>
> >> Similarly, comparisons of the form
> >>    x >= 0x4000000000000000
> >> can be shortened into x0 >= 0x40.
> >>
> >> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> >> Note, the new tests show that for the mentioned little-endian targets
> >> the optimization does not materialize since either the costs of the new
> >> instructions are higher or they do not match.  Still ok for mainline?
> > Hi Stefan,
> > Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c)
> > caused the following ICE on armv8l-unknown-linux-gnu:
> > during RTL pass: combine
> > ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’:
> > ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in
> > decompose, at rtl.h:2297
> >    210 | }
> >        | ^
> > 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode>
> >> ::decompose(long long*, unsigned int, std::pair<rtx_def*,
> > machine_mode> const&)
> >          ../../gcc/gcc/rtl.h:2297
> [ ... ]
> Yea, we're seeing something very similar on nios2-linux-gnu building the
> kernel.
>
> Prathamesh, can you extract the .i file for fixed-bit on armv8 and open
> a bug for this issue, attaching the .i file as well as the right command
> line options necessary to reproduce the failure.  THat way Stefan can
> tackle it with a cross compiler.
Hi Jeff,
Filed the issue in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110867

Thanks,
Prathamesh
>
> Thanks,
> jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH v2] combine: Narrow comparison of memory and constant
  2023-08-01  8:22           ` Prathamesh Kulkarni
@ 2023-08-01  9:36             ` Stefan Schulze Frielinghaus
  0 siblings, 0 replies; 11+ messages in thread
From: Stefan Schulze Frielinghaus @ 2023-08-01  9:36 UTC (permalink / raw)
  To: Prathamesh Kulkarni; +Cc: Jeff Law, gcc-patches

On Tue, Aug 01, 2023 at 01:52:16PM +0530, Prathamesh Kulkarni wrote:
> On Tue, 1 Aug 2023 at 05:20, Jeff Law <jeffreyalaw@gmail.com> wrote:
> >
> >
> >
> > On 7/31/23 15:43, Prathamesh Kulkarni via Gcc-patches wrote:
> > > On Mon, 19 Jun 2023 at 19:59, Stefan Schulze Frielinghaus via
> > > Gcc-patches <gcc-patches@gcc.gnu.org> wrote:
> > >>
> > >> Comparisons between memory and constants might be done in a smaller mode
> > >> resulting in smaller constants which might finally end up as immediates
> > >> instead of in the literal pool.
> > >>
> > >> For example, on s390x a non-symmetric comparison like
> > >>    x <= 0x3fffffffffffffff
> > >> results in the constant being spilled to the literal pool and an 8 byte
> > >> memory comparison is emitted.  Ideally, an equivalent comparison
> > >>    x0 <= 0x3f
> > >> where x0 is the most significant byte of x, is emitted where the
> > >> constant is smaller and more likely to materialize as an immediate.
> > >>
> > >> Similarly, comparisons of the form
> > >>    x >= 0x4000000000000000
> > >> can be shortened into x0 >= 0x40.
> > >>
> > >> Bootstrapped and regtested on s390x, x64, aarch64, and powerpc64le.
> > >> Note, the new tests show that for the mentioned little-endian targets
> > >> the optimization does not materialize since either the costs of the new
> > >> instructions are higher or they do not match.  Still ok for mainline?
> > > Hi Stefan,
> > > Unfortunately this patch (committed in 7cdd0860949c6c3232e6cff1d7ca37bb5234074c)
> > > caused the following ICE on armv8l-unknown-linux-gnu:
> > > during RTL pass: combine
> > > ../../../gcc/libgcc/fixed-bit.c: In function ‘__gnu_saturate1sq’:
> > > ../../../gcc/libgcc/fixed-bit.c:210:1: internal compiler error: in
> > > decompose, at rtl.h:2297
> > >    210 | }
> > >        | ^
> > > 0xaa23e3 wi::int_traits<std::pair<rtx_def*, machine_mode>
> > >> ::decompose(long long*, unsigned int, std::pair<rtx_def*,
> > > machine_mode> const&)
> > >          ../../gcc/gcc/rtl.h:2297
> > [ ... ]
> > Yea, we're seeing something very similar on nios2-linux-gnu building the
> > kernel.
> >
> > Prathamesh, can you extract the .i file for fixed-bit on armv8 and open
> > a bug for this issue, attaching the .i file as well as the right command
> > line options necessary to reproduce the failure.  THat way Stefan can
> > tackle it with a cross compiler.
> Hi Jeff,
> Filed the issue in https://gcc.gnu.org/bugzilla/show_bug.cgi?id=110867

Hi Prathamesh,

Sorry for the inconvenience.  I will have a look at this and thanks for
the small reproducer.  I already started to come up with a cross
compiler.

Thanks,
Stefan

> 
> Thanks,
> Prathamesh
> >
> > Thanks,
> > jeff

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2023-08-01  9:36 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-12  7:57 [PATCH] combine: Narrow comparison of memory and constant Stefan Schulze Frielinghaus
2023-06-12 21:29 ` Jeff Law
2023-06-19 14:19   ` Stefan Schulze Frielinghaus
2023-06-19 14:23     ` [PATCH v2] " Stefan Schulze Frielinghaus
2023-07-31 13:26       ` Stefan Schulze Frielinghaus
2023-07-31 13:59       ` Jeff Law
2023-07-31 21:43       ` Prathamesh Kulkarni
2023-07-31 21:46         ` Prathamesh Kulkarni
2023-07-31 23:50         ` Jeff Law
2023-08-01  8:22           ` Prathamesh Kulkarni
2023-08-01  9:36             ` Stefan Schulze Frielinghaus

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).