public inbox for gcc-cvs@sourceware.org
[gcc r14-8680] PR target/113560: Enhance is_widening_mult_rhs_p.
From: Roger Sayle @ 2024-02-01 6:12 UTC
To: gcc-cvs
https://gcc.gnu.org/g:2f14c0dbb789852947cb58fdf7d3162413f053fa
commit r14-8680-g2f14c0dbb789852947cb58fdf7d3162413f053fa
Author: Roger Sayle <roger@nextmovesoftware.com>
Date: Thu Feb 1 06:10:42 2024 +0000
PR target/113560: Enhance is_widening_mult_rhs_p.
This patch resolves PR113560, a code quality regression from GCC12
affecting x86_64, by enhancing the middle-end's tree-ssa-math-opts.cc
to recognize more instances of widening multiplications.
The widening multiplication recognition code identifies cases like:
_1 = (unsigned __int128) x;
__res = _1 * 100;
but in the reported test case, the original input looks like:
_1 = (unsigned long long) x;
_2 = (unsigned __int128) _1;
__res = _2 * 100;
which gets optimized by constant folding during tree-ssa to:
_2 = x & 18446744073709551615; // x & 0xffffffffffffffff
__res = _2 * 100;
where the BIT_AND_EXPR hides (has consumed) the extension operation.
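For illustration, a minimal C reproducer of this shape (an assumption based on the GIMPLE above, not necessarily the exact source from the PR) would be:
unsigned __int128 res (unsigned __int128 x)
{
  unsigned long long t = (unsigned long long) x;  /* _1 = (unsigned long long) x; */
  return (unsigned __int128) t * 100;             /* _2 = (unsigned __int128) _1; __res = _2 * 100; */
}
After constant folding, the truncation and re-extension of t collapse into the single BIT_AND_EXPR shown above.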
This reveals a more general deficiency (missed optimization opportunity) in widening multiplication recognition: both
__int128 foo(__int128 x, __int128 y) {
return (x & 1000) * (y & 1000);
}
and
unsigned __int128 bar(unsigned __int128 x, unsigned __int128 y) {
return (x >> 80) * (y >> 80);
}
should be recognized as widening multiplications. Hence, rather than
test explicitly for BIT_AND_EXPR (as in the first version of this patch),
the more general solution is to make use of range information, as
provided by tree_nonzero_bits.
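To sketch that check (illustrative pseudocode, not the GCC implementation, which follows in the diff below): an operand of an N-bit multiplication can feed an N/2-bit widening multiply whenever all of its possibly-nonzero bits lie in the low half of the type.
/* Sketch: nonzero_bits stands in for what tree_nonzero_bits would report,
   e.g. 0x3e8 for (x & 1000) or a 48-bit mask for (x >> 80).  */
int fits_in_low_half_p (unsigned __int128 nonzero_bits)
{
  unsigned __int128 high_half = ~(unsigned __int128) 0 << 64;
  return (nonzero_bits & high_half) == 0;
}
For the signed case the test is one bit tighter: bit 63 must also be known zero, so that the value is non-negative when reinterpreted in the narrower signed type.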
As a demonstration of the observed improvements, function foo above
currently compiles with -O2 on x86_64 to:
foo: movq %rdi, %rsi
movq %rdx, %r8
xorl %edi, %edi
xorl %r9d, %r9d
andl $1000, %esi
andl $1000, %r8d
movq %rdi, %rcx
movq %r9, %rdx
imulq %rsi, %rdx
movq %rsi, %rax
imulq %r8, %rcx
addq %rdx, %rcx
mulq %r8
addq %rdx, %rcx
movq %rcx, %rdx
ret
With this patch, GCC recognizes the widening multiplication (*w) and instead generates:
foo: movq %rdi, %rsi
movq %rdx, %r8
andl $1000, %esi
andl $1000, %r8d
movq %rsi, %rax
imulq %r8
ret
which is perhaps easier to understand at the tree level, where
__int128 foo (__int128 x, __int128 y)
{
__int128 _1;
__int128 _2;
__int128 _5;
<bb 2> [local count: 1073741824]:
_1 = x_3(D) & 1000;
_2 = y_4(D) & 1000;
_5 = _1 * _2;
return _5;
}
gets transformed to:
__int128 foo (__int128 x, __int128 y)
{
__int128 _1;
__int128 _2;
__int128 _5;
signed long _7;
signed long _8;
<bb 2> [local count: 1073741824]:
_1 = x_3(D) & 1000;
_2 = y_4(D) & 1000;
_7 = (signed long) _1;
_8 = (signed long) _2;
_5 = _7 w* _8;
return _5;
}
2024-02-01 Roger Sayle <roger@nextmovesoftware.com>
Richard Biener <rguenther@suse.de>
gcc/ChangeLog:
PR target/113560
* tree-ssa-math-opts.cc (is_widening_mult_rhs_p): Use range
information via tree_nonzero_bits to check if this operand
is suitably extended for a widening (or highpart) multiplication.
(convert_mult_to_widen): Insert explicit casts if the RHS or LHS
isn't already of the claimed type.
gcc/testsuite/ChangeLog:
PR target/113560
* g++.target/i386/pr113560.C: New test case.
* gcc.target/i386/pr113560.c: Likewise.
* gcc.dg/pr87954.c: Update test case.
Diff:
---
gcc/testsuite/g++.target/i386/pr113560.C | 19 ++++++++++++
gcc/testsuite/gcc.dg/pr87954.c | 2 +-
gcc/testsuite/gcc.target/i386/pr113560.c | 17 ++++++++++
gcc/tree-ssa-math-opts.cc | 53 ++++++++++++++++++++++++++++----
4 files changed, 84 insertions(+), 7 deletions(-)
diff --git a/gcc/testsuite/g++.target/i386/pr113560.C b/gcc/testsuite/g++.target/i386/pr113560.C
new file mode 100644
index 000000000000..179b68f9833f
--- /dev/null
+++ b/gcc/testsuite/g++.target/i386/pr113560.C
@@ -0,0 +1,19 @@
+/* { dg-do compile { target { ! ia32 } } } */
+/* { dg-options "-Ofast -std=c++23 -march=znver4" } */
+
+#include <immintrin.h>
+auto f(char *buf, unsigned long long in) noexcept
+{
+ unsigned long long hi{};
+ auto lo{_mulx_u64(in, 0x2af31dc462ull, &hi)};
+ lo = _mulx_u64(lo, 100, &hi);
+ __builtin_memcpy(buf + 2, &hi, 2);
+ return buf + 10;
+}
+
+/* { dg-final { scan-assembler-times "mulx" 1 } } */
+/* { dg-final { scan-assembler-times "mulq" 1 } } */
+/* { dg-final { scan-assembler-not "addq" } } */
+/* { dg-final { scan-assembler-not "adcq" } } */
+/* { dg-final { scan-assembler-not "salq" } } */
+/* { dg-final { scan-assembler-not "shldq" } } */
diff --git a/gcc/testsuite/gcc.dg/pr87954.c b/gcc/testsuite/gcc.dg/pr87954.c
index 620657cb1f5d..80d0543bbb38 100644
--- a/gcc/testsuite/gcc.dg/pr87954.c
+++ b/gcc/testsuite/gcc.dg/pr87954.c
@@ -18,4 +18,4 @@ imul(unsigned int flags)
return type_dma + (is_rec * !is_dma) * KMALLOC_RECLAIM;
}
-/* { dg-final { scan-tree-dump-times { \* } 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times { \*w? |WIDEN_MULT_PLUS_EXPR} 1 "optimized" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr113560.c b/gcc/testsuite/gcc.target/i386/pr113560.c
new file mode 100644
index 000000000000..ac2e01a4589c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr113560.c
@@ -0,0 +1,17 @@
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2" } */
+
+unsigned __int128 foo(unsigned __int128 x, unsigned __int128 y)
+{
+ return (x & 1000) * (y & 1000);
+}
+
+__int128 bar(__int128 x, __int128 y)
+{
+ return (x & 1000) * (y & 1000);
+}
+
+/* { dg-final { scan-assembler-times "\tmulq" 1 } } */
+/* { dg-final { scan-assembler-times "\timulq" 1 } } */
+/* { dg-final { scan-assembler-not "addq" } } */
+/* { dg-final { scan-assembler-not "xorl" } } */
diff --git a/gcc/tree-ssa-math-opts.cc b/gcc/tree-ssa-math-opts.cc
index 2db26e4ed486..cffe7573f794 100644
--- a/gcc/tree-ssa-math-opts.cc
+++ b/gcc/tree-ssa-math-opts.cc
@@ -2552,12 +2552,47 @@ is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
if (TREE_CODE (rhs) == SSA_NAME)
{
+ /* Use tree_nonzero_bits to see if this operand is zero extended
+ for unsigned widening multiplications or non-negative for
+ signed widening multiplications. */
+ if (TREE_CODE (type) == INTEGER_TYPE
+ && (TYPE_PRECISION (type) & 1) == 0
+ && int_mode_for_size (TYPE_PRECISION (type) / 2, 1).exists ())
+ {
+ unsigned int prec = TYPE_PRECISION (type);
+ unsigned int hprec = prec / 2;
+ wide_int bits = wide_int::from (tree_nonzero_bits (rhs), prec,
+ TYPE_SIGN (TREE_TYPE (rhs)));
+ if (TYPE_UNSIGNED (type)
+ && wi::bit_and (bits, wi::mask (hprec, true, prec)) == 0)
+ {
+ *type_out = build_nonstandard_integer_type (hprec, true);
+ /* X & MODE_MASK can be simplified to (T)X. */
+ stmt = SSA_NAME_DEF_STMT (rhs);
+ if (is_gimple_assign (stmt)
+ && gimple_assign_rhs_code (stmt) == BIT_AND_EXPR
+ && TREE_CODE (gimple_assign_rhs2 (stmt)) == INTEGER_CST
+ && wi::to_wide (gimple_assign_rhs2 (stmt))
+ == wi::mask (hprec, false, prec))
+ *new_rhs_out = gimple_assign_rhs1 (stmt);
+ else
+ *new_rhs_out = rhs;
+ return true;
+ }
+ else if (!TYPE_UNSIGNED (type)
+ && wi::bit_and (bits, wi::mask (hprec - 1, true, prec)) == 0)
+ {
+ *type_out = build_nonstandard_integer_type (hprec, false);
+ *new_rhs_out = rhs;
+ return true;
+ }
+ }
+
stmt = SSA_NAME_DEF_STMT (rhs);
if (is_gimple_assign (stmt))
{
- if (! widening_mult_conversion_strippable_p (type, stmt))
- rhs1 = rhs;
- else
+
+ if (widening_mult_conversion_strippable_p (type, stmt))
{
rhs1 = gimple_assign_rhs1 (stmt);
@@ -2568,6 +2603,8 @@ is_widening_mult_rhs_p (tree type, tree rhs, tree *type_out,
return true;
}
}
+ else
+ rhs1 = rhs;
}
else
rhs1 = rhs;
@@ -2827,12 +2864,16 @@ convert_mult_to_widen (gimple *stmt, gimple_stmt_iterator *gsi)
if (2 * actual_precision > TYPE_PRECISION (type))
return false;
if (actual_precision != TYPE_PRECISION (type1)
- || from_unsigned1 != TYPE_UNSIGNED (type1))
+ || from_unsigned1 != TYPE_UNSIGNED (type1)
+ || (TREE_TYPE (rhs1) != type1
+ && TREE_CODE (rhs1) != INTEGER_CST))
rhs1 = build_and_insert_cast (gsi, loc,
build_nonstandard_integer_type
(actual_precision, from_unsigned1), rhs1);
if (actual_precision != TYPE_PRECISION (type2)
- || from_unsigned2 != TYPE_UNSIGNED (type2))
+ || from_unsigned2 != TYPE_UNSIGNED (type2)
+ || (TREE_TYPE (rhs2) != type2
+ && TREE_CODE (rhs2) != INTEGER_CST))
rhs2 = build_and_insert_cast (gsi, loc,
build_nonstandard_integer_type
(actual_precision, from_unsigned2), rhs2);
@@ -3046,7 +3087,7 @@ convert_plusminus_to_widen (gimple_stmt_iterator *gsi, gimple *stmt,
|| from_unsigned1 != TYPE_UNSIGNED (type1))
mult_rhs1 = build_and_insert_cast (gsi, loc,
build_nonstandard_integer_type
- (actual_precision, from_unsigned1),
+ (actual_precision, from_unsigned1),
mult_rhs1);
if (actual_precision != TYPE_PRECISION (type2)
|| from_unsigned2 != TYPE_UNSIGNED (type2))